From 8811fc2123a25aa9497ccd2028a97003d3a89008 Mon Sep 17 00:00:00 2001
From: nkey <michail.nikolaev@gmail.com>
Date: Tue, 4 Feb 2025 21:34:46 +0100
Subject: [PATCH v6 1/3] This should fix issues with incorrect results when a
 GIST IOS encounters tuples removed from pages by a concurrent vacuum
 operation.

---
 src/backend/access/gist/README       |  16 ++++
 src/backend/access/gist/gistget.c    |  32 ++++++++
 src/backend/access/gist/gistscan.c   | 115 ++++++++++++++++++++++++---
 src/backend/access/gist/gistvacuum.c |   6 +-
 src/include/access/gist_private.h    |   2 +
 5 files changed, 158 insertions(+), 13 deletions(-)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 8015ff19f05..c7c2afad088 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -287,6 +287,22 @@ would complicate the insertion algorithm. So when an insertion sees a page
 with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
 crashed in the middle to completion by adding the downlink in the parent.
 
+Index-only scans and VACUUM
+---------------------------
+
+Index-only scans require that any tuple returned by the index scan has not
+been removed from the index by a call to ambulkdelete through VACUUM.
+To ensure this invariant, bulkdelete now requires a buffer cleanup lock, and
+every Index-only scan (IOS) will keep a pin on each page that it is returning
+tuples from.  For ordered scans, we keep one pin for each matching leaf tuple,
+for unordered scans we just keep an additional pin while we're still working
+on the page's tuples.  This ensures that pages seen by the scan won't be
+cleaned up until after the tuples have been returned.
+
+These longer pin lifetimes can cause buffer exhaustion with messages like "no
+unpinned buffers available" when the index has many pages that have similar
+ordering; but future work can figure out how to best work that out.
+
 Buffering build algorithm
 -------------------------
 
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index cc40e928e0a..3788a855c50 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -395,6 +395,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	}
 
 	so->nPageData = so->curPageData = 0;
+	Assert(so->pagePin == InvalidBuffer);
 	scan->xs_hitup = NULL;		/* might point into pageDataCxt */
 	if (so->pageDataCxt)
 		MemoryContextReset(so->pageDataCxt);
@@ -460,6 +461,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 			so->pageData[so->nPageData].heapPtr = it->t_tid;
 			so->pageData[so->nPageData].recheck = recheck;
 			so->pageData[so->nPageData].offnum = i;
+			so->pageData[so->nPageData].buffer = InvalidBuffer;
 
 			/*
 			 * In an index-only scan, also fetch the data from the tuple.  The
@@ -471,6 +473,16 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 				so->pageData[so->nPageData].recontup =
 					gistFetchTuple(giststate, r, it);
 				MemoryContextSwitchTo(oldcxt);
+
+				/*
+				 * Only maintain a single additional buffer pin for unordered
+				 * IOS scans; as we have all data already in one place.
+				 */
+				if (so->nPageData == 0)
+				{
+					so->pagePin = buffer;
+					IncrBufferRefCount(buffer);
+				}
 			}
 			so->nPageData++;
 		}
@@ -501,7 +513,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 				 * In an index-only scan, also fetch the data from the tuple.
 				 */
 				if (scan->xs_want_itup)
+				{
 					item->data.heap.recontup = gistFetchTuple(giststate, r, it);
+					item->data.heap.buffer = buffer;
+					IncrBufferRefCount(buffer);
+				}
 			}
 			else
 			{
@@ -567,6 +583,10 @@ getNextNearest(IndexScanDesc scan)
 		/* free previously returned tuple */
 		pfree(scan->xs_hitup);
 		scan->xs_hitup = NULL;
+
+		Assert(BufferIsValid(so->pagePin));
+		ReleaseBuffer(so->pagePin);
+		so->pagePin = InvalidBuffer;
 	}
 
 	do
@@ -588,7 +608,11 @@ getNextNearest(IndexScanDesc scan)
 
 			/* in an index-only scan, also return the reconstructed tuple. */
 			if (scan->xs_want_itup)
+			{
+				Assert(BufferIsValid(item->data.heap.buffer));
 				scan->xs_hitup = item->data.heap.recontup;
+				so->pagePin = item->data.heap.buffer;
+			}
 			res = true;
 		}
 		else
@@ -704,6 +728,14 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 					so->killedItems[so->numKilled++] =
 						so->pageData[so->curPageData - 1].offnum;
 			}
+
+			if (scan->xs_want_itup && so->nPageData > 0)
+			{
+				Assert(BufferIsValid(so->pagePin));
+				ReleaseBuffer(so->pagePin);
+				so->pagePin = InvalidBuffer;
+			}
+
 			/* find and process the next index page */
 			do
 			{
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index 700fa959d03..932c2271510 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -110,6 +110,7 @@ gistbeginscan(Relation r, int nkeys, int norderbys)
 	so->numKilled = 0;
 	so->curBlkno = InvalidBlockNumber;
 	so->curPageLSN = InvalidXLogRecPtr;
+	so->pagePin = InvalidBuffer;
 
 	scan->opaque = so;
 
@@ -151,18 +152,73 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
 		Assert(so->queueCxt == so->giststate->scanCxt);
 		first_time = true;
 	}
-	else if (so->queueCxt == so->giststate->scanCxt)
-	{
-		/* second time through */
-		so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
-											 "GiST queue context",
-											 ALLOCSET_DEFAULT_SIZES);
-		first_time = false;
-	}
 	else
 	{
-		/* third or later time through */
-		MemoryContextReset(so->queueCxt);
+		/*
+		 * In the first scan of a query we allocate IOS items in the scan
+		 * context, which is never reset. To not leak this memory, we
+		 * manually free the queue entries.
+		 */
+		const bool freequeue = so->queueCxt == so->giststate->scanCxt;
+		/*
+		 * Index-only scans require that vacuum can't clean up entries that
+		 * we're still planning to return, so we hold a pin on the buffer until
+		 * we're past the returned item (1 pin count for every index tuple).
+		 * When rescan is called, however, we need to clean up the pins that
+		 * we still hold, lest we leak them and lose a buffer entry to that
+		 * page.
+		 */
+		const bool unpinqueue = scan->xs_want_itup;
+
+		if (freequeue || unpinqueue)
+		{
+			while (!pairingheap_is_empty(so->queue))
+			{
+				pairingheap_node *node;
+				GISTSearchItem *item;
+
+				node = pairingheap_remove_first(so->queue);
+				item = pairingheap_container(GISTSearchItem, phNode, node);
+
+				/*
+				 * If we need to unpin a buffer for IOS' heap items, do so
+				 * now.
+				 */
+				if (unpinqueue && item->blkno == InvalidBlockNumber)
+				{
+					Assert(BufferIsValid(item->data.heap.buffer));
+					ReleaseBuffer(item->data.heap.buffer);
+				}
+
+				/*
+				 * item->data.heap.recontup is stored in the separate memory
+				 * context so->pageDataCxt, which is always reset; so we don't
+				 * need to free that.
+				 * "item" itself is allocated into the queue context, which is
+				 * generally reset in rescan.
+				 * However, only in the first scan, we allocate these items
+				 * into the main scan context, which isn't reset; so we must
+				 * free these items, or else we'd leak the memory for the
+				 * duration of the query.
+				 */
+				if (freequeue)
+					pfree(item);
+			}
+		}
+
+		if (so->queueCxt == so->giststate->scanCxt)
+		{
+			/* second time through */
+			so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
+												 "GiST queue context",
+												 ALLOCSET_DEFAULT_SIZES);
+		}
+		else
+		{
+			/* third or later time through */
+			MemoryContextReset(so->queueCxt);
+		}
+
 		first_time = false;
 	}
 
@@ -341,6 +397,15 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
 
 	/* any previous xs_hitup will have been pfree'd in context resets above */
 	scan->xs_hitup = NULL;
+
+	if (scan->xs_want_itup)
+	{
+		if (BufferIsValid(so->pagePin))
+		{
+			ReleaseBuffer(so->pagePin);
+			so->pagePin = InvalidBuffer;
+		}
+	}
 }
 
 void
@@ -348,6 +413,36 @@ gistendscan(IndexScanDesc scan)
 {
 	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
 
+	if (scan->xs_want_itup)
+	{
+		if (BufferIsValid(so->pagePin))
+		{
+			ReleaseBuffer(so->pagePin);
+			so->pagePin = InvalidBuffer;
+		}
+
+		/* unpin any leftover buffers */
+		while (!pairingheap_is_empty(so->queue))
+		{
+			pairingheap_node *node;
+			GISTSearchItem *item;
+
+			/*
+			 * Note: unlike gistrescan, there is no need to actually free the
+			 * items here, as that's handled by memory context reset in the
+			 * call to freeGISTstate() below.
+			 */
+			node = pairingheap_remove_first(so->queue);
+			item = pairingheap_container(GISTSearchItem, phNode, node);
+
+			if (item->blkno == InvalidBlockNumber)
+			{
+				Assert(BufferIsValid(item->data.heap.buffer));
+				ReleaseBuffer(item->data.heap.buffer);
+			}
+		}
+	}
+
 	/*
 	 * freeGISTstate is enough to clean up everything made by gistbeginscan,
 	 * as well as the queueCxt if there is a separate context for it.
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index fe0bfb781ca..840b3d586ed 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -289,10 +289,10 @@ restart:
 								info->strategy);
 
 	/*
-	 * We are not going to stay here for a long time, aggressively grab an
-	 * exclusive lock.
+	 * We are not going to stay here for a long time, aggressively grab a
+	 * cleanup lock.
 	 */
-	LockBuffer(buffer, GIST_EXCLUSIVE);
+	LockBufferForCleanup(buffer);
 	page = (Page) BufferGetPage(buffer);
 
 	if (gistPageRecyclable(page))
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 39404ec7cdb..e559117e7d7 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -124,6 +124,7 @@ typedef struct GISTSearchHeapItem
 								 * index-only scans */
 	OffsetNumber offnum;		/* track offset in page to mark tuple as
 								 * LP_DEAD */
+	Buffer		buffer;			/* buffer to unpin, when IOS */
 } GISTSearchHeapItem;
 
 /* Unvisited item, either index page or heap tuple */
@@ -176,6 +177,7 @@ typedef struct GISTScanOpaqueData
 	OffsetNumber curPageData;	/* next item to return */
 	MemoryContext pageDataCxt;	/* context holding the fetched tuples, for
 								 * index-only scans */
+	Buffer		pagePin;		/* buffer of page, if pinned */
 } GISTScanOpaqueData;
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
-- 
2.43.0

