From 86ca31af76acd71fce3fd36ddf25f63d6d699b77 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Sat, 4 Jan 2025 01:02:15 +0100
Subject: [PATCH v3 2/2] RFC: Extend buffer pin lifetime for GIST IOS

This should fix issues with incorrect results when IOS encounters concurrent vacuum.
---
 src/include/access/gist_private.h    |  16 ++++
 src/backend/access/gist/gistget.c    | 116 ++++++++++++++++++++++++++-
 src/backend/access/gist/gistscan.c   |  60 ++++++++++++++
 src/backend/access/gist/gistvacuum.c |   4 +-
 4 files changed, 192 insertions(+), 4 deletions(-)

diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 7b8749c8db0..cf5fd4336c7 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -17,6 +17,7 @@
 #include "access/amapi.h"
 #include "access/gist.h"
 #include "access/itup.h"
+#include "lib/ilist.h"
 #include "lib/pairingheap.h"
 #include "storage/bufmgr.h"
 #include "storage/buffile.h"
@@ -124,6 +125,7 @@ typedef struct GISTSearchHeapItem
 								 * index-only scans */
 	OffsetNumber offnum;		/* track offset in page to mark tuple as
 								 * LP_DEAD */
+	uint32		pagepinOffset;	/* Pinned page's offset into GISTScanOpaqueData.pinned */
 } GISTSearchHeapItem;
 
 /* Unvisited item, either index page or heap tuple */
@@ -148,6 +150,13 @@ typedef struct GISTSearchItem
 	(offsetof(GISTSearchItem, distances) + \
 	 sizeof(IndexOrderByDistance) * (n_distances))
 
+typedef struct GISTPagePin
+{
+	slist_node	nextFree;
+	Buffer		buffer;			/* pinned buffer */
+	int			count;			/* number of results not yet returned */
+} GISTPagePin;
+
 /*
  * GISTScanOpaqueData: private state for a scan of a GiST index
  */
@@ -176,6 +185,12 @@ typedef struct GISTScanOpaqueData
 	OffsetNumber curPageData;	/* next item to return */
 	MemoryContext pageDataCxt;	/* context holding the fetched tuples, for
 								 * index-only scans */
+
+	GISTPagePin *pinned;		/* page pins, used in index-only scans.
+								 * otherwise NULL */
+	slist_head	nextFreePin;	/* next free page pin, if available */
+	BlockNumber	pincapacity;	/* current max pin count in pinned */
+	uint32		releasePinOffset;	/* reduce pin count on this buffer next */
 } GISTScanOpaqueData;
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
@@ -463,6 +478,7 @@ extern XLogRecPtr gistXLogAssignLSN(void);
 extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir);
 extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern bool gistcanreturn(Relation index, int attno);
+extern uint32 gistgetpin(GISTScanOpaque opaque);
 
 /* gistvalidate.c */
 extern bool gistvalidate(Oid opclassoid);
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index b35b8a97577..779478a44b2 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -337,6 +337,8 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	OffsetNumber maxoff;
 	OffsetNumber i;
 	MemoryContext oldcxt;
+	GISTPagePin *pin = NULL;
+	uint32		pagepinoff;
 
 	Assert(!GISTSearchItemIsHeap(*pageItem));
 
@@ -471,6 +473,16 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 				so->pageData[so->nPageData].recontup =
 					gistFetchTuple(giststate, r, it);
 				MemoryContextSwitchTo(oldcxt);
+
+				if (pin == NULL)
+				{
+					pagepinoff = gistgetpin(so);
+					pin = &so->pinned[pagepinoff];
+					pin->buffer = buffer;
+					pin->count = 0;
+				}
+				so->pageData[so->nPageData].pagepinOffset = pagepinoff;
+				pin->count++;
 			}
 			so->nPageData++;
 		}
@@ -501,7 +513,19 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 				 * In an index-only scan, also fetch the data from the tuple.
 				 */
 				if (scan->xs_want_itup)
+				{
 					item->data.heap.recontup = gistFetchTuple(giststate, r, it);
+
+					if (!PointerIsValid(pin))
+					{
+						pagepinoff = gistgetpin(so);
+						pin = &so->pinned[pagepinoff];
+						pin->buffer = buffer;
+						pin->count = 0;
+					}
+					pin->count++;
+					item->data.heap.pagepinOffset = pagepinoff;
+				}
 			}
 			else
 			{
@@ -526,7 +550,67 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 		}
 	}
 
-	UnlockReleaseBuffer(buffer);
+	if (scan->xs_want_itup && GistPageIsLeaf(page) && PointerIsValid(pin))
+	{
+		Assert(pin->count > 0);
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		/* Pin to be released when scan pin->count reaches 0 */
+	}
+	else
+		UnlockReleaseBuffer(buffer);
+}
+
+uint32
+gistgetpin(GISTScanOpaque opaque)
+{
+	slist_node *node;
+	GISTPagePin *nextFree;
+
+	if (opaque->pincapacity == 0)
+	{
+		opaque->pincapacity = 1;
+		opaque->pinned = palloc0(sizeof(GISTPagePin));
+
+		return 0;
+	}
+
+	if (slist_is_empty(&opaque->nextFreePin))
+	{
+		BlockNumber firstEntry = opaque->pincapacity;
+
+		/*
+		 * We don't have any entries in the linked list, so repalloc is used
+		 * safely here.  (If there were entries left, they'd be stale
+		 * references after this.)
+		 *
+		 * Note that before overflowing uint32 we'll always first fail the
+		 * repalloc(), which has a limit of ~1GiB - much less than the 16GiB
+		 * that we'd have to consume before we'd overflow pincapacity.
+		 */
+		opaque->pincapacity *= 2;
+
+		opaque->pinned = repalloc(opaque->pinned,
+								  mul_size(sizeof(GISTPagePin),
+										   opaque->pincapacity));
+
+		/*
+		 * Fill nextFreePin list back-to-front, to push higher indexes into
+		 * the back of the slist.
+		 */
+		for (BlockNumber i = opaque->pincapacity - 1; i >= firstEntry; i--)
+		{
+			slist_push_head(&opaque->nextFreePin,
+							&opaque->pinned[i].nextFree);
+			opaque->pinned[i].buffer = InvalidBuffer;
+			opaque->pinned[i].count = 0;
+		}
+	}
+
+	node = slist_pop_head_node(&opaque->nextFreePin);
+	nextFree = slist_container(GISTPagePin, nextFree, node);
+	nextFree->count = 0;
+	nextFree->buffer = 0;
+	return (uint32) (nextFree - opaque->pinned);
 }
 
 /*
@@ -588,7 +672,10 @@ getNextNearest(IndexScanDesc scan)
 
 			/* in an index-only scan, also return the reconstructed tuple. */
 			if (scan->xs_want_itup)
+			{
 				scan->xs_hitup = item->data.heap.recontup;
+				so->releasePinOffset = item->data.heap.pagepinOffset;
+			}
 			res = true;
 		}
 		else
@@ -637,6 +724,28 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 		gistScanPage(scan, &fakeItem, NULL, NULL, NULL);
 	}
 
+	if (so->releasePinOffset != UINT32_MAX)
+	{
+		GISTPagePin *pin;
+
+		Assert(so->pincapacity > so->releasePinOffset);
+
+		pin = &so->pinned[so->releasePinOffset];
+
+		Assert(pin->count > 0);
+		Assert(BufferIsValid(pin->buffer));
+
+		pin->count--;
+
+		if (pin->count == 0)
+		{
+			ReleaseBuffer(pin->buffer);
+			pin->buffer = InvalidBuffer;
+			pin->count = 0;
+			slist_push_head(&so->nextFreePin, &pin->nextFree);
+		}
+	}
+
 	if (scan->numberOfOrderBys > 0)
 	{
 		/* Must fetch tuples in strict distance order */
@@ -651,7 +760,6 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 			{
 				if (scan->kill_prior_tuple && so->curPageData > 0)
 				{
-
 					if (so->killedItems == NULL)
 					{
 						MemoryContext oldCxt =
@@ -673,7 +781,11 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 
 				/* in an index-only scan, also return the reconstructed tuple */
 				if (scan->xs_want_itup)
+				{
 					scan->xs_hitup = so->pageData[so->curPageData].recontup;
+					so->releasePinOffset =
+						so->pageData[so->curPageData].pagepinOffset;
+				}
 
 				so->curPageData++;
 
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index de472e16373..738fd0d4ace 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -111,6 +111,11 @@ gistbeginscan(Relation r, int nkeys, int norderbys)
 	so->curBlkno = InvalidBlockNumber;
 	so->curPageLSN = InvalidXLogRecPtr;
 
+	so->pincapacity = 0;
+	so->pinned = NULL;
+	slist_init(&so->nextFreePin);
+	so->releasePinOffset = UINT32_MAX;
+
 	scan->opaque = so;
 
 	/*
@@ -209,6 +214,34 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
 												ALLOCSET_DEFAULT_SIZES);
 	}
 
+	if (PointerIsValid(so->pinned))
+	{
+		Assert(scan->xs_want_itup);
+
+		slist_init(&so->nextFreePin);
+
+		/*
+		 * Fill nextFreePin list back-to-front, to push higher indexes into
+		 * the back of the slist.
+		 */
+		for (int p = so->pincapacity - 1; p >= 0; p--)
+		{
+			GISTPagePin *pin = &so->pinned[p];
+
+			if (BufferIsValid(pin->buffer))
+			{
+				Assert(pin->count > 0);
+				ReleaseBuffer(pin->buffer);
+				pin->buffer = InvalidBuffer;
+				pin->count = 0;
+			}
+
+			slist_push_head(&so->nextFreePin, &pin->nextFree);
+		}
+
+		so->releasePinOffset = UINT32_MAX;
+	}
+
 	/* create new, empty pairing heap for search queue */
 	oldCxt = MemoryContextSwitchTo(so->queueCxt);
 	so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan);
@@ -348,6 +381,33 @@ gistendscan(IndexScanDesc scan)
 {
 	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
 
+	if (PointerIsValid(so->pinned))
+	{
+		Assert(scan->xs_want_itup);
+		slist_init(&so->nextFreePin);
+
+		/*
+		 * Fill nextFreePin list back-to-front, to push higher indexes into
+		 * the back of the slist.
+		 */
+		for (int p = so->pincapacity - 1; p >= 0; p--)
+		{
+			GISTPagePin *pin = &so->pinned[p];
+
+			if (BufferIsValid(pin->buffer))
+			{
+				Assert(pin->count > 0);
+				ReleaseBuffer(pin->buffer);
+				pin->buffer = InvalidBuffer;
+				pin->count = 0;
+			}
+
+			slist_push_head(&so->nextFreePin, &pin->nextFree);
+		}
+
+		so->releasePinOffset = UINT32_MAX;
+	}
+
 	/*
 	 * freeGISTstate is enough to clean up everything made by gistbeginscan,
 	 * as well as the queueCxt if there is a separate context for it.
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 24fb94f473e..b0f0401a8ae 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -290,9 +290,9 @@ restart:
 
 	/*
 	 * We are not going to stay here for a long time, aggressively grab an
-	 * exclusive lock.
+	 * exclusive lock for cleanup.
 	 */
-	LockBuffer(buffer, GIST_EXCLUSIVE);
+	LockBufferForCleanup(buffer);
 	page = (Page) BufferGetPage(buffer);
 
 	if (gistPageRecyclable(page))
-- 
2.45.2

