From 8eb69789dc1d64ea238c8ed4529a20b884ca3d90 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 28 Oct 2018 22:20:58 +0500
Subject: [PATCH 1/2] Physical GiST scan in VACUUM v17

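Replace the logical, stack-based scan in gistbulkdelete() with a single
scan of the index in physical order, the way btvacuumscan() does it for
B-tree.  Pages that are split after the scan starts are detected by
comparing their NSN against the WAL insert position taken at the start
of the scan, and are revisited via their rightlinks if the scan has
already passed them.  gistvacuumcleanup() reuses the same scan to gather
statistics when gistbulkdelete() was not called.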
---
 src/backend/access/gist/gistvacuum.c | 423 +++++++++++++++------------
 1 file changed, 237 insertions(+), 186 deletions(-)

diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 5948218c77..b6b9ee0ae3 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -21,6 +21,34 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 
+/* Working state needed by gistvacuumscan and gistvacuumpage */
+typedef struct
+{
+	IndexVacuumInfo *info;
+	IndexBulkDeleteResult *stats;
+	IndexBulkDeleteCallback callback;
+	void	   *callback_state;
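+	/* insert position at scan start; newer page NSNs indicate splits */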
+	GistNSN		startNSN;
+	BlockNumber totFreePages;	/* true total # of free pages */
+} GistVacState;
+
+static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+			   IndexBulkDeleteCallback callback, void *callback_state);
+static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno,
+			   BlockNumber orig_blkno);
+
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+			   IndexBulkDeleteCallback callback, void *callback_state)
+{
+	/* allocate stats if first time through, else re-use existing struct */
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+	gistvacuumscan(info, stats, callback, callback_state);
+
+	return stats;
+}
 
 /*
  * VACUUM cleanup: update FSM
@@ -28,104 +56,36 @@
 IndexBulkDeleteResult *
 gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 {
-	Relation	rel = info->index;
-	BlockNumber npages,
-				blkno;
-	BlockNumber totFreePages;
-	double		tuplesCount;
-	bool		needLock;
-
 	/* No-op in ANALYZE ONLY mode */
 	if (info->analyze_only)
 		return stats;
 
-	/* Set up all-zero stats if gistbulkdelete wasn't called */
+	/*
+	 * If gistbulkdelete was called, we need not do anything; just return the
+	 * stats from the latest gistbulkdelete call.  If it wasn't called, we
+	 * still need to do a pass over the index to obtain index statistics.
+	 */
 	if (stats == NULL)
+	{
 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+		gistvacuumscan(info, stats, NULL, NULL);
+	}
 
 	/*
-	 * Need lock unless it's local to this backend.
+	 * It's quite possible for us to be fooled by concurrent page splits into
+	 * double-counting some index tuples, so disbelieve any total that exceeds
+	 * the underlying heap's count ... if we know that accurately.  Otherwise
+	 * this might just make matters worse.
 	 */
-	needLock = !RELATION_IS_LOCAL(rel);
-
-	/* try to find deleted pages */
-	if (needLock)
-		LockRelationForExtension(rel, ExclusiveLock);
-	npages = RelationGetNumberOfBlocks(rel);
-	if (needLock)
-		UnlockRelationForExtension(rel, ExclusiveLock);
-
-	totFreePages = 0;
-	tuplesCount = 0;
-	for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++)
+	if (!info->estimated_count)
 	{
-		Buffer		buffer;
-		Page		page;
-
-		vacuum_delay_point();
-
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-									info->strategy);
-		LockBuffer(buffer, GIST_SHARE);
-		page = (Page) BufferGetPage(buffer);
-
-		if (PageIsNew(page) || GistPageIsDeleted(page))
-		{
-			totFreePages++;
-			RecordFreeIndexPage(rel, blkno);
-		}
-		else if (GistPageIsLeaf(page))
-		{
-			/* count tuples in index (considering only leaf tuples) */
-			tuplesCount += PageGetMaxOffsetNumber(page);
-		}
-		UnlockReleaseBuffer(buffer);
+		if (stats->num_index_tuples > info->num_heap_tuples)
+			stats->num_index_tuples = info->num_heap_tuples;
 	}
 
-	/* Finally, vacuum the FSM */
-	IndexFreeSpaceMapVacuum(info->index);
-
-	/* return statistics */
-	stats->pages_free = totFreePages;
-	if (needLock)
-		LockRelationForExtension(rel, ExclusiveLock);
-	stats->num_pages = RelationGetNumberOfBlocks(rel);
-	if (needLock)
-		UnlockRelationForExtension(rel, ExclusiveLock);
-	stats->num_index_tuples = tuplesCount;
-	stats->estimated_count = false;
-
 	return stats;
 }
 
-typedef struct GistBDItem
-{
-	GistNSN		parentlsn;
-	BlockNumber blkno;
-	struct GistBDItem *next;
-} GistBDItem;
-
-static void
-pushStackIfSplited(Page page, GistBDItem *stack)
-{
-	GISTPageOpaque opaque = GistPageGetOpaque(page);
-
-	if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
-		(GistFollowRight(page) || stack->parentlsn < GistPageGetNSN(page)) &&
-		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
-	{
-		/* split page detected, install right link to the stack */
-
-		GistBDItem *ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
-
-		ptr->blkno = opaque->rightlink;
-		ptr->parentlsn = stack->parentlsn;
-		ptr->next = stack->next;
-		stack->next = ptr;
-	}
-}
-
-
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
  * check invalid tuples left after upgrade.
@@ -134,141 +94,232 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  *
- * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ * Results are added to the stats struct provided by the caller.
  */
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+static void
+gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 			   IndexBulkDeleteCallback callback, void *callback_state)
 {
 	Relation	rel = info->index;
-	GistBDItem *stack,
-			   *ptr;
+	GistVacState vstate;
+	BlockNumber num_pages;
+	bool		needLock;
+	BlockNumber	blkno;
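+	/* splits that happen after this point will carry a newer NSN */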
+	GistNSN startNSN = GetInsertRecPtr();
 
-	/* first time through? */
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
+	/*
+	 * Reset counts that will be incremented during the scan; needed in case
+	 * of multiple scans during a single VACUUM command
+	 */
 	stats->estimated_count = false;
 	stats->num_index_tuples = 0;
+	stats->pages_deleted = 0;
 
-	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
-	stack->blkno = GIST_ROOT_BLKNO;
+	/* Set up info to pass down to gistvacuumpage */
+	vstate.info = info;
+	vstate.stats = stats;
+	vstate.callback = callback;
+	vstate.callback_state = callback_state;
+	vstate.startNSN = startNSN;
+	vstate.totFreePages = 0;
 
-	while (stack)
+	/*
+	 * This logic is copied from btvacuumscan(); it holds for GiST as well,
+	 * because gistNewBuffer() also takes the relation-extension lock.
+	 *
+	 * The outer loop iterates over all index pages in
+	 * physical order (we hope the kernel will cooperate in providing
+	 * read-ahead for speed).  It is critical that we visit all leaf pages,
+	 * including ones added after we start the scan, else we might fail to
+	 * delete some deletable tuples.  Hence, we must repeatedly check the
+	 * relation length.  We must acquire the relation-extension lock while
+	 * doing so to avoid a race condition: if someone else is extending the
+	 * relation, there is a window where bufmgr/smgr have created a new
+	 * all-zero page but it hasn't yet been write-locked by gistNewBuffer(). If
+	 * we manage to scan such a page here, we'll improperly assume it can be
+	 * recycled.  Taking the lock synchronizes things enough to prevent a
+	 * problem: either num_pages won't include the new page, or gistNewBuffer
+	 * already has write lock on the buffer and it will be fully initialized
+	 * before we can examine it.  (See also vacuumlazy.c, which has the same
+	 * issue.)	Also, we need not worry if a page is added immediately after
+	 * we look; the page splitting code already has write-lock on the left
+	 * page before it adds a right page, so we must already have processed any
+	 * tuples due to be moved into such a page.
+	 *
+	 * We can skip locking for new or temp relations, however, since no one
+	 * else could be accessing them.
+	 */
+	needLock = !RELATION_IS_LOCAL(rel);
+
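+	/* GIST_ROOT_BLKNO is 0, so the physical scan covers every page */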
+	blkno = GIST_ROOT_BLKNO;
+	for (;;)
 	{
-		Buffer		buffer;
-		Page		page;
-		OffsetNumber i,
-					maxoff;
-		IndexTuple	idxtuple;
-		ItemId		iid;
-
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno,
-									RBM_NORMAL, info->strategy);
-		LockBuffer(buffer, GIST_SHARE);
-		gistcheckpage(rel, buffer);
-		page = (Page) BufferGetPage(buffer);
-
-		if (GistPageIsLeaf(page))
+		/* Get the current relation length */
+		if (needLock)
+			LockRelationForExtension(rel, ExclusiveLock);
+		num_pages = RelationGetNumberOfBlocks(rel);
+		if (needLock)
+			UnlockRelationForExtension(rel, ExclusiveLock);
+
+		/* Quit if we've scanned the whole relation */
+		if (blkno >= num_pages)
+			break;
+		/* Iterate over pages, then loop back to recheck length */
+		for (; blkno < num_pages; blkno++)
 		{
-			OffsetNumber todelete[MaxOffsetNumber];
-			int			ntodelete = 0;
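+			/* here orig_blkno == blkno; recursion may pass an older block */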
+			gistvacuumpage(&vstate, blkno, blkno);
+		}
+	}
 
-			LockBuffer(buffer, GIST_UNLOCK);
-			LockBuffer(buffer, GIST_EXCLUSIVE);
+	/*
+	 * If we found any recyclable pages (and recorded them in the FSM), then
+	 * forcibly update the upper-level FSM pages to ensure that searchers can
+	 * find them.  It's possible that the pages were also found during
+	 * previous scans and so this is a waste of time, but it's cheap enough
+	 * relative to scanning the index that it shouldn't matter much, and
+	 * making sure that free pages are available sooner not later seems
+	 * worthwhile.
+	 *
+	 * Note that if no recyclable pages exist, we don't bother vacuuming the
+	 * FSM at all.
+	 */
+	if (vstate.totFreePages > 0)
+		IndexFreeSpaceMapVacuum(rel);
 
-			page = (Page) BufferGetPage(buffer);
-			if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page))
-			{
-				/* only the root can become non-leaf during relock */
-				UnlockReleaseBuffer(buffer);
-				/* one more check */
-				continue;
-			}
+	/* update statistics */
+	stats->num_pages = num_pages;
+	stats->pages_free = vstate.totFreePages;
+}
 
-			/*
-			 * check for split proceeded after look at parent, we should check
-			 * it after relock
-			 */
-			pushStackIfSplited(page, stack);
+/*
+ * gistvacuumpage --- VACUUM one page
+ *
+ * This processes a single page for gistvacuumscan().  In some cases we
+ * must go back and re-examine previously-scanned pages; this routine
+ * recurses when necessary to handle that case.
+ *
+ * blkno is the page to process.  orig_blkno is the highest block number
+ * reached by the outer gistvacuumscan loop (the same as blkno, unless we
+ * are recursing to re-examine a previous page).
+ */
+static void
+gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
+{
+	IndexVacuumInfo *info = vstate->info;
+	IndexBulkDeleteResult *stats = vstate->stats;
+	IndexBulkDeleteCallback callback = vstate->callback;
+	void	   *callback_state = vstate->callback_state;
+	Relation	rel = info->index;
+	Buffer		buffer;
+	Page		page;
+	BlockNumber recurse_to;
 
-			/*
-			 * Remove deletable tuples from page
-			 */
+restart:
+	recurse_to = InvalidBlockNumber;
 
-			maxoff = PageGetMaxOffsetNumber(page);
+	/* call vacuum_delay_point while not holding any buffer lock */
+	vacuum_delay_point();
 
-			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
-			{
-				iid = PageGetItemId(page, i);
-				idxtuple = (IndexTuple) PageGetItem(page, iid);
+	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+								info->strategy);
+
+	/*
+	 * We are not going to stay here for long, so aggressively grab an
+	 * exclusive lock.
+	 */
+	LockBuffer(buffer, GIST_EXCLUSIVE);
+	page = (Page) BufferGetPage(buffer);
 
-				if (callback(&(idxtuple->t_tid), callback_state))
-					todelete[ntodelete++] = i;
-				else
-					stats->num_index_tuples += 1;
-			}
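+	/* a new or deleted page is recyclable; report it to the FSM */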
+	if (PageIsNew(page) || GistPageIsDeleted(page))
+	{
+		UnlockReleaseBuffer(buffer);
+		vstate->totFreePages++;
+		RecordFreeIndexPage(rel, blkno);
+		return;
+	}
 
-			stats->tuples_removed += ntodelete;
+	if (GistPageIsLeaf(page))
+	{
+		OffsetNumber todelete[MaxOffsetNumber];
+		int			ntodelete = 0;
+		GISTPageOpaque opaque = GistPageGetOpaque(page);
+		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+
+		/*
+		 * If this page was split after the start of the VACUUM, we have to
+		 * revisit its rightlink if it points to a block we have already
+		 * scanned; blocks at or beyond orig_blkno will be reached by the
+		 * outer loop anyway.
+		 */
+		if ((GistFollowRight(page) || vstate->startNSN < GistPageGetNSN(page)) &&
+			(opaque->rightlink != InvalidBlockNumber) && (opaque->rightlink < orig_blkno))
+		{
+			recurse_to = opaque->rightlink;
+		}
+
+		/*
+		 * Remove deletable tuples from the page.  The callback is NULL when
+		 * we are called from gistvacuumcleanup() just to collect statistics.
+		 */
+		if (callback)
+		{
+			OffsetNumber off;
 
-			if (ntodelete)
+			for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
 			{
-				START_CRIT_SECTION();
+				ItemId		iid = PageGetItemId(page, off);
+				IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
 
-				MarkBufferDirty(buffer);
+				if (callback(&(idxtuple->t_tid), callback_state))
+					todelete[ntodelete++] = off;
+			}
+		}
 
-				PageIndexMultiDelete(page, todelete, ntodelete);
-				GistMarkTuplesDeleted(page);
+		/* If any tuples are deletable, remove them and WAL-log the change */
+		if (ntodelete)
+		{
+			START_CRIT_SECTION();
 
-				if (RelationNeedsWAL(rel))
-				{
-					XLogRecPtr	recptr;
+			MarkBufferDirty(buffer);
 
-					recptr = gistXLogUpdate(buffer,
-											todelete, ntodelete,
-											NULL, 0, InvalidBuffer);
-					PageSetLSN(page, recptr);
-				}
-				else
-					PageSetLSN(page, gistGetFakeLSN(rel));
+			PageIndexMultiDelete(page, todelete, ntodelete);
+			GistMarkTuplesDeleted(page);
 
-				END_CRIT_SECTION();
+			if (RelationNeedsWAL(rel))
+			{
+				XLogRecPtr	recptr;
+
+				recptr = gistXLogUpdate(buffer,
+										todelete, ntodelete,
+										NULL, 0, InvalidBuffer);
+				PageSetLSN(page, recptr);
 			}
+			else
+				PageSetLSN(page, gistGetFakeLSN(rel));
 
-		}
-		else
-		{
-			/* check for split proceeded after look at parent */
-			pushStackIfSplited(page, stack);
+			END_CRIT_SECTION();
 
+			stats->tuples_removed += ntodelete;
+			/* must recompute maxoff */
 			maxoff = PageGetMaxOffsetNumber(page);
-
-			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
-			{
-				iid = PageGetItemId(page, i);
-				idxtuple = (IndexTuple) PageGetItem(page, iid);
-
-				ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
-				ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
-				ptr->parentlsn = BufferGetLSNAtomic(buffer);
-				ptr->next = stack->next;
-				stack->next = ptr;
-
-				if (GistTupleIsInvalid(idxtuple))
-					ereport(LOG,
-							(errmsg("index \"%s\" contains an inner tuple marked as invalid",
-									RelationGetRelationName(rel)),
-							 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
-							 errhint("Please REINDEX it.")));
-			}
 		}
 
-		UnlockReleaseBuffer(buffer);
-
-		ptr = stack->next;
-		pfree(stack);
-		stack = ptr;
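+		/* the tuples remaining on this leaf page are live index tuples */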
+		stats->num_index_tuples += maxoff - FirstOffsetNumber + 1;
 
-		vacuum_delay_point();
 	}
 
-	return stats;
+	UnlockReleaseBuffer(buffer);
+
+	/*
+	 * This is really tail recursion, but if the compiler is too stupid to
+	 * optimize it as such, we'd eat an uncomfortably large amount of stack
+	 * space per recursion level (due to the todelete[] array). A failure is
+	 * improbable since the number of levels isn't likely to be large ... but
+	 * just in case, let's hand-optimize into a loop.
+	 */
+	if (recurse_to != InvalidBlockNumber)
+	{
+		blkno = recurse_to;
+		goto restart;
+	}
 }
-- 
2.17.1 (Apple Git-112)

