From 21e5d4b629cca1ad3416efe6a3e978cca244b368 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Tue, 17 Jul 2018 22:34:58 +0400
Subject: [PATCH 2/2] Physical GiST scan during VACUUM v10

---
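The core idea: instead of descending from the root, read the whole index
in physical order and record, for each leaf, either a link to its parent
(when the internal page was visited first) or an "empty leaf" flag (when
the leaf was visited first); afterwards, rescan the internal pages that
hold downlinks to empty leaves. Below is a standalone toy sketch of that
bookkeeping, for illustration only: the types, names, and four-page
topology are invented and appear nowhere in the patch itself.

#include <stdio.h>
#include <stdbool.h>

#define NPAGES        4
#define PS_HAS_PARENT 1
#define PS_EMPTY_LEAF 2

typedef struct
{
	int		parent;			/* parent block, valid if PS_HAS_PARENT is set */
	int		parentoffset;	/* downlink offset inside the parent */
	int		flags;
	bool	rescan[NPAGES];	/* downlinks to rescan (toy: indexed by child) */
} PSItem;

static PSItem graph[NPAGES];

/* Toy topology: page 0 is internal, pages 1..3 are leaves, 1 and 3 empty */
static bool is_internal(int blkno)   { return blkno == 0; }
static bool is_empty_leaf(int blkno) { return blkno == 1 || blkno == 3; }

int
main(void)
{
	/*
	 * Scan pages in physical order.  Here the internal page comes first,
	 * so its children are marked PS_HAS_PARENT before they are visited;
	 * with a layout where a leaf precedes its parent, the leaf would flag
	 * itself PS_EMPTY_LEAF and the parent would pick it up later.
	 */
	for (int blkno = 0; blkno < NPAGES; blkno++)
	{
		if (is_internal(blkno))
		{
			for (int child = 1; child < NPAGES; child++)
			{
				if (graph[child].flags & PS_EMPTY_LEAF)
					graph[blkno].rescan[child] = true;	/* child seen first */
				else
				{
					graph[child].parent = blkno;		/* parent seen first */
					graph[child].parentoffset = child;
					graph[child].flags |= PS_HAS_PARENT;
				}
			}
		}
		else if (is_empty_leaf(blkno))
		{
			if (graph[blkno].flags & PS_HAS_PARENT)
				graph[graph[blkno].parent].rescan[blkno] = true;
			else
				graph[blkno].flags |= PS_EMPTY_LEAF;
		}
	}

	/* Report the downlinks that would be rescanned under an exclusive lock */
	for (int blkno = 0; blkno < NPAGES; blkno++)
		for (int child = 0; child < NPAGES; child++)
			if (graph[blkno].rescan[child])
				printf("rescan page %d, downlink to empty leaf %d\n",
					   blkno, child);
	return 0;
}
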
 src/backend/access/gist/gistvacuum.c | 366 +++++++++++++++++++++++++++++++----
 1 file changed, 326 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 8d97c44..778c806 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -103,8 +103,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 typedef struct GistBDItem
 {
-	GistNSN		parentlsn;
-	BlockNumber blkno;
+	GistNSN		 parentlsn;
+	BlockNumber  blkno;
+	OffsetNumber parentoffset;
 	struct GistBDItem *next;
 } GistBDItem;
 
@@ -129,30 +130,232 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 }
 
 /*
- * Bulk deletion of all index entries pointing to a set of heap tuples and
- * check invalid tuples left after upgrade.
- * The set of target tuples is specified via a callback routine that tells
- * whether any given heap tuple (identified by ItemPointer) is being deleted.
- *
- * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ * During a physical scan, for every parent-child pair we can encounter
+ * either the parent or the child first.  Whenever we open an internal page,
+ * we record its block number as the parent of every child and set
+ * GIST_PS_HAS_PARENT on the child.  If such a child page later turns out to
+ * be empty, we get back to the parent through that link.  If we find a child
+ * first (still without a parent link), we mark the page GIST_PS_EMPTY_LEAF
+ * when it is ready to be deleted; when we later scan its parent, we add it
+ * to the rescan list.
  */
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+#define GIST_PS_HAS_PARENT 1
+#define GIST_PS_EMPTY_LEAF 2
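+
+/*
+ * Example (block numbers are illustrative only): suppose leaf 7 hangs
+ * under internal page 3.  If page 3 is scanned first, leaf 7 is marked
+ * GIST_PS_HAS_PARENT with parent = 3, and when leaf 7 turns out to be
+ * empty its offset is appended straight to page 3's emptyLeafOffsets.
+ * If leaf 7 is scanned first, it marks itself GIST_PS_EMPTY_LEAF and
+ * page 3 picks it up while walking its downlinks.
+ */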
+
+/* Physical scan item */
+typedef struct GistPSItem
 {
-	Relation	rel = info->index;
-	GistBDItem *stack,
-			   *ptr;
-	BlockNumber recentParent = InvalidBlockNumber;
-	List	   *rescanList = NULL;
-	ListCell   *cell;
+	BlockNumber  parent;
+	List*        emptyLeafOffsets;
+	OffsetNumber parentOffset;
+	uint16       flags;
+} GistPSItem;
+
+/* Blocknumber of internal pages with offsets to rescan for deletion */
+typedef struct GistRescanItem
+{
+	BlockNumber       blkno;
+	List*             emptyLeafOffsets;
+	struct GistRescanItem* next;
+} GistRescanItem;
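+
+/*
+ * A GistRescanItem list is produced by whichever scan strategy runs and is
+ * consumed by the rescan loop in gistbulkdelete(), which frees every node
+ * after processing it.
+ */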
 
-	/* first time through? */
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
-	stats->estimated_count = false;
-	stats->num_index_tuples = 0;
+static void
+gistbulkdeletephysicalcanpage(IndexVacuumInfo * info, IndexBulkDeleteResult * stats,
+								IndexBulkDeleteCallback callback, void* callback_state,
+								BlockNumber blkno, GistNSN startNSN, GistPSItem *graph)
+{
+	Relation	 rel = info->index;
+	Buffer		 buffer;
+	Page		 page;
+	OffsetNumber i,
+					maxoff;
+	IndexTuple   idxtuple;
+	ItemId	     iid;
+
+	/*
+	 * This is a recursive call that should almost never go deeper than
+	 * GIST_MAX_SPLIT_PAGES, but check anyway.
+	 */
+	check_stack_depth();
+
+	vacuum_delay_point();
+
+	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+								info->strategy);
+	/*
+	 * We are not going to stay here for long, even though recursive calls
+	 * can happen, so aggressively grab an exclusive lock right away, even
+	 * for an internal page.
+	 */
+	LockBuffer(buffer, GIST_EXCLUSIVE);
+	page = (Page) BufferGetPage(buffer);
+
+	if (PageIsNew(page) || GistPageIsDeleted(page))
+	{
+		UnlockReleaseBuffer(buffer);
+		/* TODO: shouldn't we record this page as free here? */
+		return;
+	}
+
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	if (GistPageIsLeaf(page))
+	{
+		OffsetNumber todelete[MaxOffsetNumber];
+		int			ntodelete = 0;
+		GISTPageOpaque opaque = GistPageGetOpaque(page);
+
+		/*
+		 * If this page was split after the start of the VACUUM, we have to
+		 * revisit the rightlink when it points to a block we have already
+		 * scanned.  The revisit is recursive and should not go deep, but we
+		 * check for the possibility of stack overflow anyway.
+		 */
+		if ((GistFollowRight(page) || startNSN < GistPageGetNSN(page)) &&
+			opaque->rightlink != InvalidBlockNumber && opaque->rightlink < blkno)
+		{
+			gistbulkdeletephysicalcanpage(info, stats, callback, callback_state,
+										  opaque->rightlink, startNSN, graph);
+		}
+
+		/*
+		 * Remove deletable tuples from page
+		 */
+
+		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+		{
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+			if (callback(&(idxtuple->t_tid), callback_state))
+				todelete[ntodelete++] = i;
+			else
+				stats->num_index_tuples += 1;
+		}
+
+		stats->tuples_removed += ntodelete;
+
+		/* We have dead tuples on the page */
+		if (ntodelete)
+		{
+			START_CRIT_SECTION();
+
+			MarkBufferDirty(buffer);
+
+			PageIndexMultiDelete(page, todelete, ntodelete);
+			GistMarkTuplesDeleted(page);
+
+			if (RelationNeedsWAL(rel))
+			{
+				XLogRecPtr	recptr;
+
+				recptr = gistXLogUpdate(buffer,
+										todelete, ntodelete,
+										NULL, 0, InvalidBuffer);
+				PageSetLSN(page, recptr);
+			}
+			else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+
+			END_CRIT_SECTION();
+		}
+
+		/* The page is completely empty */
+		if (ntodelete == maxoff)
+		{
+			/*
+			 * This page is a candidate to be deleted.  Remember its parent
+			 * to rescan it later with xlock.
+			 */
+			if (graph[blkno].flags & GIST_PS_HAS_PARENT)
+			{
+				/* Go to the parent and append this page's downlink offset */
+				BlockNumber parentblockno = graph[blkno].parent;
+
+				graph[parentblockno].emptyLeafOffsets =
+					lappend_int(graph[parentblockno].emptyLeafOffsets,
+								(int) graph[blkno].parentOffset);
+			}
+			else
+			{
+				/* The parent will collect this page later */
+				graph[blkno].flags |= GIST_PS_EMPTY_LEAF;
+			}
+		}
+	}
+	else
+	{
+		/* For internal pages, we remember the structure of the tree */
+		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+		{
+			BlockNumber childblkno;
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+			childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+			if (graph[childblkno].flags & GIST_PS_EMPTY_LEAF)
+			{
+				/* Child has been scanned earlier and is ready to be picked up */
+				graph[blkno].emptyLeafOffsets = lappend_int(graph[blkno].emptyLeafOffsets, i);
+			}
+			else
+			{
+				/* The leaf will be collected when the scan reaches it */
+				graph[childblkno].parent = blkno;
+				graph[childblkno].parentOffset = i;
+				graph[childblkno].flags |= GIST_PS_HAS_PARENT;
+			}
 
+			if (GistTupleIsInvalid(idxtuple))
+				ereport(LOG,
+						(errmsg("index \"%s\" contains an inner tuple marked as invalid",
+								RelationGetRelationName(rel)),
+							errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+							errhint("Please REINDEX it.")));
+		}
+	}
+	UnlockReleaseBuffer(buffer);
+}
+
+/* Read all pages sequentially, populating an array of GistPSItem */
+static GistRescanItem*
+gistbulkdeletephysicalcan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state, BlockNumber npages)
+{
+	GistRescanItem *result = NULL;
+	BlockNumber      blkno;
+	GistNSN			 startNSN = GetInsertRecPtr();	/* to detect concurrent splits */
+
+	/* Here we store the whole graph of the index */
+	GistPSItem *graph = palloc0(npages * sizeof(GistPSItem));
+
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		gistbulkdeletephysicalcanpage(info, stats, callback, callback_state, blkno, startNSN, graph);
+	}
+
+	/* Search for internal pages pointing to empty leaves */
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		if (graph[blkno].emptyLeafOffsets)
+		{
+			GistRescanItem *next = palloc(sizeof(GistRescanItem));
+			next->blkno = blkno;
+			next->emptyLeafOffsets = graph[blkno].emptyLeafOffsets;
+			next->next = result;
+			result = next;
+		}
+	}
+
+	pfree(graph);
+
+	return result;
+}
+
+/* The logical scan descends from the root to the leaves in DFS order */
+static GistRescanItem*
+gistbulkdeletelogicalscan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+{
+	Relation        rel = info->index;
+	BlockNumber     recentParent = InvalidBlockNumber;
+	GistBDItem     *stack,
+				   *ptr;
+	GistRescanItem *result = NULL;
+
+	/* This stack is used to organize DFS */
 	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
 	stack->blkno = GIST_ROOT_BLKNO;
 
@@ -237,11 +440,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				END_CRIT_SECTION();
 			}
 
-			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
-				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			if (ntodelete == maxoff && recentParent != InvalidBlockNumber)
 			{
 				/* This page is a candidate to be deleted. Remember its parent to rescan it later with xlock */
-				rescanList = lappend_int(rescanList, recentParent);
+				if (result == NULL || result->blkno != recentParent)
+				{
+					GistRescanItem *next = palloc(sizeof(GistRescanItem));
+					next->blkno = recentParent;
+					next->emptyLeafOffsets = NULL;
+					next->next = result;
+					result = next;
+				}
+				result->emptyLeafOffsets = lappend_int(result->emptyLeafOffsets, stack->parentoffset);
 			}
 		}
 		else
@@ -261,6 +471,7 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
 				ptr->parentlsn = BufferGetLSNAtomic(buffer);
 				ptr->next = stack->next;
+				ptr->parentoffset = i;
 				stack->next = ptr;
 
 				if (GistTupleIsInvalid(idxtuple))
@@ -281,20 +492,82 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		vacuum_delay_point();
 	}
 
-	/* rescan inner pages that had empty child pages */
-	foreach(cell,rescanList)
+	return result;
+}
+
+/*
+ * qsort comparator for offsets.  With the physical scan, the offsets to
+ * rescan are collected out of order, so they must be sorted before use.
+ * The subtraction cannot overflow because OffsetNumber is a uint16
+ * promoted to int.
+ */
+static int
+compare_offsetnumber(const void *x, const void *y)
+{
+	OffsetNumber a = *((OffsetNumber *)x);
+	OffsetNumber b = *((OffsetNumber *)y);
+	return a - b;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples and
+ * check invalid tuples left after upgrade.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+{
+	Relation		rel = info->index;
+	GistRescanItem *rescan;
+	BlockNumber		npages;
+	bool			needLock;
+
+	/* first time through? */
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+	/* we'll re-count the tuples each time */
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
+
+	/*
+	 * Need lock unless it's local to this backend.
+	 */
+	needLock = !RELATION_IS_LOCAL(rel);
+
+	/* try to find deleted pages */
+	if (needLock)
+		LockRelationForExtension(rel, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(rel);
+	if (needLock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+
+	/*
+	 * If the map of the whole graph fits into maintenance_work_mem (which
+	 * is in kilobytes, hence the multiplication by 1024), we can read the
+	 * index sequentially; otherwise fall back to the logical scan.  On a
+	 * 64-bit build a GistPSItem is roughly 24 bytes, so a 1 GB index of
+	 * 131072 pages needs about 3 MB for the map.
+	 */
+	if (npages * sizeof(GistPSItem) > (Size) maintenance_work_mem * 1024)
 	{
-		Buffer		buffer;
-		Page		page;
-		OffsetNumber i,
-					maxoff;
-		IndexTuple	idxtuple;
-		ItemId		iid;
-		OffsetNumber todelete[MaxOffsetNumber];
-		Buffer		buftodelete[MaxOffsetNumber];
-		int			ntodelete = 0;
+		rescan = gistbulkdeletelogicalscan(info, stats, callback, callback_state);
+	}
+	else
+	{
+		rescan = gistbulkdeletephysicalcan(info, stats, callback, callback_state, npages);
+	}
 
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+	/* rescan inner pages that had empty child pages */
+	while (rescan)
+	{
+		Buffer			 buffer;
+		Page			 page;
+		OffsetNumber 	 i,
+						 maxoff;
+		IndexTuple		 idxtuple;
+		ItemId			 iid;
+		OffsetNumber 	 todelete[MaxOffsetNumber];
+		Buffer			 buftodelete[MaxOffsetNumber];
+		int				 ntodelete = 0;
+		ListCell  		*cell;
+		GistRescanItem	*oldRescan;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, rescan->blkno,
 									RBM_NORMAL, info->strategy);
 		LockBuffer(buffer, GIST_EXCLUSIVE);
 		gistcheckpage(rel, buffer);
@@ -304,11 +577,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 
 		maxoff = PageGetMaxOffsetNumber(page);
 
-		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+		/* Check that the leaves are still empty and decide what to delete */
+		foreach(cell, rescan->emptyLeafOffsets)
 		{
 			Buffer		leafBuffer;
 			Page		leafPage;
 
+			i = (OffsetNumber) lfirst_int(cell);
+
+			/* The page may have changed since the scan; skip offsets past maxoff */
+			if (i > maxoff)
+				continue;
+
 			iid = PageGetItemId(page, i);
 			idxtuple = (IndexTuple) PageGetItem(page, iid);
 
@@ -333,7 +613,10 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 
 		if (ntodelete)
 		{
-			/*
+			/* Sort the possibly unordered offsets before deletion */
+			qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber);
+
+			/*
 			 * Like in _bt_unlink_halfdead_page we need an upper bound on the xid
 			 * that could hold downlinks to this page.  We use
 			 * ReadNewTransactionId() instead of GetCurrentTransactionId
@@ -378,11 +661,14 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		}
 
 		UnlockReleaseBuffer(buffer);
+		oldRescan = rescan;
+		rescan = rescan->next;
+		list_free(oldRescan->emptyLeafOffsets);
+		pfree(oldRescan);
 
 		vacuum_delay_point();
 	}
 
-	list_free(rescanList);
 
 	return stats;
 }
\ No newline at end of file
-- 
2.15.2 (Apple Git-101.1)

