On 02/01/2019 17:35, Heikki Linnakangas wrote:
On 28/10/2018 19:32, Andrey Borodin wrote:
Hi everyone!

2 окт. 2018 г., в 6:14, Michael Paquier <mich...@paquier.xyz> написал(а):
Andrey, your latest patch does not apply.  I am moving this to the next
CF, waiting for your input.

I'm doing preps for CF.
Here's rebased version.

Thanks, I had another look at these.

Here are new patch versions, with the fixes I mentioned. Forgot to attach these earlier.

- Heikki
>From 46456dbf1d07aaf3e6963035a02aaa060decace3 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amboro...@acm.org>
Date: Sun, 28 Oct 2018 22:20:58 +0500
Subject: [PATCH 1/2] Physical GiST scan in VACUUM v18-heikki

---
 src/backend/access/gist/gist.c       |   8 +-
 src/backend/access/gist/gistvacuum.c | 430 +++++++++++++++------------
 2 files changed, 247 insertions(+), 191 deletions(-)

diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index a2cb84800e8..d42e810c6b3 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,7 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
 				 bool unlockbuf, bool unlockleftchild);
 static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
 				GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+static void gistprunepage(Relation rel, Page page, Buffer buffer,
 			   Relation heapRel);
 
 
@@ -261,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 	 */
 	if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
 	{
-		gistvacuumpage(rel, page, buffer, heapRel);
+		gistprunepage(rel, page, buffer, heapRel);
 		is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
 	}
 
@@ -1544,11 +1544,11 @@ freeGISTstate(GISTSTATE *giststate)
 }
 
 /*
- * gistvacuumpage() -- try to remove LP_DEAD items from the given page.
+ * gistprunepage() -- try to remove LP_DEAD items from the given page.
  * Function assumes that buffer is exclusively locked.
  */
 static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
+gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
 {
 	OffsetNumber deletable[MaxIndexTuplesPerPage];
 	int			ndeletable = 0;
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 5948218c779..4fb32bf76bf 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -21,6 +21,34 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 
+/* Working state needed by gistbulkdelete */
+typedef struct
+{
+	IndexVacuumInfo *info;
+	IndexBulkDeleteResult *stats;
+	IndexBulkDeleteCallback callback;
+	void	   *callback_state;
+	GistNSN		startNSN;
+	BlockNumber totFreePages;	/* true total # of free pages */
+} GistVacState;
+
+static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+			   IndexBulkDeleteCallback callback, void *callback_state);
+static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno,
+			   BlockNumber orig_blkno);
+
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+			   IndexBulkDeleteCallback callback, void *callback_state)
+{
+	/* allocate stats if first time through, else re-use existing struct */
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+	gistvacuumscan(info, stats, callback, callback_state);
+
+	return stats;
+}
 
 /*
  * VACUUM cleanup: update FSM
@@ -28,104 +56,36 @@
 IndexBulkDeleteResult *
 gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 {
-	Relation	rel = info->index;
-	BlockNumber npages,
-				blkno;
-	BlockNumber totFreePages;
-	double		tuplesCount;
-	bool		needLock;
-
 	/* No-op in ANALYZE ONLY mode */
 	if (info->analyze_only)
 		return stats;
 
-	/* Set up all-zero stats if gistbulkdelete wasn't called */
+	/*
+	 * If gistbulkdelete was called, we need not do anything, just return the
+	 * stats from the latest gistbulkdelete call.  If it wasn't called, we
+	 * still need to do a pass over the index, to obtain index statistics.
+	 */
 	if (stats == NULL)
+	{
 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+		gistvacuumscan(info, stats, NULL, NULL);
+	}
 
 	/*
-	 * Need lock unless it's local to this backend.
+	 * It's quite possible for us to be fooled by concurrent page splits into
+	 * double-counting some index tuples, so disbelieve any total that exceeds
+	 * the underlying heap's count ... if we know that accurately.  Otherwise
+	 * this might just make matters worse.
 	 */
-	needLock = !RELATION_IS_LOCAL(rel);
-
-	/* try to find deleted pages */
-	if (needLock)
-		LockRelationForExtension(rel, ExclusiveLock);
-	npages = RelationGetNumberOfBlocks(rel);
-	if (needLock)
-		UnlockRelationForExtension(rel, ExclusiveLock);
-
-	totFreePages = 0;
-	tuplesCount = 0;
-	for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++)
+	if (!info->estimated_count)
 	{
-		Buffer		buffer;
-		Page		page;
-
-		vacuum_delay_point();
-
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-									info->strategy);
-		LockBuffer(buffer, GIST_SHARE);
-		page = (Page) BufferGetPage(buffer);
-
-		if (PageIsNew(page) || GistPageIsDeleted(page))
-		{
-			totFreePages++;
-			RecordFreeIndexPage(rel, blkno);
-		}
-		else if (GistPageIsLeaf(page))
-		{
-			/* count tuples in index (considering only leaf tuples) */
-			tuplesCount += PageGetMaxOffsetNumber(page);
-		}
-		UnlockReleaseBuffer(buffer);
+		if (stats->num_index_tuples > info->num_heap_tuples)
+			stats->num_index_tuples = info->num_heap_tuples;
 	}
 
-	/* Finally, vacuum the FSM */
-	IndexFreeSpaceMapVacuum(info->index);
-
-	/* return statistics */
-	stats->pages_free = totFreePages;
-	if (needLock)
-		LockRelationForExtension(rel, ExclusiveLock);
-	stats->num_pages = RelationGetNumberOfBlocks(rel);
-	if (needLock)
-		UnlockRelationForExtension(rel, ExclusiveLock);
-	stats->num_index_tuples = tuplesCount;
-	stats->estimated_count = false;
-
 	return stats;
 }
 
-typedef struct GistBDItem
-{
-	GistNSN		parentlsn;
-	BlockNumber blkno;
-	struct GistBDItem *next;
-} GistBDItem;
-
-static void
-pushStackIfSplited(Page page, GistBDItem *stack)
-{
-	GISTPageOpaque opaque = GistPageGetOpaque(page);
-
-	if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
-		(GistFollowRight(page) || stack->parentlsn < GistPageGetNSN(page)) &&
-		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
-	{
-		/* split page detected, install right link to the stack */
-
-		GistBDItem *ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
-
-		ptr->blkno = opaque->rightlink;
-		ptr->parentlsn = stack->parentlsn;
-		ptr->next = stack->next;
-		stack->next = ptr;
-	}
-}
-
-
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
  * check invalid tuples left after upgrade.
@@ -134,141 +94,237 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  *
  * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+static void
+gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 			   IndexBulkDeleteCallback callback, void *callback_state)
 {
 	Relation	rel = info->index;
-	GistBDItem *stack,
-			   *ptr;
+	GistVacState vstate;
+	BlockNumber num_pages;
+	bool		needLock;
+	BlockNumber blkno;
 
-	/* first time through? */
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
+	/*
+	 * Reset counts that will be incremented during the scan; needed in case
+	 * of multiple scans during a single VACUUM command.
+	 */
 	stats->estimated_count = false;
 	stats->num_index_tuples = 0;
+	stats->pages_deleted = 0;
+
+	/* Set up info to pass down to gistvacuumpage */
+	vstate.info = info;
+	vstate.stats = stats;
+	vstate.callback = callback;
+	vstate.callback_state = callback_state;
+	if (RelationNeedsWAL(rel))
+		vstate.startNSN = GetInsertRecPtr();
+	else
+		vstate.startNSN = gistGetFakeLSN(rel);
+	vstate.totFreePages = 0;
 
-	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
-	stack->blkno = GIST_ROOT_BLKNO;
-
-	while (stack)
-	{
-		Buffer		buffer;
-		Page		page;
-		OffsetNumber i,
-					maxoff;
-		IndexTuple	idxtuple;
-		ItemId		iid;
-
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno,
-									RBM_NORMAL, info->strategy);
-		LockBuffer(buffer, GIST_SHARE);
-		gistcheckpage(rel, buffer);
-		page = (Page) BufferGetPage(buffer);
-
-		if (GistPageIsLeaf(page))
-		{
-			OffsetNumber todelete[MaxOffsetNumber];
-			int			ntodelete = 0;
+	/*
+	 * Need lock unless it's local to this backend.
+	 */
+	needLock = !RELATION_IS_LOCAL(rel);
 
-			LockBuffer(buffer, GIST_UNLOCK);
-			LockBuffer(buffer, GIST_EXCLUSIVE);
+	/*
+	 * The outer loop iterates over all index pages, in physical order (we
+	 * hope the kernel will cooperate in providing read-ahead for speed).  It
+	 * is critical that we visit all leaf pages, including ones added after we
+	 * start the scan, else we might fail to delete some deletable tuples.
+	 * Hence, we must repeatedly check the relation length.  We must acquire
+	 * the relation-extension lock while doing so to avoid a race condition:
+	 * if someone else is extending the relation, there is a window where
+	 * bufmgr/smgr have created a new all-zero page but it hasn't yet been
+	 * write-locked by gistNewBuffer().  If we manage to scan such a page
+	 * here, we'll improperly assume it can be recycled.  Taking the lock
+	 * synchronizes things enough to prevent a problem: either num_pages won't
+	 * include the new page, or gistNewBuffer already has write lock on the
+	 * buffer and it will be fully initialized before we can examine it.  (See
+	 * also vacuumlazy.c, which has the same issue.)	Also, we need not worry
+	 * if a page is added immediately after we look; the page splitting code
+	 * already has write-lock on the left page before it adds a right page, so
+	 * we must already have processed any tuples due to be moved into such a
+	 * page.
+	 *
+	 * We can skip locking for new or temp relations, however, since no one
+	 * else could be accessing them.
+	 */
+	needLock = !RELATION_IS_LOCAL(rel);
 
-			page = (Page) BufferGetPage(buffer);
-			if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page))
-			{
-				/* only the root can become non-leaf during relock */
-				UnlockReleaseBuffer(buffer);
-				/* one more check */
-				continue;
-			}
+	blkno = GIST_ROOT_BLKNO;
+	for (;;)
+	{
+		/* Get the current relation length */
+		if (needLock)
+			LockRelationForExtension(rel, ExclusiveLock);
+		num_pages = RelationGetNumberOfBlocks(rel);
+		if (needLock)
+			UnlockRelationForExtension(rel, ExclusiveLock);
+
+		/* Quit if we've scanned the whole relation */
+		if (blkno >= num_pages)
+			break;
+		/* Iterate over pages, then loop back to recheck length */
+		for (; blkno < num_pages; blkno++)
+			gistvacuumpage(&vstate, blkno, blkno);
+	}
 
-			/*
-			 * check for split proceeded after look at parent, we should check
-			 * it after relock
-			 */
-			pushStackIfSplited(page, stack);
+	/*
+	 * If we found any recyclable pages (and recorded them in the FSM), then
+	 * forcibly update the upper-level FSM pages to ensure that searchers can
+	 * find them.  It's possible that the pages were also found during
+	 * previous scans and so this is a waste of time, but it's cheap enough
+	 * relative to scanning the index that it shouldn't matter much, and
+	 * making sure that free pages are available sooner not later seems
+	 * worthwhile.
+	 *
+	 * Note that if no recyclable pages exist, we don't bother vacuuming the
+	 * FSM at all.
+	 */
+	if (vstate.totFreePages > 0)
+		IndexFreeSpaceMapVacuum(rel);
 
-			/*
-			 * Remove deletable tuples from page
-			 */
+	/* update statistics */
+	stats->num_pages = num_pages;
+	stats->pages_free = vstate.totFreePages;
+}
 
-			maxoff = PageGetMaxOffsetNumber(page);
+/*
+ * gistvacuumpage --- VACUUM one page
+ *
+ * This processes a single page for gistbulkdelete().  In some cases we
+ * must go back and re-examine previously-scanned pages; this routine
+ * recurses when necessary to handle that case.
+ *
+ * blkno is the page to process.  orig_blkno is the highest block number
+ * reached by the outer gistvacuumscan loop (the same as blkno, unless we
+ * are recursing to re-examine a previous page).
+ */
+static void
+gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
+{
+	IndexVacuumInfo *info = vstate->info;
+	IndexBulkDeleteResult *stats = vstate->stats;
+	IndexBulkDeleteCallback callback = vstate->callback;
+	void	   *callback_state = vstate->callback_state;
+	Relation	rel = info->index;
+	Buffer		buffer;
+	Page		page;
+	BlockNumber recurse_to;
 
-			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
-			{
-				iid = PageGetItemId(page, i);
-				idxtuple = (IndexTuple) PageGetItem(page, iid);
+restart:
+	recurse_to = InvalidBlockNumber;
 
-				if (callback(&(idxtuple->t_tid), callback_state))
-					todelete[ntodelete++] = i;
-				else
-					stats->num_index_tuples += 1;
-			}
+	/* call vacuum_delay_point while not holding any buffer lock */
+	vacuum_delay_point();
 
-			stats->tuples_removed += ntodelete;
+	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+								info->strategy);
 
-			if (ntodelete)
-			{
-				START_CRIT_SECTION();
+	/*
+	 * We are not going to stay here for long, so aggressively grab an
+	 * exclusive lock.
+	 */
+	LockBuffer(buffer, GIST_EXCLUSIVE);
+	page = (Page) BufferGetPage(buffer);
 
-				MarkBufferDirty(buffer);
+	if (PageIsNew(page) || GistPageIsDeleted(page))
+	{
+		UnlockReleaseBuffer(buffer);
+		vstate->totFreePages++;
+		RecordFreeIndexPage(rel, blkno);
+		return;
+	}
 
-				PageIndexMultiDelete(page, todelete, ntodelete);
-				GistMarkTuplesDeleted(page);
+	if (GistPageIsLeaf(page))
+	{
+		OffsetNumber todelete[MaxOffsetNumber];
+		int			ntodelete = 0;
+		GISTPageOpaque opaque = GistPageGetOpaque(page);
+		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+
+		/*
+		 * Check whether we need to recurse back to earlier pages.  What we
+		 * are concerned about is a page split that happened since we started
+		 * the vacuum scan.  If the split moved some tuples to a lower page
+		 * then we might have missed 'em.  If so, set up for tail recursion.
+		 */
+		if ((GistFollowRight(page) ||
+			 vstate->startNSN < GistPageGetNSN(page)) &&
+			(opaque->rightlink != InvalidBlockNumber) &&
+			(opaque->rightlink < orig_blkno))
+		{
+			recurse_to = opaque->rightlink;
+		}
 
-				if (RelationNeedsWAL(rel))
-				{
-					XLogRecPtr	recptr;
+		/*
+		 * Scan over all items to see which ones need to be deleted according
+		 * to the callback function.
+		 */
+		if (callback)
+		{
+			OffsetNumber off;
 
-					recptr = gistXLogUpdate(buffer,
-											todelete, ntodelete,
-											NULL, 0, InvalidBuffer);
-					PageSetLSN(page, recptr);
-				}
-				else
-					PageSetLSN(page, gistGetFakeLSN(rel));
+			for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
+			{
+				ItemId		iid = PageGetItemId(page, off);
+				IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
 
-				END_CRIT_SECTION();
+				if (callback(&(idxtuple->t_tid), callback_state))
+					todelete[ntodelete++] = off;
 			}
-
 		}
-		else
+
+		/*
+		 * Apply any needed deletes.  We issue just one WAL record per page,
+		 * so as to minimize WAL traffic.
+		 */
+		if (ntodelete)
 		{
-			/* check for split proceeded after look at parent */
-			pushStackIfSplited(page, stack);
+			START_CRIT_SECTION();
 
-			maxoff = PageGetMaxOffsetNumber(page);
+			MarkBufferDirty(buffer);
 
-			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			PageIndexMultiDelete(page, todelete, ntodelete);
+			GistMarkTuplesDeleted(page);
+
+			if (RelationNeedsWAL(rel))
 			{
-				iid = PageGetItemId(page, i);
-				idxtuple = (IndexTuple) PageGetItem(page, iid);
-
-				ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
-				ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
-				ptr->parentlsn = BufferGetLSNAtomic(buffer);
-				ptr->next = stack->next;
-				stack->next = ptr;
-
-				if (GistTupleIsInvalid(idxtuple))
-					ereport(LOG,
-							(errmsg("index \"%s\" contains an inner tuple marked as invalid",
-									RelationGetRelationName(rel)),
-							 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
-							 errhint("Please REINDEX it.")));
+				XLogRecPtr	recptr;
+
+				recptr = gistXLogUpdate(buffer,
+										todelete, ntodelete,
+										NULL, 0, InvalidBuffer);
+				PageSetLSN(page, recptr);
 			}
-		}
+			else
+				PageSetLSN(page, gistGetFakeLSN(rel));
 
-		UnlockReleaseBuffer(buffer);
+			END_CRIT_SECTION();
+
+			stats->tuples_removed += ntodelete;
+			/* must recompute maxoff */
+			maxoff = PageGetMaxOffsetNumber(page);
+		}
 
-		ptr = stack->next;
-		pfree(stack);
-		stack = ptr;
+		stats->num_index_tuples += maxoff - FirstOffsetNumber + 1;
 
-		vacuum_delay_point();
 	}
 
-	return stats;
+	UnlockReleaseBuffer(buffer);
+
+	/*
+	 * This is really tail recursion, but if the compiler is too stupid to
+	 * optimize it as such, we'd eat an uncomfortably large amount of stack
+	 * space per recursion level (due to the deletable[] array). A failure is
+	 * improbable since the number of levels isn't likely to be large ... but
+	 * just in case, let's hand-optimize into a loop.
+	 */
+	if (recurse_to != InvalidBlockNumber)
+	{
+		blkno = recurse_to;
+		goto restart;
+	}
 }
-- 
2.19.2

>From 62fbe0ce5506e006b92dbfb07aee7414040d982f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Wed, 2 Jan 2019 16:00:16 +0200
Subject: [PATCH 2/2] Delete pages during GiST VACUUM v18-heikki

---
 src/backend/access/gist/README         |  14 +++
 src/backend/access/gist/gist.c         |  18 +++
 src/backend/access/gist/gistutil.c     |   3 +-
 src/backend/access/gist/gistvacuum.c   | 152 ++++++++++++++++++++++++-
 src/backend/access/gist/gistxlog.c     |  60 ++++++++++
 src/backend/access/rmgrdesc/gistdesc.c |   3 +
 src/backend/nodes/bitmapset.c          |  16 +++
 src/include/access/gist.h              |   3 +
 src/include/access/gist_private.h      |   7 +-
 src/include/access/gistxlog.h          |  10 +-
 src/include/nodes/bitmapset.h          |   1 +
 src/test/regress/expected/gist.out     |   4 +-
 src/test/regress/sql/gist.sql          |   4 +-
 13 files changed, 282 insertions(+), 13 deletions(-)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b81..c84359de310 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,20 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
 through buffers at a given level until all buffers at that level have been
 emptied, and then moves down to the next level.
 
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free,
+so that they can be reused for newly split pages. To find such pages, the
+function scans the index in physical order.
+
+The physical scan reads the entire index from the first page to the last. The
+scan maintains the information necessary to collect the block numbers of
+internal pages that need cleansing, and the block numbers of empty leaf pages.
+
+After the scan, each internal page is re-examined under an exclusive lock, and
+each potentially free leaf page under it is checked. gistbulkdelete() never
+deletes the last remaining downlink from an internal page, to preserve the
+balanced tree properties.
 
 Authors:
 	Teodor Sigaev	<teo...@sigaev.ru>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index d42e810c6b3..bbfd5a92b88 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -704,6 +704,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			/*
+			 * Currently, internal pages are not deleted during vacuum,
+			 * so we do not need to check here whether the page is deleted.
+			 */
+
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
@@ -838,6 +843,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
 				}
 			}
 
+			/*
+			 * Leaf pages can be left deleted, but still referenced, until
+			 * their space is reused. The downlink to this page may already
+			 * have been removed from the internal page, but this scan could
+			 * still have followed it.
+			 */
+			if(GistPageIsDeleted(stack->page))
+			{
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
+
 			/* now state.stack->(page, buffer and blkno) points to leaf page */
 
 			gistinserttuple(&state, stack, giststate, itup,
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 70627e5df66..adb316c6afa 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -23,6 +23,7 @@
 #include "storage/lmgr.h"
 #include "utils/float.h"
 #include "utils/syscache.h"
+#include "utils/snapmgr.h"
 #include "utils/lsyscache.h"
 
 
@@ -807,7 +808,7 @@ gistNewBuffer(Relation r)
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalDataXmin))
 				return buffer;	/* OK to use */
 
 			LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 4fb32bf76bf..bac6b8c77af 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,8 +16,10 @@
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/transam.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
+#include "nodes/bitmapset.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 
@@ -30,6 +32,10 @@ typedef struct
 	void	   *callback_state;
 	GistNSN		startNSN;
 	BlockNumber totFreePages;	/* true total # of free pages */
+	BlockNumber emptyPages;
+
+	Bitmapset  *internalPagesMap;
+	Bitmapset  *emptyLeafPagesMap;
 } GistVacState;
 
 static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
@@ -91,8 +97,6 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  * check invalid tuples left after upgrade.
  * The set of target tuples is specified via a callback routine that tells
  * whether any given heap tuple (identified by ItemPointer) is being deleted.
- *
- * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
 static void
 gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
@@ -122,6 +126,9 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	else
 		vstate.startNSN = gistGetFakeLSN(rel);
 	vstate.totFreePages = 0;
+	vstate.emptyPages = 0;
+	vstate.internalPagesMap = NULL;
+	vstate.emptyLeafPagesMap = NULL;
 
 	/*
 	 * Need lock unless it's local to this backend.
@@ -166,6 +173,12 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		/* Quit if we've scanned the whole relation */
 		if (blkno >= num_pages)
 			break;
+
+		if (!vstate.internalPagesMap)
+			vstate.internalPagesMap = bms_make_empty(num_pages);
+		if (!vstate.emptyLeafPagesMap)
+			vstate.emptyLeafPagesMap = bms_make_empty(num_pages);
+
 		/* Iterate over pages, then loop back to recheck length */
 		for (; blkno < num_pages; blkno++)
 			gistvacuumpage(&vstate, blkno, blkno);
@@ -189,6 +202,126 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	/* update statistics */
 	stats->num_pages = num_pages;
 	stats->pages_free = vstate.totFreePages;
+
+	/* rescan inner pages that had empty child pages */
+	if (vstate.emptyPages > 0)
+	{
+		int			x;
+
+		x = -1;
+		while (vstate.emptyPages > 0 &&
+			   (x = bms_next_member(vstate.internalPagesMap, x)) >= 0)
+		{
+			Buffer		buffer;
+			Page		page;
+			OffsetNumber off,
+				maxoff;
+			IndexTuple  idxtuple;
+			ItemId	    iid;
+			OffsetNumber todelete[MaxOffsetNumber];
+			Buffer		buftodelete[MaxOffsetNumber];
+			int			ntodelete = 0;
+
+			/* FIXME: 'x' is signed, so this will not work with indexes larger than 2^31 blocks */
+			blkno = (BlockNumber) x;
+
+			buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+										info->strategy);
+
+			LockBuffer(buffer, GIST_EXCLUSIVE);
+			page = (Page) BufferGetPage(buffer);
+			if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
+			{
+				UnlockReleaseBuffer(buffer);
+				continue;
+			}
+
+			maxoff = PageGetMaxOffsetNumber(page);
+			/* Check that leafs are still empty and decide what to delete */
+			for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
+			{
+				Buffer		leafBuffer;
+				Page		leafPage;
+				BlockNumber leafBlockNo;
+
+				iid = PageGetItemId(page, off);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+				/* if this page was not empty in previous scan - we do not consider it */
+				leafBlockNo = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+				if (!bms_is_member(leafBlockNo, vstate.emptyLeafPagesMap))
+					continue;
+
+				leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leafBlockNo,
+												RBM_NORMAL, info->strategy);
+				LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+				gistcheckpage(rel, leafBuffer);
+				leafPage = (Page) BufferGetPage(leafBuffer);
+				if (!GistPageIsLeaf(leafPage))
+				{
+					UnlockReleaseBuffer(leafBuffer);
+					continue;
+				}
+
+				if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+					&& !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+					&& ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+				{
+					buftodelete[ntodelete] = leafBuffer;
+					todelete[ntodelete++] = off;
+				}
+				else
+					UnlockReleaseBuffer(leafBuffer);
+			}
+
+			if (ntodelete)
+			{
+				/*
+				 * Like in _bt_unlink_halfdead_page, we need an upper bound on
+				 * the xid of any transaction that could still hold a downlink
+				 * to this page. We use ReadNewTransactionId() instead of
+				 * GetCurrentTransactionId(), since we are in a VACUUM.
+				 */
+				TransactionId txid = ReadNewTransactionId();
+				int			i;
+
+				START_CRIT_SECTION();
+
+				/* Mark pages as deleted dropping references from internal pages */
+				for (i = 0; i < ntodelete; i++)
+				{
+					Page		leafPage = (Page) BufferGetPage(buftodelete[i]);
+					XLogRecPtr	recptr;
+
+					GistPageSetDeleteXid(leafPage,txid);
+
+					GistPageSetDeleted(leafPage);
+					MarkBufferDirty(buftodelete[i]);
+					stats->pages_deleted++;
+					vstate.emptyPages--;
+
+					MarkBufferDirty(buffer);
+					/* Offsets shift as we delete tuples from the internal page */
+					PageIndexTupleDelete(page, todelete[i] - i);
+
+					if (RelationNeedsWAL(rel))
+						recptr 	= gistXLogSetDeleted(rel->rd_node, buftodelete[i],
+													 txid, buffer, todelete[i] - i);
+					else
+						recptr = gistGetFakeLSN(rel);
+					PageSetLSN(page, recptr);
+					PageSetLSN(leafPage, recptr);
+
+					UnlockReleaseBuffer(buftodelete[i]);
+				}
+				END_CRIT_SECTION();
+			}
+
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+
+	bms_free(vstate.emptyLeafPagesMap);
+	bms_free(vstate.internalPagesMap);
 }
 
 /*
@@ -242,6 +375,7 @@ restart:
 	{
 		OffsetNumber todelete[MaxOffsetNumber];
 		int			ntodelete = 0;
+		int			nremain;
 		GISTPageOpaque opaque = GistPageGetOpaque(page);
 		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
 
@@ -309,8 +443,18 @@ restart:
 			maxoff = PageGetMaxOffsetNumber(page);
 		}
 
-		stats->num_index_tuples += maxoff - FirstOffsetNumber + 1;
-
+		nremain = maxoff - FirstOffsetNumber + 1;
+		if (nremain == 0)
+		{
+			vstate->emptyLeafPagesMap = bms_add_member(vstate->emptyLeafPagesMap, blkno);
+			vstate->emptyPages++;
+		}
+		else
+			stats->num_index_tuples += nremain;
+	}
+	else
+	{
+		vstate->internalPagesMap = bms_add_member(vstate->internalPagesMap, blkno);
 	}
 
 	UnlockReleaseBuffer(buffer);
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 01e025d5fdb..bb0fa473f5e 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -64,6 +64,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		GistPageSetDeleteXid(page, xldata->deleteXid);
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+
+	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		PageIndexTupleDelete(page, xldata->downlinkOffset);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+}
 /*
  * redo any page update (except page split)
  */
@@ -116,6 +149,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+
 			if (GistPageIsLeaf(page))
 				GistMarkTuplesDeleted(page);
 		}
@@ -535,6 +569,9 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -653,6 +690,29 @@ gistXLogSplit(bool page_is_leaf,
 	return recptr;
 }
 
+/*
+ * Write XLOG record describing a page delete. This also includes removal of
+ * downlink from internal page.
+ */
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid,
+					Buffer internalPageBuffer, OffsetNumber internalPageOffset) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.deleteXid = xid;
+	xlrec.downlinkOffset = internalPageOffset;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index b79ed1dfdc8..f65335ba23a 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -76,6 +76,9 @@ gist_identify(uint8 info)
 		case XLOG_GIST_CREATE_INDEX:
 			id = "CREATE_INDEX";
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			id = "PAGE_DELETE";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 8ce253c88df..29cfcd78984 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -258,6 +258,22 @@ bms_make_singleton(int x)
 	return result;
 }
 
+/*
+ * bms_make_empty - build an all-zeroes set with room for members 0..(size-1)
+ */
+Bitmapset *
+bms_make_empty(int size)
+{
+	Bitmapset  *result;
+	int			wordnum;
+
+	if (size < 0)
+		elog(ERROR, "negative bitmapset size not allowed");
+	wordnum = WORDNUM(size - 1);
+	result = (Bitmapset *) palloc0(BITMAPSET_SIZE(wordnum + 1));
+	return result;
+}
+
 /*
  * bms_free - free a bitmapset
  *
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 827566dc6e7..0dd2bf47c8c 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -151,6 +151,9 @@ typedef struct GISTENTRY
 #define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
 #define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
 
+#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
+#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val)
+
 /*
  * Vector of GISTENTRY structs; user-defined methods union and picksplit
  * take it as one of their arguments
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index a73716d6eaa..5d02800dac6 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -412,12 +412,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
 
+/* gistxlog.c */
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid, Buffer internalPageBuffer,
+					OffsetNumber internalPageOffset);
+
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
 			   Buffer leftchild);
 
-XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
 			   int ntodelete, RelFileNode hnode);
 
 extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index b67c7100500..3c71d0261a1 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,13 +17,15 @@
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 
+/* XLog stuff */
+
 #define XLOG_GIST_PAGE_UPDATE		0x00
 #define XLOG_GIST_DELETE			0x10 /* delete leaf index tuples for a page */
  /* #define XLOG_GIST_NEW_ROOT			 0x20 */	/* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		0x60
 
 /*
  * Backup Blk 0: updated page.
@@ -76,6 +78,12 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+	TransactionId deleteXid;	/* last Xid that could see this page in a scan */
+	OffsetNumber downlinkOffset;	/* offset of the downlink to this page */
+} gistxlogPageDelete;
+
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
 extern const char *gist_identify(uint8 info);
diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h
index 433df8a46d0..55435f9ae64 100644
--- a/src/include/nodes/bitmapset.h
+++ b/src/include/nodes/bitmapset.h
@@ -79,6 +79,7 @@ extern Bitmapset *bms_copy(const Bitmapset *a);
 extern bool bms_equal(const Bitmapset *a, const Bitmapset *b);
 extern int	bms_compare(const Bitmapset *a, const Bitmapset *b);
 extern Bitmapset *bms_make_singleton(int x);
+extern Bitmapset *bms_make_empty(int size);
 extern void bms_free(Bitmapset *a);
 
 extern Bitmapset *bms_union(const Bitmapset *a, const Bitmapset *b);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf2..5b92f08c747 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
 select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 vacuum analyze gist_point_tbl;
 -- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13c..e66396e851b 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
 
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 
 vacuum analyze gist_point_tbl;
-- 
2.19.2

Reply via email to