From 4f5368e7a5e39269101b5a74b251aab8ba09ff7e Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 28 Apr 2020 19:12:41 -0700
Subject: [PATCH v2 3/3] Refactor btvacuumpage().

btvacuumscan() visits nbtree pages in physical order by calling
btvacuumpage() for each block.  It is occasionally necessary for an
individual call to btvacuumpage() to "backtrack" to a block already
visited in the outermost btvacuumscan() loop.  This is necessary to
avoid overlooking the right half page from a page split that used a
block that VACUUM already passed over (this happens when the new page
uses a block that came from the FSM).

Remove one of the arguments to btvacuumpage(), and give up on the idea
that it's a recursive function.  Advertising btvacuumpage() as a
recursive function seems to make it more complicated for several
reasons.  In reality the function always simulated recursion with a
loop, without ever actually calling itself.  This wasn't just necessary
as a precaution as implied by the comments, though.  There is no
reliable natural limit on the number of times we can backtrack, so true
recursion was and is totally unworkable.

There are important behavioral difference when "recursing"/backtracking,
mostly related to page deletion.  That's what clinches it.  We don't
perform page deletion when backtracking due to the extra complexity.
And when we recurse, we're not performing a physical order scan anymore,
so we expect fairly different conditions to hold for the page -- we
backtrack because the page is the right half of a page split that
occurred since the VACUUM operation began.

Recognizing that backtracking is quite different makes it possible to be
more particular about the page that we backtrack to.  We now log cases
where backtracking fails to find a valid leaf right sibling page as
corruption.

Note that this commit uses the same variable name in btvacuumscan() and
btvacuumpage() as were used in the commit that fixed the under-counting
issue.
---
 src/backend/access/nbtree/nbtinsert.c |   3 +
 src/backend/access/nbtree/nbtree.c    | 118 ++++++++++++++------------
 2 files changed, 69 insertions(+), 52 deletions(-)

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index a70b64d964..99827b457b 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1702,6 +1702,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 	 * Finish off remaining leftpage special area fields.  They cannot be set
 	 * before both origpage (leftpage) and rightpage buffers are acquired and
 	 * locked.
+	 *
+	 * btpo_cycleid is only used with leaf pages, though we set it here in all
+	 * cases just to be consistent.
 	 */
 	lopaque->btpo_next = rightpagenumber;
 	lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b20a592808..699d5a8d42 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -93,8 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 						 IndexBulkDeleteCallback callback, void *callback_state,
 						 BTCycleId cycleid);
-static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
-						 BlockNumber orig_blkno);
+static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
 										  IndexTuple posting,
 										  OffsetNumber updatedoffset,
@@ -967,7 +966,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Relation	rel = info->index;
 	BTVacState	vstate;
 	BlockNumber num_pages;
-	BlockNumber blkno;
+	BlockNumber scanblkno;
 	bool		needLock;
 
 	/*
@@ -1017,7 +1016,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 */
 	needLock = !RELATION_IS_LOCAL(rel);
 
-	blkno = BTREE_METAPAGE + 1;
+	scanblkno = BTREE_METAPAGE + 1;
 	for (;;)
 	{
 		/* Get the current relation length */
@@ -1032,15 +1031,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 										 num_pages);
 
 		/* Quit if we've scanned the whole relation */
-		if (blkno >= num_pages)
+		if (scanblkno >= num_pages)
 			break;
 		/* Iterate over pages, then loop back to recheck length */
-		for (; blkno < num_pages; blkno++)
+		for (; scanblkno < num_pages; scanblkno++)
 		{
-			btvacuumpage(&vstate, blkno, blkno);
+			btvacuumpage(&vstate, scanblkno);
 			if (info->report_progress)
 				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
-											 blkno);
+											 scanblkno);
 		}
 	}
 
@@ -1078,31 +1077,42 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 /*
  * btvacuumpage --- VACUUM one page
  *
- * This processes a single page for btvacuumscan().  In some cases we
- * must go back and re-examine previously-scanned pages; this routine
- * recurses when necessary to handle that case.
+ * This processes a single page for btvacuumscan() (the page from caller's
+ * scanblkno argument).
  *
- * blkno is the page to process.  orig_blkno is the highest block number
- * reached by the outer btvacuumscan loop (the same as blkno, unless we
- * are recursing to re-examine a previous page).
+ * In some cases we must backtrack to re-examine and VACUUM pages
+ * previously-scanned as the scanblkno during a previous call here.  This
+ * happens when there was a leaf page split that happened since the VACUUM
+ * operation began.
+ *
+ * When we delete the scanblkno page, we may also delete other pages in
+ * passing.  This is needed when page deletion must delete the parent of the
+ * scanblkno page.  In general, the parent page could be located at any block
+ * in the index (it may have already been scanblkno in a previous call here,
+ * or may yet become scanblkno is a future call here).  Note that we never
+ * delete a page when we're backtracking (though we may do so when we're about
+ * to backtrack).
  */
 static void
-btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
+btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
 {
 	IndexVacuumInfo *info = vstate->info;
 	IndexBulkDeleteResult *stats = vstate->stats;
 	IndexBulkDeleteCallback callback = vstate->callback;
 	void	   *callback_state = vstate->callback_state;
 	Relation	rel = info->index;
-	bool		delete_now;
-	BlockNumber recurse_to;
+	bool		attempt_pagedel;
+	BlockNumber blkno;
+	BlockNumber backtrack_to;
 	Buffer		buf;
 	Page		page;
 	BTPageOpaque opaque = NULL;
 
-restart:
-	delete_now = false;
-	recurse_to = P_NONE;
+	blkno = scanblkno;
+
+backtrack:
+	attempt_pagedel = false;
+	backtrack_to = P_NONE;
 
 	/* call vacuum_delay_point while not holding any buffer lock */
 	vacuum_delay_point();
@@ -1124,17 +1134,23 @@ restart:
 	}
 
 	/*
-	 * If we are recursing, the only case we want to do anything with is a
-	 * live leaf page having the current vacuum cycle ID.  Any other state
-	 * implies we already saw the page (eg, deleted it as being empty).
+	 * If we are backtracking, the only case we want to do anything with is
+	 * a live leaf page having the current vacuum cycle ID.  Any other state
+	 * implies that the index is corrupt.
 	 */
-	if (blkno != orig_blkno)
+	if (blkno != scanblkno)
 	{
 		if (_bt_page_recyclable(page) ||
 			P_IGNORE(opaque) ||
 			!P_ISLEAF(opaque) ||
 			opaque->btpo_cycleid != vstate->cycleid)
 		{
+			Assert(false);
+			ereport(LOG,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg_internal("found invalid block %u while following right link from block %u in index \"%s\"",
+									 blkno, scanblkno,
+									 RelationGetRelationName(rel))));
 			_bt_relbuf(rel, buf);
 			return;
 		}
@@ -1167,7 +1183,7 @@ restart:
 		 * Half-dead leaf page.  Try to delete now.  Might update
 		 * oldestBtpoXact and pages_deleted below.
 		 */
-		delete_now = true;
+		attempt_pagedel = true;
 	}
 	else if (P_ISLEAF(opaque))
 	{
@@ -1191,18 +1207,24 @@ restart:
 		LockBufferForCleanup(buf);
 
 		/*
-		 * Check whether we need to recurse back to earlier pages.  What we
+		 * Check whether we need to backtrack to earlier pages.  What we
 		 * are concerned about is a page split that happened since we started
-		 * the vacuum scan.  If the split moved some tuples to a lower page
-		 * then we might have missed 'em.  If so, set up for tail recursion.
-		 * (Must do this before possibly clearing btpo_cycleid below!)
+		 * the vacuum scan.  If the split moved tuples on the right half of
+		 * the split (i.e. the tuples that sort high) to a new block that we
+		 * already passed over, then we need to backtrack now.  Must do this
+		 * before possibly clearing btpo_cycleid below!
+		 *
+		 * Note that we don't need to specifically consider the left half page
+		 * of any page splits that have occured since the start of the VACUUM
+		 * scan.  The left half is stored in the same block as the original
+		 * page, so it can't get overlooked.
 		 */
 		if (vstate->cycleid != 0 &&
 			opaque->btpo_cycleid == vstate->cycleid &&
 			!(opaque->btpo_flags & BTP_SPLIT_END) &&
 			!P_RIGHTMOST(opaque) &&
-			opaque->btpo_next < orig_blkno)
-			recurse_to = opaque->btpo_next;
+			opaque->btpo_next < scanblkno)
+			backtrack_to = opaque->btpo_next;
 
 		/*
 		 * When each VACUUM begins, it determines an OldestXmin cutoff value.
@@ -1313,7 +1335,7 @@ restart:
 		 */
 		if (ndeletable > 0 || nupdatable > 0)
 		{
-			Assert(nhtidsdead >= Max(ndeletable, 1));
+			Assert(nhtidsdead >= ndeletable + nupdatable);
 			_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
 								nupdatable);
 
@@ -1328,11 +1350,10 @@ restart:
 		else
 		{
 			/*
-			 * If the leaf page has been split during this vacuum cycle, it
-			 * seems worth expending a write to clear btpo_cycleid even if we
-			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
-			 * takes care of this.)  This ensures we won't process the page
-			 * again.
+			 * If the leaf page has been split during this vacuum cycle, clear
+			 * btpo_cycleid,  even though we don't have any deletions to do.
+			 * (If we do, _bt_delitems_vacuum takes care of this.)  This
+			 * ensures we won't process the page again.
 			 *
 			 * We treat this like a hint-bit update because there's no need to
 			 * WAL-log it.
@@ -1349,21 +1370,21 @@ restart:
 		/*
 		 * If the leaf page is now empty, try to delete; else count the live
 		 * tuples (live table TIDs in posting lists are counted as separate
-		 * live tuples).  We don't delete when recursing, though, to avoid
+		 * live tuples).  We don't delete when backtracking, though, to avoid
 		 * putting entries into freePages out-of-order.  Doesn't seem worth
 		 * any extra code to handle the case, since it's very unlikely that
 		 * we'll be able to delete a leaf page that's the right half of a page
 		 * split that occurred after the current VACUUM operation began.
 		 */
 		if (minoff > maxoff)
-			delete_now = (blkno == orig_blkno);
+			attempt_pagedel = (blkno == scanblkno);
 		else
 			stats->num_index_tuples += nhtidslive;
 
-		Assert(!delete_now || nhtidslive == 0);
+		Assert(!attempt_pagedel || nhtidslive == 0);
 	}
 
-	if (delete_now)
+	if (attempt_pagedel)
 	{
 		MemoryContext oldcontext;
 
@@ -1376,6 +1397,7 @@ restart:
 		 * any page that a future call here from btvacuumscan is expected to
 		 * count.  There will be no double-counting.
 		 */
+		Assert(blkno == scanblkno);
 		stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
 
 		MemoryContextSwitchTo(oldcontext);
@@ -1384,18 +1406,10 @@ restart:
 	else
 		_bt_relbuf(rel, buf);
 
-	/*
-	 * This is really tail recursion, but if the compiler is too stupid to
-	 * optimize it as such, we'd eat an uncomfortably large amount of stack
-	 * space per recursion level (due to the arrays used to track details of
-	 * deletable/updatable items).  A failure is improbable since the number
-	 * of levels isn't likely to be large ...  but just in case, let's
-	 * hand-optimize into a loop.
-	 */
-	if (recurse_to != P_NONE)
+	if (backtrack_to != P_NONE)
 	{
-		blkno = recurse_to;
-		goto restart;
+		blkno = backtrack_to;
+		goto backtrack;
 	}
 }
 
-- 
2.25.1

