From ec5e4a907a71427540581003d81caa29486b337e Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 28 Apr 2020 19:12:41 -0700
Subject: [PATCH v2 2/3] Fix undercounting in VACUUM VERBOSE output.

The logic for determining how many nbtree pages are deleted in an index
must compensate for the fact that pages deleted by the current VACUUM
might be visited a second time by the outermost btvacuumscan() physical
order scan.  btvacuumpage() avoided double-counting by counting at most
one page as deleted per _bt_pagedel() call, regardless of how many pages
_bt_pagedel() reported deleting.

This approach was better than simply accepting the return value of
_bt_pagedel() uncritically, but it was still slightly wrong.  It failed
to account for the fact that _bt_pagedel() can legitimately delete pages
that the physical order scan will not visit again.  This resulted in
undercounting in VACUUM VERBOSE output: the "%u index pages have been
deleted" figure often failed to include a small number of deleted
internal pages (as well as empty cousin leaf pages that couldn't be
deleted in an earlier _bt_pagedel() call due to the "rightmost page of a
parent" restriction).

Fix the accounting by teaching _bt_pagedel() about its caller's
requirements.  It now counts only those deleted pages that it knows the
caller won't visit again, and leaves out pages that the caller will
visit (and count) when the physical order scan reaches them later on.
btvacuumpage() can now completely trust _bt_pagedel()'s return value.
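
For illustration only (not part of the patch): a minimal standalone
sketch of the counting rule that _bt_pagedel() now follows.  The names
used here (count_deleted, the example block numbers) are hypothetical.
Deleted blocks whose number is <= the scanblkno that the physical order
scan is currently positioned on are counted right away; higher-numbered
blocks are left for the later btvacuumpage() calls that will visit them.

    #include <stdio.h>

    typedef unsigned int BlockNumber;

    /*
     * Count only the deleted blocks that the physical order scan will
     * not visit again.  Blocks beyond scanblkno are skipped here; the
     * scan reaches them later and counts them at that point.
     */
    static unsigned int
    count_deleted(BlockNumber scanblkno, const BlockNumber *deleted,
                  int ndeleted)
    {
        unsigned int counted = 0;

        for (int i = 0; i < ndeleted; i++)
        {
            if (deleted[i] <= scanblkno)
                counted++;
        }
        return counted;
    }

    int
    main(void)
    {
        /* scan is at block 10; the deletion also removed blocks 7 and 12 */
        BlockNumber deleted[] = {7, 10, 12};

        /* prints 2: block 12 gets counted by a later btvacuumpage() call */
        printf("counted now: %u\n", count_deleted(10, deleted, 3));
        return 0;
    }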
---
 src/include/access/nbtree.h         |  4 +--
 src/backend/access/nbtree/nbtpage.c | 42 ++++++++++++++++++++++-------
 src/backend/access/nbtree/nbtree.c  | 33 ++++++++++++-----------
 3 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7bccd7ecc3..97c96d0a3a 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1080,8 +1080,8 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 extern void _bt_delitems_delete(Relation rel, Buffer buf,
 								OffsetNumber *deletable, int ndeletable,
 								Relation heapRel);
-extern int	_bt_pagedel(Relation rel, Buffer leafbuf,
-						TransactionId *oldestBtpoXact);
+extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
+						  TransactionId *oldestBtpoXact);
 
 /*
  * prototypes for functions in nbtsearch.c
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index c90a2012d5..b65ce4db32 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -38,8 +38,10 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
 static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
 								   BTStack stack);
 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
+									 BlockNumber scanblkno,
 									 bool *rightsib_empty,
-									 TransactionId *oldestBtpoXact);
+									 TransactionId *oldestBtpoXact,
+									 uint32 *ndeleted);
 static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
 									 OffsetNumber *deletable, int ndeletable);
 static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
@@ -1489,7 +1491,10 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
  *
  * Returns the number of pages successfully deleted (zero if page cannot
  * be deleted now; could be more than one if parent or right sibling pages
- * were deleted too).
+ * were deleted too).  The final count is calculated in a way that helps our
+ * caller to avoid double-counting: when we know that the outermost
+ * btvacuumscan physical-order scan of the index relation will go on to visit
+ * a page that we delete now, we don't count that page in the return value.
  *
  * Maintains *oldestBtpoXact for caller for any pages that are deleted here
  * (including caller's target page and any other internal pages or right
@@ -1501,15 +1506,22 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
  * carefully, it's better to run it in a temp context that can be reset
  * frequently.
  */
-int
+uint32
 _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 {
-	int			ndeleted = 0;
+	uint32		ndeleted = 0;
 	BlockNumber rightsib;
 	bool		rightsib_empty;
 	Page		page;
 	BTPageOpaque opaque;
 
+	/*
+	 * Save the original leafbuf block number from the caller.  Only deleted
+	 * blocks that are <= scanblkno get counted in the ndeleted return value.
+	 * This helps the caller avoid double-counting deleted pages.
+	 */
+	BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
+
 	/*
 	 * "stack" is a search stack leading (approximately) to the target page.
 	 * It is initially NULL, but when iterating, we keep it to avoid
@@ -1560,8 +1572,9 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 			if (P_ISDELETED(opaque))
 				ereport(LOG,
 						(errcode(ERRCODE_INDEX_CORRUPTED),
-						 errmsg_internal("found deleted block %u while following right link in index \"%s\"",
+						 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
 										 BufferGetBlockNumber(leafbuf),
+										 scanblkno,
 										 RelationGetRelationName(rel))));
 
 			_bt_relbuf(rel, leafbuf);
@@ -1711,13 +1724,13 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 		while (P_ISHALFDEAD(opaque))
 		{
 			/* Check for interrupts in _bt_unlink_halfdead_page */
-			if (!_bt_unlink_halfdead_page(rel, leafbuf, &rightsib_empty,
-										  oldestBtpoXact))
+			if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
+										  &rightsib_empty, oldestBtpoXact,
+										  &ndeleted))
 			{
 				/* _bt_unlink_halfdead_page failed, released buffer */
 				return ndeleted;
 			}
-			ndeleted++;
 		}
 
 		Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
@@ -1976,8 +1989,9 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  * to avoid having to reacquire a lock we already released).
  */
 static bool
-_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty,
-						 TransactionId *oldestBtpoXact)
+_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
+						 bool *rightsib_empty, TransactionId *oldestBtpoXact,
+						 uint32 *ndeleted)
 {
 	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
 	BlockNumber leafleftsib;
@@ -2373,6 +2387,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty,
 		TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
 		*oldestBtpoXact = opaque->btpo.xact;
 
+	/*
+	 * If btvacuumscan won't revisit this page in a future btvacuumpage call
+	 * and count it as deleted then, count it as deleted by the current
+	 * btvacuumpage call now.
+	 */
+	if (target <= scanblkno)
+		(*ndeleted)++;
+
 	/*
 	 * Release the target, if it was not the leaf block.  The leaf is always
 	 * kept locked.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 65f7fd1945..b20a592808 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1328,10 +1328,11 @@ restart:
 		else
 		{
 			/*
-			 * If the page has been split during this vacuum cycle, it seems
-			 * worth expending a write to clear btpo_cycleid even if we don't
-			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
-			 * care of this.)  This ensures we won't process the page again.
+			 * If the leaf page has been split during this vacuum cycle, it
+			 * seems worth expending a write to clear btpo_cycleid even if we
+			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
+			 * takes care of this.)  This ensures we won't process the page
+			 * again.
 			 *
 			 * We treat this like a hint-bit update because there's no need to
 			 * WAL-log it.
@@ -1346,11 +1347,13 @@ restart:
 		}
 
 		/*
-		 * If it's now empty, try to delete; else count the live tuples (live
-		 * table TIDs in posting lists are counted as separate live tuples).
-		 * We don't delete when recursing, though, to avoid putting entries
-		 * into freePages out-of-order (doesn't seem worth any extra code to
-		 * handle the case).
+		 * If the leaf page is now empty, try to delete; else count the live
+		 * tuples (live table TIDs in posting lists are counted as separate
+		 * live tuples).  We don't delete when recursing, though, to avoid
+		 * putting entries into freePages out-of-order.  Doesn't seem worth
+		 * any extra code to handle the case, since it's very unlikely that
+		 * we'll be able to delete a leaf page that's the right half of a page
+		 * split that occurred after the current VACUUM operation began.
 		 */
 		if (minoff > maxoff)
 			delete_now = (blkno == orig_blkno);
@@ -1363,17 +1366,17 @@ restart:
 	if (delete_now)
 	{
 		MemoryContext oldcontext;
-		int			ndel;
 
 		/* Run pagedel in a temp context to avoid memory leakage */
 		MemoryContextReset(vstate->pagedelcontext);
 		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
 
-		ndel = _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
-
-		/* count only this page, else may double-count parent */
-		if (ndel)
-			stats->pages_deleted++;
+		/*
+		 * We trust the _bt_pagedel return value because it does not include
+		 * any page that a future call here from btvacuumscan is expected to
+		 * count.  There will be no double-counting.
+		 */
+		stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
 
 		MemoryContextSwitchTo(oldcontext);
 		/* pagedel released buffer, so we shouldn't */
-- 
2.25.1

