From 7bf56c30e5fa0cb2c7b10fb658ced3d9d69159a0 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 28 Apr 2020 19:12:41 -0700
Subject: [PATCH v2 1/3] Fix bug in nbtree VACUUM "skip full scan" feature.

Commit 857f9c36cda (which taught nbtree to skip VACUUM's full index scan
during cleanup of nbtree indexes in some situations) made nbtree VACUUM
store the oldest btpo_xact for the index as a whole in the nbtree
metapage.  This was based on the oldest observed value across every
deleted index page.  However, it failed to correctly handle pages
deleted by the current VACUUM operation.

The most immediate problem with the handling of pages deleted by the
current VACUUM operation was that the special area of the page was
examined without a buffer pin.  More fundamentally, the handling failed
to account for the full range of _bt_pagedel() behaviors.  For example,
_bt_pagedel() sometimes deletes internal pages in passing, as part of
deleting an entire subtree whose leaf level page is the page passed by
its btvacuumpage() caller.

To fix, push down the logic for maintaining the oldest btpo_xact to
_bt_pagedel().  btvacuumpage() is strictly responsible for pages that
were fully deleted by a previous VACUUM operation, while _bt_pagedel()
is strictly responsible for pages that are deleted by the current VACUUM
operation (which includes half-dead pages from a previous interrupted
VACUUM operation).

Discussion: https://postgr.es/m/CAH2-Wz=mWPBLZ2cr95cBV=kzPav8OOBtkHTfg+-+AUiozANK0w@mail.gmail.com
Discussion: https://postgr.es/m/CAH2-WzkLgyN3zBvRZ1pkNJThC=xi_0gpWRUb_45eexLH1+k2_Q@mail.gmail.com
Backpatch: 11-, where the "skip full scan" feature was introduced.
---
 src/include/access/nbtree.h         |   3 +-
 src/backend/access/nbtree/nbtpage.c | 134 ++++++++++++++++++++--------
 src/backend/access/nbtree/nbtree.c  |  85 ++++++++++--------
 3 files changed, 147 insertions(+), 75 deletions(-)
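
Notes for reviewers (this sits below the "---" cut line, so "git am" will
not include it in the commit message): the patch depends on two small
invariants that are easy to lose track of in the diff.  First, every site
that deletes a page, or that visits an already-deleted page, must fold that
page's btpo.xact into a running minimum for the index as a whole, because
each deleted page is stamped by a separate ReadNewTransactionId() call.
Second, once that minimum falls behind RecentGlobalXmin,
_bt_vacuum_needs_cleanup() stops skipping cleanup.  The standalone sketch
below distills both patterns, for illustration only -- it is not part of
the patch.  TransactionIdPrecedes() here is a simplified stand-in for the
real wraparound-aware comparison (the permanent-XID special cases are
omitted), and the two helper function names are hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t TransactionId;

#define InvalidTransactionId		((TransactionId) 0)
#define TransactionIdIsValid(xid)	((xid) != InvalidTransactionId)

/*
 * Simplified stand-in for PostgreSQL's wraparound-aware XID comparison:
 * compare via the signed difference of the 32-bit values.  The real
 * routine also special-cases permanent XIDs, which is omitted here.
 */
static bool
TransactionIdPrecedes(TransactionId id1, TransactionId id2)
{
	int32_t		diff = (int32_t) (id1 - id2);

	return diff < 0;
}

/*
 * Fold one more deleted page's btpo.xact into the running minimum for
 * the index as a whole.  Every deletion site must do this, because each
 * deleted page gets its own btpo.xact from a separate
 * ReadNewTransactionId() call -- which is why the patch pushes the logic
 * down into _bt_pagedel()/_bt_unlink_halfdead_page() for pages deleted
 * by the current VACUUM, while btvacuumpage() keeps doing it for pages
 * deleted by previous VACUUM operations.
 */
static void
maintain_oldest_btpo_xact(TransactionId *oldestBtpoXact,
						  TransactionId btpo_xact)
{
	if (!TransactionIdIsValid(*oldestBtpoXact) ||
		TransactionIdPrecedes(btpo_xact, *oldestBtpoXact))
		*oldestBtpoXact = btpo_xact;
}

/*
 * One of the triggers checked by _bt_vacuum_needs_cleanup(): a deleted
 * page can only be recycled once no running transaction could still
 * need to visit it, i.e. once its btpo.xact is older than
 * RecentGlobalXmin.
 */
static bool
cleanup_needed_for_recycling(TransactionId oldestBtpoXact,
							 TransactionId recentGlobalXmin)
{
	return TransactionIdIsValid(oldestBtpoXact) &&
		TransactionIdPrecedes(oldestBtpoXact, recentGlobalXmin);
}

int
main(void)
{
	TransactionId oldest = InvalidTransactionId;

	maintain_oldest_btpo_xact(&oldest, 1200);
	maintain_oldest_btpo_xact(&oldest, 1100);	/* new minimum */
	maintain_oldest_btpo_xact(&oldest, 1300);	/* no change */

	printf("oldest btpo.xact: %u\n", oldest);	/* prints 1100 */
	printf("cleanup needed: %d\n",
		   cleanup_needed_for_recycling(oldest, 1150));	/* prints 1 */

	return 0;
}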

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9a3acd26b7..7bccd7ecc3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1080,7 +1080,8 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 extern void _bt_delitems_delete(Relation rel, Buffer buf,
 								OffsetNumber *deletable, int ndeletable,
 								Relation heapRel);
-extern int	_bt_pagedel(Relation rel, Buffer buf);
+extern int	_bt_pagedel(Relation rel, Buffer leafbuf,
+						TransactionId *oldestBtpoXact);
 
 /*
  * prototypes for functions in nbtsearch.c
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index f5962f6163..c90a2012d5 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -35,9 +35,11 @@
 #include "utils/snapmgr.h"
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
-static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
+static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
+								   BTStack stack);
 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
-									 bool *rightsib_empty);
+									 bool *rightsib_empty,
+									 TransactionId *oldestBtpoXact);
 static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
 									 OffsetNumber *deletable, int ndeletable);
 static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
@@ -1470,27 +1472,37 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 }
 
 /*
- * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
  *
- * This action unlinks the page from the b-tree structure, removing all
+ * This action unlinks the leaf page from the b-tree structure, removing all
  * pointers leading to it --- but not touching its own left and right links.
  * The page cannot be physically reclaimed right away, since other processes
  * may currently be trying to follow links leading to the page; they have to
  * be allowed to use its right-link to recover.  See nbtree/README.
  *
  * On entry, the target buffer must be pinned and locked (either read or write
- * lock is OK).  This lock and pin will be dropped before exiting.
+ * lock is OK).  The page must be an empty leaf page, which may be half-dead
+ * already (a half-dead page should only be passed to us when an earlier
+ * VACUUM operation was interrupted, though).  Note in particular that caller
+ * should never pass a buffer containing an existing deleted page here.  The
+ * lock and pin on caller's buffer will be dropped before we return.
  *
  * Returns the number of pages successfully deleted (zero if page cannot
- * be deleted now; could be more than one if parent or sibling pages were
- * deleted too).
+ * be deleted now; could be more than one if parent or right sibling pages
+ * were deleted too).
+ *
+ * Maintains *oldestBtpoXact on caller's behalf for any pages that are
+ * deleted here (including caller's target page, as well as any internal
+ * pages or right sibling pages that we manage to delete).  Caller remains
+ * responsible for maintaining *oldestBtpoXact in the case of pages that
+ * were deleted by a previous VACUUM.
  *
  * NOTE: this leaks memory.  Rather than trying to clean up everything
  * carefully, it's better to run it in a temp context that can be reset
  * frequently.
  */
 int
-_bt_pagedel(Relation rel, Buffer buf)
+_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 {
 	int			ndeleted = 0;
 	BlockNumber rightsib;
@@ -1511,14 +1523,21 @@ _bt_pagedel(Relation rel, Buffer buf)
 
 	for (;;)
 	{
-		page = BufferGetPage(buf);
+		page = BufferGetPage(leafbuf);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
 		/*
 		 * Internal pages are never deleted directly, only as part of deleting
 		 * the whole branch all the way down to leaf level.
+		 *
+		 * Also check for deleted pages here.  Caller never passes us a fully
+		 * deleted page.  Only VACUUM can delete pages, so there can't have
+		 * been a concurrent deletion.  If we do encounter a deleted page
+		 * here, we must have reached it by following a sibling link, which
+		 * means that the index is corrupt.
 		 */
-		if (!P_ISLEAF(opaque))
+		Assert(!P_ISDELETED(opaque));
+		if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
 		{
 			/*
 			 * Pre-9.4 page deletion only marked internal pages as half-dead,
@@ -1537,13 +1556,22 @@ _bt_pagedel(Relation rel, Buffer buf)
 						 errmsg("index \"%s\" contains a half-dead internal page",
 								RelationGetRelationName(rel)),
 						 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
-			_bt_relbuf(rel, buf);
+
+			if (P_ISDELETED(opaque))
+				ereport(LOG,
+						(errcode(ERRCODE_INDEX_CORRUPTED),
+						 errmsg_internal("found deleted block %u while following right link in index \"%s\"",
+										 BufferGetBlockNumber(leafbuf),
+										 RelationGetRelationName(rel))));
+
+			_bt_relbuf(rel, leafbuf);
 			return ndeleted;
 		}
 
 		/*
 		 * We can never delete rightmost pages nor root pages.  While at it,
-		 * check that page is not already deleted and is empty.
+		 * check that the page is empty, since it's possible that the page was
+		 * empty a moment ago, but has since had some inserts.
 		 *
 		 * To keep the algorithm simple, we also never delete an incompletely
 		 * split page (they should be rare enough that this doesn't make any
@@ -1558,14 +1586,14 @@ _bt_pagedel(Relation rel, Buffer buf)
 		 * to.  On subsequent iterations, we know we stepped right from a page
 		 * that passed these tests, so it's OK.
 		 */
-		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
+		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
 			P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
 			P_INCOMPLETE_SPLIT(opaque))
 		{
 			/* Should never fail to delete a half-dead page */
 			Assert(!P_ISHALFDEAD(opaque));
 
-			_bt_relbuf(rel, buf);
+			_bt_relbuf(rel, leafbuf);
 			return ndeleted;
 		}
 
@@ -1603,7 +1631,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 				 * To avoid deadlocks, we'd better drop the leaf page lock
 				 * before going further.
 				 */
-				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
 
 				/*
 				 * Fetch the left sibling, to check that it's not marked with
@@ -1627,10 +1655,10 @@ _bt_pagedel(Relation rel, Buffer buf)
 					 * incompletely-split page to be split again.  So we don't
 					 * need to walk right here.
 					 */
-					if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
+					if (lopaque->btpo_next == BufferGetBlockNumber(leafbuf) &&
 						P_INCOMPLETE_SPLIT(lopaque))
 					{
-						ReleaseBuffer(buf);
+						ReleaseBuffer(leafbuf);
 						_bt_relbuf(rel, lbuf);
 						return ndeleted;
 					}
@@ -1646,16 +1674,26 @@ _bt_pagedel(Relation rel, Buffer buf)
 				_bt_relbuf(rel, lbuf);
 
 				/*
-				 * Re-lock the leaf page, and start over, to re-check that the
-				 * page can still be deleted.
+				 * Re-lock the leaf page, and start over, so that our stack
+				 * gets used within _bt_mark_page_halfdead().  We must start
+				 * over because it's possible that the page can no longer be
+				 * deleted; we need to recheck.
 				 */
-				LockBuffer(buf, BT_WRITE);
+				LockBuffer(leafbuf, BT_WRITE);
 				continue;
 			}
 
-			if (!_bt_mark_page_halfdead(rel, buf, stack))
+			/*
+			 * See if it's safe to delete the leaf page, and determine how
+			 * many parent/internal pages above the leaf level will be
+			 * deleted.  If it's safe then _bt_mark_page_halfdead will also
+			 * perform the first phase of deletion, including marking the leaf
+			 * page half-dead.
+			 */
+			Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
+			if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
 			{
-				_bt_relbuf(rel, buf);
+				_bt_relbuf(rel, leafbuf);
 				return ndeleted;
 			}
 		}
@@ -1663,23 +1701,32 @@ _bt_pagedel(Relation rel, Buffer buf)
 		/*
 		 * Then unlink it from its siblings.  Each call to
 		 * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
-		 * making it shallower.  Iterate until the leaf page is gone.
+		 * making it shallower.  Iterate until the leaf page is deleted.
+		 *
+		 * _bt_unlink_halfdead_page should never fail, since we established
+		 * that deletion is generally safe in _bt_mark_page_halfdead.
 		 */
 		rightsib_empty = false;
+		Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
 		while (P_ISHALFDEAD(opaque))
 		{
-			/* will check for interrupts, once lock is released */
-			if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
+			/* Check for interrupts in _bt_unlink_halfdead_page */
+			if (!_bt_unlink_halfdead_page(rel, leafbuf, &rightsib_empty,
+										  oldestBtpoXact))
 			{
-				/* _bt_unlink_halfdead_page already released buffer */
+				/* _bt_unlink_halfdead_page failed; it released the buffer */
 				return ndeleted;
 			}
 			ndeleted++;
 		}
 
+		Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
+		Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
+											*oldestBtpoXact));
+
 		rightsib = opaque->btpo_next;
 
-		_bt_relbuf(rel, buf);
+		_bt_relbuf(rel, leafbuf);
 
 		/*
 		 * Check here, as calling loops will have locks held, preventing
@@ -1705,7 +1752,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 		if (!rightsib_empty)
 			break;
 
-		buf = _bt_getbuf(rel, rightsib, BT_WRITE);
+		leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
 	}
 
 	return ndeleted;
@@ -1909,9 +1956,19 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  * of the whole branch, including the leaf page itself, iterate until the
  * leaf page is deleted.
  *
- * Returns 'false' if the page could not be unlinked (shouldn't happen).
- * If the (current) right sibling of the page is empty, *rightsib_empty is
- * set to true.
+ * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
+ * the right sibling of the current target page is empty, *rightsib_empty is
+ * set to true, allowing caller to delete the target's right sibling page in
+ * passing.  Note that caller only actually uses *rightsib_empty when the
+ * target page is leafbuf, following the final call here for leafbuf or for
+ * the subtree containing leafbuf.  (We always set *rightsib_empty for
+ * caller, just to be consistent.)
+ *
+ * We maintain *oldestBtpoXact for the VACUUM operation as a whole here.
+ * This is necessary because each deleted page is assigned its own btpo.xact
+ * by a separate ReadNewTransactionId() call, including when we delete a
+ * series of pages that are all part of the same empty subtree -- see
+ * comments about the underlying assumption below.
  *
  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
  * On success exit, we'll be holding pin and write lock.  On failure exit,
@@ -1919,7 +1976,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  * to avoid having to reacquire a lock we already released).
  */
 static bool
-_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
+_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty,
+						 TransactionId *oldestBtpoXact)
 {
 	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
 	BlockNumber leafleftsib;
@@ -2057,9 +2115,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		lbuf = InvalidBuffer;
 
 	/*
-	 * Next write-lock the target page itself.  It should be okay to take just
-	 * a write lock not a superexclusive lock, since no scans would stop on an
-	 * empty page.
+	 * Next write-lock the target page itself.  It's okay to take a write lock
+	 * rather than a superexclusive lock, since no scan will stop on an empty
+	 * page.
 	 */
 	LockBuffer(buf, BT_WRITE);
 	page = BufferGetPage(buf);
@@ -2204,6 +2262,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	 */
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
 	opaque->btpo_flags &= ~BTP_HALF_DEAD;
 	opaque->btpo_flags |= BTP_DELETED;
 	opaque->btpo.xact = ReadNewTransactionId();
@@ -2309,6 +2368,11 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		_bt_relbuf(rel, lbuf);
 	_bt_relbuf(rel, rbuf);
 
+	/* maintain oldestBtpoXact */
+	if (!TransactionIdIsValid(*oldestBtpoXact) ||
+		TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
+		*oldestBtpoXact = opaque->btpo.xact;
+
 	/*
 	 * Release the target, if it was not the leaf block.  The leaf is always
 	 * kept locked.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 36294789f3..65f7fd1945 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -92,7 +92,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
 
 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 						 IndexBulkDeleteCallback callback, void *callback_state,
-						 BTCycleId cycleid, TransactionId *oldestBtpoXact);
+						 BTCycleId cycleid);
 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
 						 BlockNumber orig_blkno);
 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
@@ -787,8 +787,16 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
 }
 
 /*
- * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
- *			btbulkdelete() wasn't called.
+ * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
+ *
+ * Called by btvacuumcleanup when btbulkdelete was never called, because no
+ * tuples needed to be deleted.
+ *
+ * When we return false, VACUUM can skip the cleanup-only call to
+ * btvacuumscan entirely (i.e. there will be no btvacuumscan call during the
+ * entire VACUUM).  Otherwise a cleanup-only btvacuumscan call is required;
+ * it may recycle already-deleted pages, and will gather accurate statistics
+ * for the index.
  */
 static bool
 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
@@ -815,8 +823,15 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
 								   RecentGlobalXmin))
 	{
 		/*
-		 * If oldest btpo.xact in the deleted pages is older than
-		 * RecentGlobalXmin, then at least one deleted page can be recycled.
+		 * If the oldest btpo.xact among the previously deleted pages in the
+		 * index is older than RecentGlobalXmin, then at least one deleted
+		 * page can be recycled -- don't skip cleanup.
+		 *
+		 * Note that btvacuumpage currently doesn't make any effort to
+		 * recognize when a recycled page is already in the FSM (i.e. put
+		 * there by a previous VACUUM).  We have to be conservative because
+		 * the FSM isn't crash safe.  Hopefully recycled pages get reused
+		 * before too long.
 		 */
 		result = true;
 	}
@@ -873,20 +888,9 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	/* The ENSURE stuff ensures we clean up shared memory on failure */
 	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 	{
-		TransactionId oldestBtpoXact;
-
 		cycleid = _bt_start_vacuum(rel);
 
-		btvacuumscan(info, stats, callback, callback_state, cycleid,
-					 &oldestBtpoXact);
-
-		/*
-		 * Update cleanup-related information in metapage. This information is
-		 * used only for cleanup but keeping them up to date can avoid
-		 * unnecessary cleanup even after bulkdelete.
-		 */
-		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
-									 info->num_heap_tuples);
+		btvacuumscan(info, stats, callback, callback_state, cycleid);
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 	_bt_end_vacuum(rel);
@@ -918,18 +922,12 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	 */
 	if (stats == NULL)
 	{
-		TransactionId oldestBtpoXact;
-
 		/* Check if we need a cleanup */
 		if (!_bt_vacuum_needs_cleanup(info))
 			return NULL;
 
 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-		btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
-
-		/* Update cleanup-related information in the metapage */
-		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
-									 info->num_heap_tuples);
+		btvacuumscan(info, stats, NULL, NULL, 0);
 	}
 
 	/*
@@ -954,7 +952,9 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  * according to the vacuum callback, looking for empty pages that can be
  * deleted, and looking for old deleted pages that can be recycled.  Both
  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
- * btbulkdelete call occurred).
+ * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
+ * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
+ * is maintained.
  *
  * The caller is responsible for initially allocating/zeroing a stats struct
  * and for obtaining a vacuum cycle ID if necessary.
@@ -962,7 +962,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 static void
 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 			 IndexBulkDeleteCallback callback, void *callback_state,
-			 BTCycleId cycleid, TransactionId *oldestBtpoXact)
+			 BTCycleId cycleid)
 {
 	Relation	rel = info->index;
 	BTVacState	vstate;
@@ -1046,6 +1046,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 	MemoryContextDelete(vstate.pagedelcontext);
 
+	/*
+	 * Update the oldest btpo.xact for the index as a whole in the index
+	 * metapage, based on the pages we deleted and any existing deleted
+	 * pages we encountered.  We also maintain the number of heap tuples
+	 * from the last VACUUM that actually called btvacuumscan() here.
+	 */
+	_bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
+								 info->num_heap_tuples);
+
 	/*
 	 * If we found any recyclable pages (and recorded them in the FSM), then
 	 * forcibly update the upper-level FSM pages to ensure that searchers can
@@ -1064,9 +1073,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	/* update statistics */
 	stats->num_pages = num_pages;
 	stats->pages_free = vstate.totFreePages;
-
-	if (oldestBtpoXact)
-		*oldestBtpoXact = vstate.oldestBtpoXact;
 }
 
 /*
@@ -1137,24 +1143,30 @@ restart:
 	/* Page is valid, see what to do with it */
 	if (_bt_page_recyclable(page))
 	{
-		/* Okay to recycle this page */
+		/* Okay to recycle this page (which could be leaf or internal) */
 		RecordFreeIndexPage(rel, blkno);
 		vstate->totFreePages++;
 		stats->pages_deleted++;
 	}
 	else if (P_ISDELETED(opaque))
 	{
-		/* Already deleted, but can't recycle yet */
+		/*
+		 * Already deleted page (which could be leaf or internal).  Can't
+		 * recycle yet.
+		 */
 		stats->pages_deleted++;
 
-		/* Update the oldest btpo.xact */
+		/* Maintain the oldest btpo.xact */
 		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
 			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
 			vstate->oldestBtpoXact = opaque->btpo.xact;
 	}
 	else if (P_ISHALFDEAD(opaque))
 	{
-		/* Half-dead, try to delete */
+		/*
+		 * Half-dead leaf page.  Try to delete now.  Might update
+		 * oldestBtpoXact and pages_deleted below.
+		 */
 		delete_now = true;
 	}
 	else if (P_ISLEAF(opaque))
@@ -1357,16 +1369,11 @@ restart:
 		MemoryContextReset(vstate->pagedelcontext);
 		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
 
-		ndel = _bt_pagedel(rel, buf);
+		ndel = _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
 
 		/* count only this page, else may double-count parent */
 		if (ndel)
-		{
 			stats->pages_deleted++;
-			if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
-				TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
-				vstate->oldestBtpoXact = opaque->btpo.xact;
-		}
 
 		MemoryContextSwitchTo(oldcontext);
 		/* pagedel released buffer, so we shouldn't */
-- 
2.25.1

