From 7b8f1f4e2b9b8c3067f9c4750005d3f31d2256ef Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 11 Jan 2023 18:06:51 -0800
Subject: [PATCH v4] Tighten up VACUUM's approach to setting VM bits.

One of the calls to visibilitymap_set() during VACUUM's initial heap
pass could theoretically set a page's all-frozen bit without also
setting the page's all-visible bit, leaving the visibility map in an
inconsistent state.  The problem was the tacit assumption that the
lazy_scan_heap local variable all_visible_according_to_vm could not
become stale without that being detected afterwards, via the prunestate
all_visible/all_frozen fields being set false by lazy_scan_prune.
However, prunestate only indicates whether the page is now eligible to
become all-visible, which doesn't reliably indicate anything about the
state of the page immediately before it was scanned -- including in
cases where all_visible_according_to_vm happened to be set from earlier
on.  In other words, lazy_scan_heap failed to account for the fact that
a page that was set all-visible in the VM in the recent past may not
still be all-visible in the VM by now, even when it's eligible to become
all-visible now.

In practice there doesn't seem to be a concrete scenario in which this
is possible.  It was almost possible in scenarios involving concurrent
HOT updates from transactions that abort, but (unlike pruning) freezing
can never remove an XID that's > VACUUM's OldestXmin, even for an XID
that is known to have aborted, which was protective here.

Tighten up the way that visibilitymap_set() is called: request that both
the all-visible and all-frozen bits get set whenever the all-frozen bit
is set, regardless of what we think we know about the present state of
the all-visible bit.  Also make sure that the page level PD_ALL_VISIBLE
flag is set in the same code path.

These issues have been around since commit a892234f83, which added the
all-frozen bit to the VM fork.

In passing, stop reading the existing state of the VM when setting the
VM in VACUUM's final heap pass.  The existing state of the visibility
map is irrelevant, since pages with LP_DEAD items are not all-visible
(or all-frozen) by definition, and we already know that each affected
page must have had at least one LP_DEAD item before we set it to
LP_UNUSED.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CAH2-WznuNGSzF8v6OsgjaC5aYsb3cZ6HW6MLm30X0d65cmSH6A@mail.gmail.com
---
 src/backend/access/heap/vacuumlazy.c    | 83 ++++++++++++++++---------
 src/backend/access/heap/visibilitymap.c |  9 ++-
 2 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 369451516..81698d0d3 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -260,7 +260,7 @@ static void lazy_vacuum(LVRelState *vacrel);
 static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
 static int	lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
-								  Buffer buffer, int index, Buffer *vmbuffer);
+								  Buffer buffer, int index, Buffer vmbuffer);
 static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
@@ -1040,7 +1040,7 @@ lazy_scan_heap(LVRelState *vacrel)
 			{
 				Size		freespace;
 
-				lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
+				lazy_vacuum_heap_page(vacrel, blkno, buf, 0, vmbuffer);
 
 				/* Forget the LP_DEAD items that we just vacuumed */
 				dead_items->num_items = 0;
@@ -1092,7 +1092,10 @@ lazy_scan_heap(LVRelState *vacrel)
 			uint8		flags = VISIBILITYMAP_ALL_VISIBLE;
 
 			if (prunestate.all_frozen)
+			{
+				Assert(!TransactionIdIsValid(prunestate.visibility_cutoff_xid));
 				flags |= VISIBILITYMAP_ALL_FROZEN;
+			}
 
 			/*
 			 * It should never be the case that the visibility map page is set
@@ -1120,8 +1123,8 @@ lazy_scan_heap(LVRelState *vacrel)
 		 * got cleared after lazy_scan_skip() was called, so we must recheck
 		 * with buffer lock before concluding that the VM is corrupt.
 		 */
-		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
-				 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
+		else if (all_visible_according_to_vm && !PageIsAllVisible(page) &&
+				 visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0)
 		{
 			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
 				 vacrel->relname, blkno);
@@ -1164,12 +1167,27 @@ lazy_scan_heap(LVRelState *vacrel)
 				 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
 		{
 			/*
-			 * We can pass InvalidTransactionId as the cutoff XID here,
-			 * because setting the all-frozen bit doesn't cause recovery
-			 * conflicts.
+			 * Avoid relying on all_visible_according_to_vm as a proxy for the
+			 * page-level PD_ALL_VISIBLE bit being set, since it might have
+			 * become stale -- even when all_visible is set in prunestate
 			 */
+			if (!PageIsAllVisible(page))
+			{
+				PageSetAllVisible(page);
+				MarkBufferDirty(buf);
+			}
+
+			/*
+			 * Set the page all-frozen (and all-visible) in the VM.
+			 *
+			 * We can pass InvalidTransactionId as our visibility_cutoff_xid,
+			 * since a snapshotConflictHorizon sufficient to make everything
+			 * safe for REDO was logged when the page's tuples were frozen.
+			 */
+			Assert(!TransactionIdIsValid(prunestate.visibility_cutoff_xid));
 			visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
 							  vmbuffer, InvalidTransactionId,
+							  VISIBILITYMAP_ALL_VISIBLE	|
 							  VISIBILITYMAP_ALL_FROZEN);
 		}
 
@@ -1311,7 +1329,11 @@ lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block,
 
 		/* DISABLE_PAGE_SKIPPING makes all skipping unsafe */
 		if (!vacrel->skipwithvm)
+		{
+			/* Caller shouldn't rely on all_visible_according_to_vm */
+			*next_unskippable_allvis = false;
 			break;
+		}
 
 		/*
 		 * Aggressive VACUUM caller can't skip pages just because they are
@@ -1818,7 +1840,11 @@ retry:
 			 * cutoff by stepping back from OldestXmin.
 			 */
 			if (prunestate->all_visible && prunestate->all_frozen)
+			{
+				/* Using same cutoff when setting VM is now unnecessary */
 				snapshotConflictHorizon = prunestate->visibility_cutoff_xid;
+				prunestate->visibility_cutoff_xid = InvalidTransactionId;
+			}
 			else
 			{
 				/* Avoids false conflicts when hot_standby_feedback in use */
@@ -2417,10 +2443,18 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 
 		blkno = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]);
 		vacrel->blkno = blkno;
+
+		/*
+		 * Pin the visibility map page in case we need to mark the page
+		 * all-visible.  In most cases this will be very cheap, because we'll
+		 * already have the correct page pinned anyway.
+		 */
+		visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
+
 		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 								 vacrel->bstrategy);
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-		index = lazy_vacuum_heap_page(vacrel, blkno, buf, index, &vmbuffer);
+		index = lazy_vacuum_heap_page(vacrel, blkno, buf, index, vmbuffer);
 
 		/* Now that we've vacuumed the page, record its available space */
 		page = BufferGetPage(buf);
@@ -2457,7 +2491,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
  *						  vacrel->dead_items array.
  *
  * Caller must have an exclusive buffer lock on the buffer (though a full
- * cleanup lock is also acceptable).
+ * cleanup lock is also acceptable).  vmbuffer must be valid and already have
+ * a pin on blkno's visibility map page.
  *
  * index is an offset into the vacrel->dead_items array for the first listed
  * LP_DEAD item on the page.  The return value is the first index immediately
@@ -2465,7 +2500,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
  */
 static int
 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
-					  int index, Buffer *vmbuffer)
+					  int index, Buffer vmbuffer)
 {
 	VacDeadItems *dead_items = vacrel->dead_items;
 	Page		page = BufferGetPage(buffer);
@@ -2546,31 +2581,21 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 	 * dirty, exclusively locked, and, if needed, a full page image has been
 	 * emitted.
 	 */
+	Assert(!PageIsAllVisible(page));
 	if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
 								 &all_frozen))
-		PageSetAllVisible(page);
-
-	/*
-	 * All the changes to the heap page have been done. If the all-visible
-	 * flag is now set, also set the VM all-visible bit (and, if possible, the
-	 * all-frozen bit) unless this has already been done previously.
-	 */
-	if (PageIsAllVisible(page))
 	{
-		uint8		flags = 0;
-		uint8		vm_status = visibilitymap_get_status(vacrel->rel,
-														 blkno, vmbuffer);
+		uint8		flags = VISIBILITYMAP_ALL_VISIBLE;
 
-		/* Set the VM all-frozen bit to flag, if needed */
-		if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
-			flags |= VISIBILITYMAP_ALL_VISIBLE;
-		if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
+		if (all_frozen)
+		{
+			Assert(!TransactionIdIsValid(visibility_cutoff_xid));
 			flags |= VISIBILITYMAP_ALL_FROZEN;
+		}
 
-		Assert(BufferIsValid(*vmbuffer));
-		if (flags != 0)
-			visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
-							  *vmbuffer, visibility_cutoff_xid, flags);
+		PageSetAllVisible(page);
+		visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
+						  vmbuffer, visibility_cutoff_xid, flags);
 	}
 
 	/* Revert to the previous phase information for error traceback */
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 1d1ca423a..74ff01bb1 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -146,7 +146,9 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags
 	char	   *map;
 	bool		cleared = false;
 
+	/* Must never clear all_visible bit while leaving all_frozen bit set */
 	Assert(flags & VISIBILITYMAP_VALID_BITS);
+	Assert(flags != VISIBILITYMAP_ALL_VISIBLE);
 
 #ifdef TRACE_VISIBILITYMAP
 	elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
@@ -256,8 +258,11 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 #endif
 
 	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
-	Assert(InRecovery || BufferIsValid(heapBuf));
+	Assert(InRecovery || PageIsAllVisible((Page) BufferGetPage(heapBuf)));
+
+	/* Must never set all_frozen bit without also setting all_visible bit */
 	Assert(flags & VISIBILITYMAP_VALID_BITS);
+	Assert(flags != VISIBILITYMAP_ALL_FROZEN);
 
 	/* Check that we have the right heap page pinned, if present */
 	if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
@@ -299,8 +304,6 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 				{
 					Page		heapPage = BufferGetPage(heapBuf);
 
-					/* caller is expected to set PD_ALL_VISIBLE first */
-					Assert(PageIsAllVisible(heapPage));
 					PageSetLSN(heapPage, recptr);
 				}
 			}
-- 
2.39.0

