From c225826208c31a980e41e1fad0298bb552f6114c Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Fri, 31 Jul 2020 16:57:35 -0700
Subject: [PATCH v3] Avoid backwards scan page deletion standby race.

Author: Michail Nikolaev
Discussion: https://postgr.es/m/CANtu0ohkR-evAWbpzJu54V8eCOtqjJyYp3PQ_SGoBTRGXWhWRw@mail.gmail.com
---
 src/backend/access/nbtree/README    | 18 ++++++
 src/backend/access/nbtree/nbtxlog.c | 85 ++++++++++++++++++-----------
 2 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 32ad9e339a..a08c3de3f3 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -572,6 +572,24 @@ replay of page deletion records does not hold a write lock on the target
 leaf page throughout; only the primary needs to block out concurrent
 writers that insert on to the page being deleted.)
 
+There are also locking differences between the primary and WAL replay
+for the first stage of a page split (i.e. same-level differences in
+locking).  Replay of the first phase of a page split can get away with
+locking and updating the original right sibling page (which is also the
+new right sibling page's right sibling) after locks on the original page
+and its new right sibling have been released.  Again, this is okay
+because there are no writers.  Page deletion cannot get away with being
+lax about same-level locking during replay, though -- doing so risks
+confusing concurrent backwards scans.
+
+Page deletion's second phase locks the left sibling page, target page,
+and right page in order on the standby, just like on the primary.  This
+allows backwards scans running on a standby to reason about page
+deletion on the leaf level; a page cannot appear deleted without that
+being reflected in the sibling pages.  It's probably possible to be more
+lax about how locks are acquired on the standby during the second phase
+of page deletion, but that hardly seems worth it.
+
 During recovery all index scans start with ignore_killed_tuples = false
 and we never set kill_prior_tuple. We do this because the oldest xmin
 on the standby server can be older than the oldest xmin on the primary
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5d346da84f..8e46437dbe 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -774,7 +774,10 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
 	BlockNumber leftsib;
 	BlockNumber rightsib;
-	Buffer		buffer;
+	Buffer		leftbuf;
+	Buffer		target;
+	Buffer		rightbuf;
+	Buffer		leafbuf;
 	Page		page;
 	BTPageOpaque pageop;
 
@@ -783,46 +786,38 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 
 	/*
 	 * In normal operation, we would lock all the pages this WAL record
-	 * touches before changing any of them.  In WAL replay, it should be okay
-	 * to lock just one page at a time, since no concurrent index updates can
-	 * be happening, and readers should not care whether they arrive at the
-	 * target page or not (since it's surely empty).
+	 * touches before changing any of them.  In WAL replay, we at least lock
+	 * the pages in the same standard left-to-right order (leftsib, target,
+	 * rightsib).
+	 *
+	 * btree_xlog_split() can get away with fixing its right sibling page's
+	 * left link last of all, after dropping all other locks.  We prefer to
+	 * avoid dropping locks on same-level pages early compared to normal
+	 * operation.  This keeps things simple for backwards scans.  See
+	 * nbtree/README.
 	 */
 
-	/* Fix left-link of right sibling */
-	if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
-	{
-		page = (Page) BufferGetPage(buffer);
-		pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-		pageop->btpo_prev = leftsib;
-
-		PageSetLSN(page, lsn);
-		MarkBufferDirty(buffer);
-	}
-	if (BufferIsValid(buffer))
-		UnlockReleaseBuffer(buffer);
-
 	/* Fix right-link of left sibling, if any */
 	if (leftsib != P_NONE)
 	{
-		if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+		if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
 		{
-			page = (Page) BufferGetPage(buffer);
+			page = (Page) BufferGetPage(leftbuf);
 			pageop = (BTPageOpaque) PageGetSpecialPointer(page);
 			pageop->btpo_next = rightsib;
 
 			PageSetLSN(page, lsn);
-			MarkBufferDirty(buffer);
+			MarkBufferDirty(leftbuf);
 		}
-		if (BufferIsValid(buffer))
-			UnlockReleaseBuffer(buffer);
 	}
+	else
+		leftbuf = InvalidBuffer;
 
 	/* Rewrite target page as empty deleted page */
-	buffer = XLogInitBufferForRedo(record, 0);
-	page = (Page) BufferGetPage(buffer);
+	target = XLogInitBufferForRedo(record, 0);
+	page = (Page) BufferGetPage(target);
 
-	_bt_pageinit(page, BufferGetPageSize(buffer));
+	_bt_pageinit(page, BufferGetPageSize(target));
 	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
 
 	pageop->btpo_prev = leftsib;
@@ -832,8 +827,27 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	pageop->btpo_cycleid = 0;
 
 	PageSetLSN(page, lsn);
-	MarkBufferDirty(buffer);
-	UnlockReleaseBuffer(buffer);
+	MarkBufferDirty(target);
+
+	/* Fix left-link of right sibling */
+	if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(rightbuf);
+		pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+		pageop->btpo_prev = leftsib;
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(rightbuf);
+	}
+
+	/* Release siblings */
+	if (BufferIsValid(leftbuf))
+		UnlockReleaseBuffer(leftbuf);
+	if (BufferIsValid(rightbuf))
+		UnlockReleaseBuffer(rightbuf);
+
+	/* Release target */
+	UnlockReleaseBuffer(target);
 
 	/*
 	 * If we deleted a parent of the targeted leaf page, instead of the leaf
@@ -845,13 +859,18 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 		/*
 		 * There is no real data on the page, so we just re-create it from
 		 * scratch using the information from the WAL record.
+		 *
+		 * Note that we don't end up here when the target page is also the
+		 * leafbuf page.  There is no need to add a dummy hikey item with a
+		 * top parent link when deleting leafbuf because it's the last page
+		 * we'll delete in the subtree undergoing deletion.
 		 */
 		IndexTupleData trunctuple;
 
-		buffer = XLogInitBufferForRedo(record, 3);
-		page = (Page) BufferGetPage(buffer);
+		leafbuf = XLogInitBufferForRedo(record, 3);
+		page = (Page) BufferGetPage(leafbuf);
 
-		_bt_pageinit(page, BufferGetPageSize(buffer));
+		_bt_pageinit(page, BufferGetPageSize(leafbuf));
 		pageop = (BTPageOpaque) PageGetSpecialPointer(page);
 
 		pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
@@ -870,8 +889,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 			elog(ERROR, "could not add dummy high key to half-dead page");
 
 		PageSetLSN(page, lsn);
-		MarkBufferDirty(buffer);
-		UnlockReleaseBuffer(buffer);
+		MarkBufferDirty(leafbuf);
+		UnlockReleaseBuffer(leafbuf);
 	}
 
 	/* Update metapage if needed */
-- 
2.25.1

