From fe8385c80181b0cca2edb3740efff684334b1ee2 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Mon, 19 Aug 2024 11:40:09 +0200
Subject: [PATCH v1] Avoid unneeded nbtree backwards scan buffer locks.

Instead of accessing the last scanned buffer, we now optimistically scan the left
sibling block of the current scan, optimistically decreasing the number of buffer
accesses in backward scans.
---
 src/backend/access/nbtree/nbtree.c    |  15 +-
 src/backend/access/nbtree/nbtsearch.c | 190 +++++++++++---------------
 src/backend/storage/lmgr/lwlock.c     |   1 +
 src/include/access/nbtree.h           |   5 +-
 4 files changed, 94 insertions(+), 117 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 686a3206f7..86f6a29d8e 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -67,6 +67,8 @@ typedef enum
 typedef struct BTParallelScanDescData
 {
 	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
+	BlockNumber btps_scanPrev;	/* previous page to be scanned, used in
+									 * parallel backward scans. */
 	BTPS_State	btps_pageStatus;	/* indicates whether next page is
 									 * available for scan. see above for
 									 * possible states of parallel scan. */
@@ -602,7 +604,7 @@ btparallelrescan(IndexScanDesc scan)
  * for first=false callers that require another primitive index scan.
  */
 bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, BlockNumber *prevpageno, bool first)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	bool		exit_loop = false;
@@ -611,6 +613,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 	BTParallelScanDesc btscan;
 
 	*pageno = P_NONE;
+	if (prevpageno)
+		*prevpageno = P_NONE;
 
 	if (first)
 	{
@@ -671,6 +675,9 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 				so->needPrimScan = true;
 				so->scanBehind = false;
 				*pageno = InvalidBlockNumber;
+
+				if (prevpageno)
+					*prevpageno = P_NONE;
 				exit_loop = true;
 			}
 		}
@@ -682,6 +689,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 			 */
 			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
 			*pageno = btscan->btps_scanPage;
+			if (prevpageno)
+				*prevpageno = btscan->btps_scanPrev;
 			exit_loop = true;
 		}
 		SpinLockRelease(&btscan->btps_mutex);
@@ -706,7 +715,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
  * scan lands on scan_page).
  */
 void
-_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page,
+					 BlockNumber scan_prev)
 {
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 	BTParallelScanDesc btscan;
@@ -716,6 +726,7 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
 
 	SpinLockAcquire(&btscan->btps_mutex);
 	btscan->btps_scanPage = scan_page;
+	btscan->btps_scanPrev = scan_prev;
 	btscan->btps_pageStatus = BTPARALLEL_IDLE;
 	SpinLockRelease(&btscan->btps_mutex);
 	ConditionVariableSignal(&btscan->btps_cv);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 2551df8a67..cc80b45ac5 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -43,10 +43,9 @@ static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
 									   OffsetNumber offnum,
 									   ItemPointer heapTid, int tupleOffset);
 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
-static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
-static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
-								  ScanDirection dir);
-static Buffer _bt_walk_left(Relation rel, Buffer buf);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber expectNext, ScanDirection dir);
+static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber nextblkno, ScanDirection dir);
+static Buffer _bt_walk_left(Relation rel, BlockNumber blkno, BlockNumber oblknum);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 
@@ -926,7 +925,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 */
 	if (scan->parallel_scan != NULL)
 	{
-		status = _bt_parallel_seize(scan, &blkno, true);
+		BlockNumber	expectNext;
+		status = _bt_parallel_seize(scan, &blkno, &expectNext, true);
 
 		/*
 		 * Initialize arrays (when _bt_parallel_seize didn't already set up
@@ -944,7 +944,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		}
 		else if (blkno != InvalidBlockNumber)
 		{
-			if (!_bt_parallel_readpage(scan, blkno, dir))
+			if (!_bt_parallel_readpage(scan, blkno, expectNext, dir))
 				return false;
 			goto readcomplete;
 		}
@@ -1584,9 +1584,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 		if (ScanDirectionIsForward(dir))
 			pstate.prev_scan_page = opaque->btpo_next;
 		else
-			pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
+			pstate.prev_scan_page = opaque->btpo_prev;
 
-		_bt_parallel_release(scan, pstate.prev_scan_page);
+		_bt_parallel_release(scan, pstate.prev_scan_page, so->currPos.currPage);
 	}
 
 	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
@@ -1623,10 +1623,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 
 	/*
 	 * we must save the page's right-link while scanning it; this tells us
-	 * where to step right to after we're done with these items.  There is no
-	 * corresponding need for the left-link, since splits always go right.
+	 * where to step right to after we're done with these items.
 	 */
 	so->currPos.nextPage = opaque->btpo_next;
+	so->currPos.prevPage = opaque->btpo_prev;
 
 	/* initialize tuple workspace to empty */
 	so->currPos.nextTupleOffset = 0;
@@ -2044,6 +2044,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	BlockNumber blkno = InvalidBlockNumber;
+	BlockNumber expectNext = P_NONE;
 	bool		status;
 
 	Assert(BTScanPosIsValid(so->currPos));
@@ -2105,7 +2106,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 			 * Seize the scan to get the next block number; if the scan has
 			 * ended already, bail out.
 			 */
-			status = _bt_parallel_seize(scan, &blkno, false);
+			status = _bt_parallel_seize(scan, &blkno, NULL, false);
 			if (!status)
 			{
 				/* release the previous buffer, if pinned */
@@ -2128,31 +2129,34 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	}
 	else
 	{
-		/* Remember we left a page with data */
-		so->currPos.moreRight = true;
-
 		if (scan->parallel_scan != NULL)
 		{
 			/*
 			 * Seize the scan to get the current block number; if the scan has
 			 * ended already, bail out.
 			 */
-			status = _bt_parallel_seize(scan, &blkno, false);
-			BTScanPosUnpinIfPinned(so->currPos);
+			status = _bt_parallel_seize(scan, &blkno, &expectNext, false);
 			if (!status)
 			{
+				BTScanPosUnpinIfPinned(so->currPos);
 				BTScanPosInvalidate(so->currPos);
 				return false;
 			}
 		}
 		else
 		{
-			/* Not parallel, so just use our own notion of the current page */
-			blkno = so->currPos.currPage;
+			/* Not parallel, so use the previously-saved prevPage link. */
+			blkno = so->currPos.prevPage;
+			expectNext = so->currPos.currPage;
 		}
+
+		/* Remember we left a page with data */
+		so->currPos.moreRight = true;
+
+		BTScanPosUnpinIfPinned(so->currPos);
 	}
 
-	if (!_bt_readnextpage(scan, blkno, dir))
+	if (!_bt_readnextpage(scan, blkno, expectNext, dir))
 		return false;
 
 	/* We have at least one item to return as scan's next item */
@@ -2172,7 +2176,8 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
  */
 static bool
-_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber expectNext,
+				 ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	Relation	rel;
@@ -2184,6 +2189,8 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
 	if (ScanDirectionIsForward(dir))
 	{
+		Assert(expectNext == P_NONE);
+
 		for (;;)
 		{
 			/*
@@ -2214,14 +2221,14 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 			else if (scan->parallel_scan != NULL)
 			{
 				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, opaque->btpo_next);
+				_bt_parallel_release(scan, opaque->btpo_next, blkno);
 			}
 
 			/* nope, keep going */
 			if (scan->parallel_scan != NULL)
 			{
 				_bt_relbuf(rel, so->currPos.buf);
-				status = _bt_parallel_seize(scan, &blkno, false);
+				status = _bt_parallel_seize(scan, &blkno, NULL, false);
 				if (!status)
 				{
 					BTScanPosInvalidate(so->currPos);
@@ -2237,75 +2244,32 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 	}
 	else
 	{
-		/*
-		 * Should only happen in parallel cases, when some other backend
-		 * advanced the scan.
-		 */
-		if (so->currPos.currPage != blkno)
-		{
-			BTScanPosUnpinIfPinned(so->currPos);
-			so->currPos.currPage = blkno;
-		}
-
-		/* Done if we know that the left sibling link isn't of interest */
-		if (!so->currPos.moreLeft)
-		{
-			BTScanPosUnpinIfPinned(so->currPos);
-			_bt_parallel_done(scan);
-			BTScanPosInvalidate(so->currPos);
-			return false;
-		}
-
-		/*
-		 * Walk left to the next page with data.  This is much more complex
-		 * than the walk-right case because of the possibility that the page
-		 * to our left splits while we are in flight to it, plus the
-		 * possibility that the page we were on gets deleted after we leave
-		 * it.  See nbtree/README for details.
-		 *
-		 * It might be possible to rearrange this code to have less overhead
-		 * in pinning and locking, but that would require capturing the left
-		 * sibling block number when the page is initially read, and then
-		 * optimistically starting there (rather than pinning the page twice).
-		 * It is not clear that this would be worth the complexity.
-		 */
-		if (BTScanPosIsPinned(so->currPos))
-			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
-		else
-			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
-
 		for (;;)
 		{
-			/* Done if we know that the left sibling link isn't of interest */
-			if (!so->currPos.moreLeft)
+			/*
+			 * if we're at end of scan, give up and mark parallel scan as
+			 * done, so that all the workers can finish their scan
+			 */
+			if (blkno == P_NONE || !so->currPos.moreLeft)
 			{
-				_bt_relbuf(rel, so->currPos.buf);
 				_bt_parallel_done(scan);
 				BTScanPosInvalidate(so->currPos);
 				return false;
 			}
 
-			/* Step to next physical page */
-			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
+			CHECK_FOR_INTERRUPTS();
 
-			/* if we're physically at end of index, return failure */
-			if (so->currPos.buf == InvalidBuffer)
-			{
-				_bt_parallel_done(scan);
-				BTScanPosInvalidate(so->currPos);
-				return false;
-			}
+			Assert(!BTScanPosIsPinned(so->currPos));
 
-			/*
-			 * Okay, we managed to move left to a non-deleted page. Done if
-			 * it's not half-dead and contains matching tuples. Else loop back
-			 * and do it all again.
-			 */
+			/* Step left one page */
+			so->currPos.buf = _bt_walk_left(rel, blkno, expectNext);
 			page = BufferGetPage(so->currPos.buf);
 			opaque = BTPageGetOpaque(page);
+
+			/* check for deleted page */
 			if (!P_IGNORE(opaque))
 			{
-				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+				PredicateLockPage(rel, blkno, scan->xs_snapshot);
 				/* see if there are any matches on this page */
 				/* note that this will clear moreLeft if we can stop */
 				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
@@ -2314,25 +2278,26 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 			else if (scan->parallel_scan != NULL)
 			{
 				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+				_bt_parallel_release(scan, opaque->btpo_prev, blkno);
 			}
 
-			/*
-			 * For parallel scans, get the last page scanned as it is quite
-			 * possible that by the time we try to seize the scan, some other
-			 * worker has already advanced the scan to a different page.  We
-			 * must continue based on the latest page scanned by any worker.
-			 */
+			/* nope, keep going */
 			if (scan->parallel_scan != NULL)
 			{
 				_bt_relbuf(rel, so->currPos.buf);
-				status = _bt_parallel_seize(scan, &blkno, false);
+				status = _bt_parallel_seize(scan, &blkno, &expectNext, false);
+
 				if (!status)
 				{
 					BTScanPosInvalidate(so->currPos);
 					return false;
 				}
-				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+			}
+			else
+			{
+				expectNext = blkno;
+				blkno = opaque->btpo_prev;
+				_bt_relbuf(rel, so->currPos.buf);
 			}
 		}
 	}
@@ -2347,7 +2312,8 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
  * indicate success.
  */
 static bool
-_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
+					  BlockNumber nextblkno, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
@@ -2355,7 +2321,7 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
 	_bt_initialize_more_data(so, dir);
 
-	if (!_bt_readnextpage(scan, blkno, dir))
+	if (!_bt_readnextpage(scan, blkno, nextblkno, dir))
 		return false;
 
 	/* We have at least one item to return as scan's next item */
@@ -2367,43 +2333,29 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 /*
  * _bt_walk_left() -- step left one page, if possible
  *
- * The given buffer must be pinned and read-locked.  This will be dropped
- * before stepping left.  On return, we have pin and read lock on the
- * returned page, instead.
+ * On return, we have a pin and read lock on the returned page.
  *
  * Returns InvalidBuffer if there is no page to the left (no lock is held
  * in that case).
  *
  * It is possible for the returned leaf page to be half-dead; caller must
  * check that condition and step left again when required.
+ *
+ * oblknum is the block we're walking left from
+ * blkno is the left sibling of oblknum when we last read that page
+ * (i.e. blkno was stored in oblknum's bt_prev).
  */
 static Buffer
-_bt_walk_left(Relation rel, Buffer buf)
+_bt_walk_left(Relation rel, BlockNumber blkno, BlockNumber oblknum)
 {
 	Page		page;
 	BTPageOpaque opaque;
-
-	page = BufferGetPage(buf);
-	opaque = BTPageGetOpaque(page);
+	BlockNumber lblkno = blkno;
 
 	for (;;)
 	{
-		BlockNumber obknum;
-		BlockNumber lblkno;
-		BlockNumber blkno;
+		Buffer		buf;
 		int			tries;
-
-		/* if we're at end of tree, release buf and return failure */
-		if (P_LEFTMOST(opaque))
-		{
-			_bt_relbuf(rel, buf);
-			break;
-		}
-		/* remember original page we are stepping left from */
-		obknum = BufferGetBlockNumber(buf);
-		/* step left */
-		blkno = lblkno = opaque->btpo_prev;
-		_bt_relbuf(rel, buf);
 		/* check for interrupts while we're not holding any buffer lock */
 		CHECK_FOR_INTERRUPTS();
 		buf = _bt_getbuf(rel, blkno, BT_READ);
@@ -2423,7 +2375,7 @@ _bt_walk_left(Relation rel, Buffer buf)
 		tries = 0;
 		for (;;)
 		{
-			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+			if (!P_ISDELETED(opaque) && opaque->btpo_next == oblknum)
 			{
 				/* Found desired page, return it */
 				return buf;
@@ -2437,7 +2389,7 @@ _bt_walk_left(Relation rel, Buffer buf)
 		}
 
 		/* Return to the original page to see what's up */
-		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+		buf = _bt_relandgetbuf(rel, buf, oblknum, BT_READ);
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 		if (P_ISDELETED(opaque))
@@ -2475,9 +2427,21 @@ _bt_walk_left(Relation rel, Buffer buf)
 			 */
 			if (opaque->btpo_prev == lblkno)
 				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
-					 obknum, RelationGetRelationName(rel));
+					 oblknum, RelationGetRelationName(rel));
 			/* Okay to try again with new lblkno value */
 		}
+
+		/* if we're at end of tree, release buf and return failure */
+		if (P_LEFTMOST(opaque))
+		{
+			_bt_relbuf(rel, buf);
+			break;
+		}
+		/* remember original page we are stepping left from */
+		oblknum = BufferGetBlockNumber(buf);
+		/* step left */
+		blkno = lblkno = opaque->btpo_prev;
+		_bt_relbuf(rel, buf);
 	}
 
 	return InvalidBuffer;
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index e765754d80..5cff74d6d2 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -87,6 +87,7 @@
 #include "utils/memutils.h"
 
 #ifdef LWLOCK_STATS
+#include "storage/ipc.h"
 #include "utils/hsearch.h"
 #endif
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9af9b3ecdc..2e263b4036 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -955,6 +955,7 @@ typedef struct BTScanPosData
 	XLogRecPtr	lsn;			/* pos in the WAL stream when page was read */
 	BlockNumber currPage;		/* page referenced by items array */
 	BlockNumber nextPage;		/* page's right link when we scanned it */
+	BlockNumber prevPage;		/* page's left link when we scanned it */
 
 	/*
 	 * moreLeft and moreRight track whether we think there may be matching
@@ -1191,8 +1192,8 @@ extern bool btcanreturn(Relation index, int attno);
  * prototypes for internal functions in nbtree.c
  */
 extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
-							   bool first);
-extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
+							   BlockNumber *prevpageno, bool first);
+extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page, BlockNumber scan_prev);
 extern void _bt_parallel_done(IndexScanDesc scan);
 extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
 										   BlockNumber prev_scan_page);
-- 
2.40.1

