From ecad16b0bc692180d098bcf39fecfd3d0163c7d9 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Thu, 7 Nov 2024 16:21:03 -0500
Subject: [PATCH v2] Fix confusion with nbtree parallel scan currPos state.

Author: Peter Geoghegan <pg@bowt.ie>
Reported-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>
Diagnosed-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>
Discussion: https://postgr.es/m/f8efb9c0f8d1a71b44fd7f8e42e49c25@oss.nttdata.com
---
 src/backend/access/nbtree/nbtree.c    | 31 ++++++---
 src/backend/access/nbtree/nbtsearch.c | 97 +++++++++++----------------
 2 files changed, 62 insertions(+), 66 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 2919b1263..cd3c6f4f7 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -596,9 +596,7 @@ btparallelrescan(IndexScanDesc scan)
  * scan, and *last_curr_page returns the page that *next_scan_page came from.
  * An invalid *next_scan_page means the scan hasn't yet started, or that
  * caller needs to start the next primitive index scan (if it's the latter
- * case we'll set so.needPrimScan).  The first time a participating process
- * reaches the last page, it will return true and set *next_scan_page to
- * P_NONE; after that, further attempts to seize the scan will return false.
+ * case we'll set so.needPrimScan).
  *
  * Callers should ignore the value of *next_scan_page and *last_curr_page if
  * the return value is false.
@@ -724,6 +722,9 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
  * that it can be passed to _bt_parallel_primscan_schedule, should caller
  * determine that another primitive index scan is required.
  *
+ * If caller's next_scan_page is P_NONE, the scan has reached the index's
+ * rightmost/leftmost page.  We'll end the parallel scan right away.
+ *
  * Note: unlike the serial case, parallel scans don't need to remember both
  * sibling links.  next_scan_page is whichever link is next given the scan's
  * direction.  That's all we'll ever need, since the direction of a parallel
@@ -735,16 +736,30 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
 {
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 	BTParallelScanDesc btscan;
+	bool		ended_scan = false;
+
+	Assert(BlockNumberIsValid(next_scan_page));
 
 	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 												  parallel_scan->ps_offset);
 
 	SpinLockAcquire(&btscan->btps_mutex);
-	btscan->btps_nextScanPage = next_scan_page;
-	btscan->btps_lastCurrPage = curr_page;
-	btscan->btps_pageStatus = BTPARALLEL_IDLE;
+	if (next_scan_page == P_NONE)
+	{
+		btscan->btps_pageStatus = BTPARALLEL_DONE;
+		ended_scan = true;
+	}
+	else
+	{
+		btscan->btps_nextScanPage = next_scan_page;
+		btscan->btps_lastCurrPage = curr_page;
+		btscan->btps_pageStatus = BTPARALLEL_IDLE;
+	}
 	SpinLockRelease(&btscan->btps_mutex);
-	ConditionVariableSignal(&btscan->btps_cv);
+	if (ended_scan)
+		ConditionVariableBroadcast(&btscan->btps_cv);
+	else
+		ConditionVariableSignal(&btscan->btps_cv);
 }
 
 /*
@@ -770,7 +785,7 @@ _bt_parallel_done(IndexScanDesc scan)
 
 	/*
 	 * Should not mark parallel scan done when there's still a pending
-	 * primitive index scan (defensive)
+	 * primitive index scan
 	 */
 	if (so->needPrimScan)
 		return;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 1608dd49d..a3f215e17 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -46,7 +46,8 @@ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
 static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
 							  ScanDirection dir);
 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
-							 BlockNumber lastcurrblkno, ScanDirection dir);
+							 BlockNumber lastcurrblkno, ScanDirection dir,
+							 bool seized);
 static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
 										 BlockNumber lastcurrblkno);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
@@ -924,9 +925,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	{
 		BlockNumber blkno,
 					lastcurrblkno;
-		bool		status;
 
-		status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true);
+		if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
+			return false;
 
 		/*
 		 * Initialize arrays (when _bt_parallel_seize didn't already set up
@@ -935,14 +936,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		if (so->numArrayKeys && !so->needPrimScan)
 			_bt_start_array_keys(scan, dir);
 
-		if (!status)
-			return false;
-		else if (blkno == P_NONE)
-		{
-			_bt_parallel_done(scan);
-			return false;
-		}
-		else if (blkno != InvalidBlockNumber)
+		if (blkno != InvalidBlockNumber)
 		{
 			Assert(!so->needPrimScan);
 
@@ -950,7 +944,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * We anticipated starting another primitive scan, but some other
 			 * worker bet us to it
 			 */
-			if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir))
+			if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
 				return false;
 			goto readcomplete;
 		}
@@ -2012,12 +2006,8 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
  * a valid block, in any case.
  *
  * This is a wrapper on _bt_readnextpage that performs final steps for the
- * current page.  It sets up the _bt_readnextpage call using either local
- * state saved in so->currPos by the most recent _bt_readpage call, or using
- * shared parallel scan state (obtained by seizing the parallel scan here).
- *
- * Parallel scan callers that have already seized the scan should directly
- * call _bt_readnextpage, rather than calling here.
+ * current page.  It sets up the _bt_readnextpage call using local state saved
+ * in so->currPos by the most recent _bt_readpage call.
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2081,37 +2071,22 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	BTScanPosUnpinIfPinned(so->currPos);
 
 	/* Walk to the next page with data */
-	if (!scan->parallel_scan)
-	{
-		/* Not parallel, so use local state set by the last _bt_readpage */
-		if (ScanDirectionIsForward(dir))
-			blkno = so->currPos.nextPage;
-		else
-			blkno = so->currPos.prevPage;
-		lastcurrblkno = so->currPos.currPage;
-
-		/*
-		 * Cancel primitive index scans that were scheduled when the call to
-		 * _bt_readpage for currPos happened to use the opposite direction to
-		 * the one that we're stepping in now.  (It's okay to leave the scan's
-		 * array keys as-is, since the next _bt_readpage will advance them.)
-		 */
-		if (so->currPos.dir != dir)
-			so->needPrimScan = false;
-	}
+	if (ScanDirectionIsForward(dir))
+		blkno = so->currPos.nextPage;
 	else
-	{
-		/*
-		 * Seize the scan to get the nextPage and currPage from shared
-		 * parallel state (saved from parallel scan's last _bt_readpage)
-		 */
-		if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
-			return false;
+		blkno = so->currPos.prevPage;
+	lastcurrblkno = so->currPos.currPage;
 
-		Assert(!so->needPrimScan);
-	}
+	/*
+	 * Cancel primitive index scans that were scheduled when the call to
+	 * _bt_readpage for currPos happened to use the opposite direction to the
+	 * one that we're stepping in now.  (It's okay to leave the scan's array
+	 * keys as-is, since the next _bt_readpage will advance them.)
+	 */
+	if (so->currPos.dir != dir)
+		so->needPrimScan = false;
 
-	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir);
+	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
 }
 
 /*
@@ -2203,8 +2178,13 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  *
  * On entry, caller shouldn't hold any locks or pins on any page (we work
  * directly off of blkno and lastcurrblkno instead).  Parallel scan callers
- * must have seized the scan before calling here (blkno and lastcurrblkno
- * arguments should come from the seized scan).
+ * that seized the scan before calling here should pass seized=true; such a
+ * caller's blkno and lastcurrblkno arguments come from the seized scan.
+ * seized=false parallel scan callers just pass us the blkno/lastcurrblkno
+ * taken from their so->currPos, which can be used to end the scan, but will
+ * never be used to determine which page to read next (we must seize the scan
+ * to get the blkno that we have to read next, since the correct page to read
+ * might already be beyond a seized=false caller's blkno).
  *
  * On success exit, so->currPos is updated to contain data from the next
  * interesting page, and we return true (parallel scan callers should not use
@@ -2220,12 +2200,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  */
 static bool
 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
-				 BlockNumber lastcurrblkno, ScanDirection dir)
+				 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
 {
 	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	Assert(so->currPos.currPage == lastcurrblkno || scan->parallel_scan != NULL);
+	Assert(so->currPos.currPage == lastcurrblkno || seized);
 	Assert(!BTScanPosIsPinned(so->currPos));
 
 	/*
@@ -2254,6 +2234,14 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 
 		Assert(!so->needPrimScan);
 
+		/* parallel scan must never actually visit so->currPos blkno */
+		if (!seized && scan->parallel_scan != NULL &&
+			!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+		{
+			BTScanPosInvalidate(so->currPos);
+			return false;
+		}
+
 		if (ScanDirectionIsForward(dir))
 		{
 			/* read blkno, but check for interrupts first */
@@ -2308,14 +2296,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 
 		/* no matching tuples on this page */
 		_bt_relbuf(rel, so->currPos.buf);
-
-		/* parallel scan seizes another page (won't use so->currPos blkno) */
-		if (scan->parallel_scan != NULL &&
-			!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
-		{
-			BTScanPosInvalidate(so->currPos);
-			return false;
-		}
+		seized = false;			/* released by _bt_readpage (or by us) */
 	}
 
 	/*
-- 
2.45.2

