From 54127e3be41ac51ff3640960c08b4e65aa4a4892 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Fri, 11 Oct 2024 15:57:27 +0200
Subject: [PATCH v1] Avoid full btree index scans when skipping is possible

Previously, we could ignore the skip signal until the end of the range of
values producible by the index scan keys. Now, we can fail to start a new
primscan for at most the number of parallel workers + 1 pages, at the cost
of doing a bit more work before releasing the scan while we process the
'we may need a new primitive scan' signal.

If we detect that a parallel worker in the same primscan range thinks
this is the right moment to start a new primitive scan, we don't release
the parallel scan immediately, but instead only release it after reading
the page's contents to find out if we really should start a new primitive
scan.  If so, we start that new primitive scan, and if instead we find
we've already skidded into the range of pages we would've arrived on with
the skip scan, we instead mark that the primitive scan has reached a new
primscan range, do some cleanup, and then continue the scan as usual.
---
 src/include/access/nbtree.h           |  10 ++-
 src/backend/access/nbtree/nbtree.c    | 112 +++++++++++++++++++++++---
 src/backend/access/nbtree/nbtsearch.c |  15 +++-
 3 files changed, 121 insertions(+), 16 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d64300fb97..efe6728dd3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1052,6 +1052,11 @@ typedef struct BTScanOpaqueData
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
 
+	/* local state for coordinating skips in parallel scans */
+	bool		testPrimScan;	/* Are we trying to do a new primitive scan */
+	uint32		arrElemsGen;	/* Generation number of prim scan we want to
+								 * improve on */
+
 	/* info about killed items if any (killedItems is NULL if never used) */
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
@@ -1193,7 +1198,10 @@ extern int	btgettreeheight(Relation rel);
  */
 extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
 							   bool first);
-extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
+extern void _bt_parallel_opt_release_early(IndexScanDesc scan,
+										   BlockNumber scan_page);
+extern void _bt_parallel_opt_release_late(IndexScanDesc scan,
+										  BlockNumber scan_page);
 extern void _bt_parallel_done(IndexScanDesc scan);
 extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
 										   BlockNumber prev_scan_page);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 56e502c4fc..04c8d6e786 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -70,12 +70,15 @@ typedef struct BTParallelScanDescData
 	BTPS_State	btps_pageStatus;	/* indicates whether next page is
 									 * available for scan. see above for
 									 * possible states of parallel scan. */
+	uint32		btps_arrElemsGen;	/* generation number of the scan's array keys */
+	bool		btps_checkPrimScan;	/* did we skid past the most opportune
+									 * endpoint of a primitive scan? */
 	slock_t		btps_mutex;		/* protects above variables, btps_arrElems */
 	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
 
 	/*
 	 * btps_arrElems is used when scans need to schedule another primitive
-	 * index scan.  Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
+	 * index scan.  Holds the values for BTScanOpaque->arrayKeys[.].cur_elem.
 	 */
 	int			btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
 }			BTParallelScanDescData;
@@ -335,6 +338,9 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->arrayKeys = NULL;
 	so->orderProcs = NULL;
 	so->arrayContext = NULL;
+
+	so->testPrimScan = false;
+	so->arrElemsGen = 0;
 
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
@@ -550,6 +556,8 @@ btinitparallelscan(void *target)
 	SpinLockInit(&bt_target->btps_mutex);
 	bt_target->btps_scanPage = InvalidBlockNumber;
 	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+	bt_target->btps_arrElemsGen = 0;
+	bt_target->btps_checkPrimScan = false;
 	ConditionVariableInit(&bt_target->btps_cv);
 }
 
@@ -575,13 +583,15 @@ btparallelrescan(IndexScanDesc scan)
 	SpinLockAcquire(&btscan->btps_mutex);
 	btscan->btps_scanPage = InvalidBlockNumber;
 	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
+	btscan->btps_arrElemsGen = 0;
+	btscan->btps_checkPrimScan = false;
 	SpinLockRelease(&btscan->btps_mutex);
 }
 
 /*
  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
- *		page.  Other scans must wait until we call _bt_parallel_release()
- *		or _bt_parallel_done().
+ *		page.  Other scans must wait until we call _bt_parallel_done(),
+ *		[_btp]_opt_release_early/late(), or [_btp]_primscan_schedule().
  *
  * The return value is true if we successfully seized the scan and false
  * if we did not.  The latter case occurs when no pages remain, or when
@@ -640,6 +650,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 		{
 			/* We're done with this parallel index scan */
 			status = false;
+			so->testPrimScan = false;
+			so->arrElemsGen = 0;
 		}
 		else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
 		{
@@ -659,6 +671,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 				}
 				*pageno = InvalidBlockNumber;
 				exit_loop = true;
+				so->arrElemsGen = btscan->btps_arrElemsGen;
+				so->testPrimScan = false;
 			}
 			else
 			{
@@ -685,6 +699,9 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 			 */
 			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
 			*pageno = btscan->btps_scanPage;
+
+			so->arrElemsGen = btscan->btps_arrElemsGen;
+
 			exit_loop = true;
 		}
 		SpinLockRelease(&btscan->btps_mutex);
@@ -698,7 +715,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 }
 
 /*
- * _bt_parallel_release() -- Complete the process of advancing the scan to a
+ * _bt_parallel_opt_release_early() -- Finish advancing the scan to a
  *		new page.  We now have the new value btps_scanPage; some other backend
  *		can now begin advancing the scan.
  *
@@ -709,19 +726,76 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
  * scan lands on scan_page).
  */
 void
-_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+_bt_parallel_opt_release_early(IndexScanDesc scan, BlockNumber scan_page)
+{
+	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+	BTParallelScanDesc btscan;
+
+	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
+												  parallel_scan->ps_offset);
+
+	SpinLockAcquire(&btscan->btps_mutex);
+	/*
+	 * If a parallel worker noticed that it had skipped past the end of a
+	 * primitive scan after another backend already acquired the parallel scan
+	 * status, we don't release the scan before reading the page's contents.
+	 * Instead, we defer releasing the scan to _bt_parallel_opt_release_late().
+	 */
+	if (likely(!btscan->btps_checkPrimScan))
+	{
+		btscan->btps_scanPage = scan_page;
+		btscan->btps_pageStatus = BTPARALLEL_IDLE;
+		SpinLockRelease(&btscan->btps_mutex);
+		ConditionVariableSignal(&btscan->btps_cv);
+	}
+	else
+	{
+		BTScanOpaque	so = (BTScanOpaque) scan->opaque;
+		so->testPrimScan = true;
+		SpinLockRelease(&btscan->btps_mutex);
+	}
+}
+
+/*
+ * _bt_parallel_opt_release_late() -- Complete the process of advancing the
+ *		scan to a new page.
+ *
+ * We're only called when a concurrent backend wanted to schedule a skip scan,
+ * but failed to do so because the parallel scan already advanced past its
+ * own page.
+ */
+void
+_bt_parallel_opt_release_late(IndexScanDesc scan, BlockNumber scan_page)
 {
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	BTParallelScanDesc btscan;
+
+	if (!so->testPrimScan)
+		return;
 
 	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 												  parallel_scan->ps_offset);
 
 	SpinLockAcquire(&btscan->btps_mutex);
+	Assert(btscan->btps_checkPrimScan);
 	btscan->btps_scanPage = scan_page;
 	btscan->btps_pageStatus = BTPARALLEL_IDLE;
+
+	/*
+	 * A late release implies that 1) a concurrent backend noticed we
+	 * should've started a new primitive scan, and that 2) the current scan
+	 * position is already at-or-past the point where that scan would've
+	 * started.  So, we do what a new primitive scan would've done with the
+	 * shared state: we increase the generation number, and unset
+	 * checkPrimScan.
+	 */
+	btscan->btps_checkPrimScan = false;
+	btscan->btps_arrElemsGen += 1;
 	SpinLockRelease(&btscan->btps_mutex);
 	ConditionVariableSignal(&btscan->btps_cv);
+
+	so->testPrimScan = false;
 }
 
 /*
@@ -738,6 +812,7 @@ _bt_parallel_done(IndexScanDesc scan)
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 	BTParallelScanDesc btscan;
 	bool		status_changed = false;
+	so->testPrimScan = false;
 
 	/* Do nothing, for non-parallel scans */
 	if (parallel_scan == NULL)
@@ -774,10 +849,10 @@ _bt_parallel_done(IndexScanDesc scan)
 /*
  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
  *
- * Caller passes the block number most recently passed to _bt_parallel_release
- * by its backend.  Caller successfully schedules the next primitive index scan
- * if the shared parallel state hasn't been seized since caller's backend last
- * advanced the scan.
+ * Caller passes the block number most recently passed to
+ * _bt_parallel_opt_release_early by its backend.  Caller successfully
+ * schedules the next primitive index scan if the shared parallel state hasn't
+ * been seized since caller's backend last advanced the scan.
  */
 void
 _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
@@ -792,11 +867,13 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
 												  parallel_scan->ps_offset);
 
 	SpinLockAcquire(&btscan->btps_mutex);
-	if (btscan->btps_scanPage == prev_scan_page &&
-		btscan->btps_pageStatus == BTPARALLEL_IDLE)
+	if ((btscan->btps_scanPage == prev_scan_page &&
+		 btscan->btps_pageStatus == BTPARALLEL_IDLE) ||
+		unlikely(so->testPrimScan))
 	{
 		btscan->btps_scanPage = InvalidBlockNumber;
 		btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
+		btscan->btps_arrElemsGen += 1;
 
 		/* Serialize scan's current array keys */
 		for (int i = 0; i < so->numArrayKeys; i++)
@@ -806,7 +883,20 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
 			btscan->btps_arrElems[i] = array->cur_elem;
 		}
 	}
+	/*
+	 * If the shared array keys are still those of the primitive scan we used
+	 * to access prev_scan_page, make a note that the next page may be a good
+	 * opportunity to start a new primitive scan.  Once marked, a worker will
+	 * not release the scan until it has processed its page and knows for
+	 * sure whether a new prim scan is needed.
+	 */
+	else if (btscan->btps_arrElemsGen == so->arrElemsGen)
+	{
+		btscan->btps_checkPrimScan = true;
+	}
 	SpinLockRelease(&btscan->btps_mutex);
+
+	so->testPrimScan = false;
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fff7c89ead..8a9a0e6626 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1555,7 +1555,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
  *
  * In the case of a parallel scan, caller must have called _bt_parallel_seize
  * prior to calling this function; this function will invoke
- * _bt_parallel_release before returning.
+ * _bt_parallel_opt_release_early before returning.
  *
  * Returns true if any matching items found on the page, false if none.
  */
@@ -1590,7 +1590,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 		else
 			pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
 
-		_bt_parallel_release(scan, pstate.prev_scan_page);
+		_bt_parallel_opt_release_early(scan, pstate.prev_scan_page);
 	}
 
 	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
@@ -1943,6 +1943,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
 	}
 
+	/*
+	 * If !continuescan, releasing will be or has been done by either
+	 * [_btp]_done or [_btp]_primscan_schedule.
+	 */
+	if (scan->parallel_scan && pstate.continuescan)
+		_bt_parallel_opt_release_late(scan, pstate.prev_scan_page);
+
 	return (so->currPos.firstItem <= so->currPos.lastItem);
 }
 
@@ -2218,7 +2225,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 			else if (scan->parallel_scan != NULL)
 			{
 				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, opaque->btpo_next);
+				_bt_parallel_opt_release_early(scan, opaque->btpo_next);
 			}
 
 			/* nope, keep going */
@@ -2318,7 +2325,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 			else if (scan->parallel_scan != NULL)
 			{
 				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+				_bt_parallel_opt_release_early(scan, BufferGetBlockNumber(so->currPos.buf));
 			}
 
 			/*
-- 
2.45.2

