From ab3d7230031bfc6461b2334f564c434ae0cd4b12 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 19 Jun 2023 09:58:43 -0700
Subject: [PATCH v1] Add nbtree "goback" boundary case optimization.

---
 src/include/access/nbtree.h           |  9 ++--
 src/backend/access/nbtree/nbtpage.c   | 19 +++++--
 src/backend/access/nbtree/nbtsearch.c | 75 ++++++++++++++++-----------
 src/backend/access/nbtree/nbtutils.c  | 12 +++--
 contrib/amcheck/verify_nbtree.c       | 16 +++---
 5 files changed, 79 insertions(+), 52 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 8891fa797..6cffbf6d5 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -767,9 +767,10 @@ typedef BTStackData *BTStack;
  * locate the first item >= scankey.  When nextkey is true, they will locate
  * the first item > scan key.
  *
- * pivotsearch is set to true by callers that want to re-find a leaf page
- * using a scankey built from a leaf page's high key.  Most callers set this
- * to false.
+ * goback is set to true by callers that intend to back up from the first
+ * match according to the scankey once on the leaf level.  Forward scan
+ * callers always set this to false.  Some backward scan callers prefer to set
+ * this to true to avoid scanning an extra leaf page in boundary cases.
  *
  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
  * is set to NULL when index scan doesn't need to find a position for a
@@ -792,7 +793,7 @@ typedef struct BTScanInsertData
 	bool		allequalimage;
 	bool		anynullkeys;
 	bool		nextkey;
-	bool		pivotsearch;
+	bool		goback;
 	ItemPointer scantid;		/* tiebreaker for scankeys */
 	int			keysz;			/* Size of scankeys array */
 	ScanKeyData scankeys[INDEX_MAX_KEYS];	/* Must appear last */
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index c2050656e..61e4c7dac 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1958,12 +1958,21 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 					return;
 				}
 
-				/* we need an insertion scan key for the search, so build one */
+				/*
+				 * We need an insertion scan key for the search, so build one.
+				 * We specify goback=true because we're searching for the
+				 * location of a matching high key (not the location of the
+				 * first matching non-pivot tuple, which will be in the right
+				 * sibling of the sleafbuf page returned by _bt_search).
+				 *
+				 * Note: Unlike regular "goback" searchers, we're not prepared
+				 * to step back from the page returned by _bt_search.  That
+				 * makes specifying goback=true crucial here (for us it's not
+				 * merely about handling certain boundary cases optimally.)
+				 */
 				itup_key = _bt_mkscankey(rel, targetkey);
-				/* find the leftmost leaf page with matching pivot/high key */
-				itup_key->pivotsearch = true;
-				stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ,
-								   NULL);
+				itup_key->goback = true;
+				stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, NULL);
 				/* won't need a second lock or pin on leafbuf */
 				_bt_relbuf(rel, sleafbuf);
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 7e05e5867..d7798510d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -792,11 +792,24 @@ _bt_compare(Relation rel,
 		 * doesn't have to descend left because it isn't interested in a match
 		 * that has a heap TID value of -inf.
 		 *
-		 * However, some searches (pivotsearch searches) actually require that
-		 * we descend left when this happens.  -inf is treated as a possible
-		 * match for omitted scankey attribute(s).  This is needed by page
-		 * deletion, which must re-find leaf pages that are targets for
-		 * deletion using their high keys.
+		 * We take a symmetrical approach in our handling of such a boundary
+		 * case during goback searches (though with the same goal in mind as
+		 * the regular/!goback case).  Here we treat -inf as a possible match
+		 * for omitted scankey attribute(s) instead.  That has the effect of
+		 * making the search avoid uselessly locking/pinning the page to the
+		 * right.  In other words, goback search should descend left directly
+		 * because it is only interested in a match that is strictly < 'foo'.
+		 * All goback searches are from backward scan callers that don't
+		 * actually need to visit the page to the right at all.
+		 *
+		 * Note: Some goback searches won't get this behavior.  Consider a
+		 * backward scan whose insertion scan key locates tuples <= 'foo'.
+		 * This search needs to be able to find matches on both sides of the
+		 * ('foo', inf) leaf level high key, so visiting at least two leaf
+		 * pages is unavoidable.  This case will use an initial insertion scan
+		 * key that is goback=true and nextkey=true.  (When nextkey=true then
+		 * goback won't influence how _bt_search descends the tree, no matter
+		 * what we do here.)
 		 *
 		 * Note: the heap TID part of the test ensures that scankey is being
 		 * compared to a pivot tuple with one or more truncated key
@@ -808,9 +821,13 @@ _bt_compare(Relation rel,
 		 * non-key attributes).  !heapkeyspace searches must always be
 		 * prepared to deal with matches on both sides of the pivot once the
 		 * leaf level is reached.
+		 *
+		 * Note: it would be safe (though far from optimal) for every search
+		 * to use goback=true; heapkeyspace searches _effectively_ do so for
+		 * every _bt_search call.
 		 */
-		if (key->heapkeyspace && !key->pivotsearch &&
-			key->keysz == ntupatts && heapTid == NULL)
+		if (heapTid == NULL && key->keysz == ntupatts && !key->goback &&
+			key->heapkeyspace)
 			return 1;
 
 		/* All provided scankey arguments found to be equal */
@@ -875,8 +892,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	BTStack		stack;
 	OffsetNumber offnum;
 	StrategyNumber strat;
-	bool		nextkey;
-	bool		goback;
 	BTScanInsertData inskey;
 	ScanKey		startKeys[INDEX_MAX_KEYS];
 	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
@@ -1283,6 +1298,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * goback = false, we will start the scan on the located item.
 	 *----------
 	 */
+	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
+	inskey.anynullkeys = false; /* unused */
+	inskey.scantid = NULL;
+	inskey.keysz = keysCount;
 	switch (strat_total)
 	{
 		case BTLessStrategyNumber:
@@ -1293,8 +1312,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * for a backward scan, so that is always the correct starting
 			 * position.)
 			 */
-			nextkey = false;
-			goback = true;
+			inskey.nextkey = false;
+			inskey.goback = true;
 			break;
 
 		case BTLessEqualStrategyNumber:
@@ -1305,8 +1324,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * for a backward scan, so that is always the correct starting
 			 * position.)
 			 */
-			nextkey = true;
-			goback = true;
+			inskey.nextkey = true;
+			inskey.goback = true;
 			break;
 
 		case BTEqualStrategyNumber:
@@ -1321,8 +1340,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 				 * This is the same as the <= strategy.  We will check at the
 				 * end whether the found item is actually =.
 				 */
-				nextkey = true;
-				goback = true;
+				inskey.nextkey = true;
+				inskey.goback = true;
 			}
 			else
 			{
@@ -1330,8 +1349,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 				 * This is the same as the >= strategy.  We will check at the
 				 * end whether the found item is actually =.
 				 */
-				nextkey = false;
-				goback = false;
+				inskey.nextkey = false;
+				inskey.goback = false;
 			}
 			break;
 
@@ -1341,8 +1360,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * Find first item >= scankey.  (This is only used for forward
 			 * scans.)
 			 */
-			nextkey = false;
-			goback = false;
+			inskey.nextkey = false;
+			inskey.goback = false;
 			break;
 
 		case BTGreaterStrategyNumber:
@@ -1351,8 +1370,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * Find first item > scankey.  (This is only used for forward
 			 * scans.)
 			 */
-			nextkey = true;
-			goback = false;
+			inskey.nextkey = true;
+			inskey.goback = false;
 			break;
 
 		default:
@@ -1361,14 +1380,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			return false;
 	}
 
-	/* Initialize remaining insertion scan key fields */
-	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
-	inskey.anynullkeys = false; /* unused */
-	inskey.nextkey = nextkey;
-	inskey.pivotsearch = false;
-	inskey.scantid = NULL;
-	inskey.keysz = keysCount;
-
 	/*
 	 * Use the manufactured insertion scan key to descend the tree and
 	 * position ourselves on the target leaf page.
@@ -1420,9 +1431,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * the last item on this page.  Adjust the starting offset if needed. (If
 	 * this results in an offset before the first item or after the last one,
 	 * _bt_readpage will report no items found, and then we'll step to the
-	 * next page as needed.)
+	 * next page as needed.  _bt_search avoids this whenever possible.)
 	 */
-	if (goback)
+	if (inskey.goback)
 		offnum = OffsetNumberPrev(offnum);
 
 	/* remember which buffer we have pinned, if any */
@@ -1614,6 +1625,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			}
 
 			itup = (IndexTuple) PageGetItem(page, iid);
+			Assert(!BTreeTupleIsPivot(itup));
 
 			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
 			{
@@ -1722,6 +1734,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 				tuple_alive = true;
 
 			itup = (IndexTuple) PageGetItem(page, iid);
+			Assert(!BTreeTupleIsPivot(itup));
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
 										 &continuescan);
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4d..87714aba0 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -65,8 +65,8 @@ static int	_bt_keep_natts(Relation rel, IndexTuple lastleft,
  *		When itup is a non-pivot tuple, the returned insertion scan key is
  *		suitable for finding a place for it to go on the leaf level.  Pivot
  *		tuples can be used to re-find leaf page with matching high key, but
- *		then caller needs to set scan key's pivotsearch field to true.  This
- *		allows caller to search for a leaf page with a matching high key,
+ *		then caller needs to set insertion scan key's goback field to true.
+ *		This allows caller to search for a leaf page with a matching high key,
  *		which is usually to the left of the first leaf page a non-pivot match
  *		might appear on.
  *
@@ -121,7 +121,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 	}
 	key->anynullkeys = false;	/* initial assumption */
 	key->nextkey = false;
-	key->pivotsearch = false;
+	key->goback = false;
 	key->keysz = Min(indnkeyatts, tupnatts);
 	key->scantid = key->heapkeyspace && itup ?
 		BTreeTupleGetHeapTID(itup) : NULL;
@@ -725,7 +725,11 @@ _bt_restore_array_keys(IndexScanDesc scan)
  * failure of either key is indeed enough to stop the scan.  (In general, when
  * inequality keys are present, the initial-positioning code only promises to
  * position before the first possible match, not exactly at the first match,
- * for a forward scan; or after the last match for a backward scan.)
+ * for a forward scan; or after the last match for a backward scan.  Note,
+ * however, that there are various optimizations in the initial-positioning
+ * code that maximize the likelihood that _bt_search will return the leaf page
+ * whose key space makes it precisely the first/last leaf page where a match
+ * might be found.)
  *
  * As a byproduct of this work, we can detect contradictory quals such
  * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 94a975932..e108e806e 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -2779,7 +2779,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 	ItemId		itemid;
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->goback);
 
 	/* Verify line pointer before checking tuple */
 	itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
@@ -2841,7 +2841,7 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->goback);
 
 	cmp = _bt_compare(state->rel, key, state->target, upperbound);
 
@@ -2864,7 +2864,7 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->goback);
 
 	cmp = _bt_compare(state->rel, key, state->target, lowerbound);
 
@@ -2902,7 +2902,7 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
 	ItemId		itemid;
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->goback);
 
 	/* Verify line pointer before checking tuple */
 	itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
@@ -3128,9 +3128,9 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
  * For example, invariant_g_offset() might miss a cross-page invariant failure
  * on an internal level if the scankey built from the first item on the
  * target's right sibling page happened to be equal to (not greater than) the
- * last item on target page.  The !pivotsearch tiebreaker in _bt_compare()
- * might otherwise cause amcheck to assume (rather than actually verify) that
- * the scankey is greater.
+ * last item on target page.  The !goback tiebreaker in _bt_compare() might
+ * otherwise cause amcheck to assume (rather than actually verify) that the
+ * scankey is greater.
  */
 static inline BTScanInsert
 bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)
@@ -3138,7 +3138,7 @@ bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)
 	BTScanInsert skey;
 
 	skey = _bt_mkscankey(rel, itup);
-	skey->pivotsearch = true;
+	skey->goback = true;
 
 	return skey;
 }
-- 
2.40.1

