From 3dd679680fe85cf290e6957e1ae0f4f627d467d9 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 19 Jun 2023 09:58:43 -0700
Subject: [PATCH v3] Optimize nbtree backward scan boundary cases.

Teach _bt_search (and related helper routines like _bt_binsrch and
_bt_compare) about the initial positioning requirements of backward
scans.  This relieves _bt_first from having to deal with those details
for itself.

Now that certain implementation details are hidden from _bt_first, it's
easy to add a new optimization: backwards scans that use the < strategy
avoid uselessly scanning an extra leaf page in certain "boundary cases".

Consider the following example:

SELECT * FROM tenk1 WHERE hundred < 12 ORDER BY hundred DESC LIMIT 1;

Before this commit, nbtree would needlessly scan two leaf pages, even
though it was only really necessary to scan one leaf page (when the
relevant leaf page's high key had the value (12, -inf), as the relevant
tenk1 regression test index does).  nbtree failed to take advantage of
the fact that we can descend straight to the leaf page that contains the
(12, -inf) high key.

nbtree now descends to the left directly whenever it encounters any
query with a similar boundary case.  This relies on the fact that the
right sibling page cannot possibly contain matching non-pivot tuples.
It only applies to backward scans that use the < strategy -- it does not
apply to backward scans that use the <= strategy (since there might be
matches on both sides of the leaf page high key with the <= strategy).

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wz=XPzM8HzaLPq278Vms420mVSHfgs9wi5tjFKHcapZCEw@mail.gmail.com
---
 src/include/access/nbtree.h               |  11 +-
 src/backend/access/nbtree/nbtpage.c       |  16 +-
 src/backend/access/nbtree/nbtsearch.c     | 223 +++++++++++-----------
 src/backend/access/nbtree/nbtutils.c      |  18 +-
 contrib/amcheck/verify_nbtree.c           |  16 +-
 src/test/regress/expected/btree_index.out |  48 +++++
 src/test/regress/sql/btree_index.sql      |  26 +++
 7 files changed, 218 insertions(+), 140 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964c..27eb3f093 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -763,13 +763,8 @@ typedef BTStackData *BTStack;
  * bit, but may not when inserting into an INCLUDE index (tuple header value
  * is affected by the NULL-ness of both key and non-key attributes).
  *
- * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
- * locate the first item >= scankey.  When nextkey is true, they will locate
- * the first item > scan key.
- *
- * pivotsearch is set to true by callers that want to re-find a leaf page
- * using a scankey built from a leaf page's high key.  Most callers set this
- * to false.
+ * See comments in _bt_first for an explanation of the nextkey and backward
+ * fields.
  *
  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
  * is set to NULL when index scan doesn't need to find a position for a
@@ -792,7 +787,7 @@ typedef struct BTScanInsertData
 	bool		allequalimage;
 	bool		anynullkeys;
 	bool		nextkey;
-	bool		pivotsearch;
+	bool		backward;		/* backward index scan? */
 	ItemPointer scantid;		/* tiebreaker for scankeys */
 	int			keysz;			/* Size of scankeys array */
 	ScanKeyData scankeys[INDEX_MAX_KEYS];	/* Must appear last */
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index b7660a459..86ada8a19 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1958,10 +1958,20 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
 					return;
 				}
 
-				/* we need an insertion scan key for the search, so build one */
+				/*
+				 * We need an insertion scan key for the search, so build one.
+				 *
+				 * _bt_search searches for the leaf page with the first
+				 * matching non-pivot tuple.  We intend to "search" for a
+				 * matching pivot tuple in the leaf page underdoing deletion.
+				 *
+				 * Compensate for the mismatch by specifying backward=true.
+				 * That makes _bt_search locate the page < the position where
+				 * matching non-pivot tuples go.  _bt_search handles pivot
+				 * "boundary cases" intelligently, so this does what we need.
+				 */
 				itup_key = _bt_mkscankey(rel, targetkey);
-				/* find the leftmost leaf page with matching pivot/high key */
-				itup_key->pivotsearch = true;
+				itup_key->backward = true;
 				stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
 				/* won't need a second lock or pin on leafbuf */
 				_bt_relbuf(rel, sleafbuf);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749..9ab8bea52 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -318,9 +318,12 @@ _bt_moveright(Relation rel,
  *	_bt_binsrch() -- Do a binary search for a key on a particular page.
  *
  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
- * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
- * particular, this means it is possible to return a value 1 greater than the
- * number of keys on the page, if the scankey is > all keys on the page.)
+ * key >= given scankey, or > scankey if nextkey is true for forward scans.
+ * _bt_binsrch() also "steps back" by one item/tuple on the leaf level in the
+ * case of backward scans.  (NOTE: this means it is possible to return a value
+ * that's 1 greater than the number of keys on the leaf page.  It also means
+ * that we can return an item 1 less than the first non-pivot tuple on any
+ * leaf page.)
  *
  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
  * of the last key < given scankey, or last key <= given scankey if nextkey
@@ -362,7 +365,7 @@ _bt_binsrch(Relation rel,
 	 * this covers two cases: the page is really empty (no keys), or it
 	 * contains only a high key.  The latter case is possible after vacuuming.
 	 * This can never happen on an internal page, however, since they are
-	 * never empty (an internal page must have children).
+	 * never empty (an internal page must have at least one child).
 	 */
 	if (unlikely(high < low))
 		return low;
@@ -402,10 +405,27 @@ _bt_binsrch(Relation rel,
 	 * past the last slot on the page.
 	 *
 	 * On a leaf page, we always return the first key >= scan key (resp. >
-	 * scan key), which could be the last slot + 1.
+	 * scan key) for forward scan callers, which could be the last slot + 1.
 	 */
 	if (P_ISLEAF(opaque))
+	{
+		/*
+		 * Backward scans should (by definition) be initially positioned after
+		 * the final non-pivot tuple from the equal-to-scankey grouping on the
+		 * leaf level (with forward scans it's before the first such tuple).
+		 * This explanation works independently of the scan's nextkey-ness
+		 * (actually, nextkey-ness makes the grouping greater-than-scankey,
+		 * but there's still a grouping to position ourselves relative to).
+		 *
+		 * The backward scan case is now "at the start of the wrong grouping".
+		 * It must now step back to "the end of the correct grouping".  This
+		 * leaves us at the correct final offnum from the correct leaf page.
+		 */
+		if (unlikely(key->backward))
+			return OffsetNumberPrev(low);
+
 		return low;
+	}
 
 	/*
 	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
@@ -767,7 +787,7 @@ _bt_compare(Relation rel,
 	if (key->scantid == NULL)
 	{
 		/*
-		 * Most searches have a scankey that is considered greater than a
+		 * Forward scans have a scankey that is considered greater than a
 		 * truncated pivot tuple if and when the scankey has equal values for
 		 * attributes up to and including the least significant untruncated
 		 * attribute in tuple.
@@ -782,25 +802,33 @@ _bt_compare(Relation rel,
 		 * doesn't have to descend left because it isn't interested in a match
 		 * that has a heap TID value of -inf.
 		 *
-		 * However, some searches (pivotsearch searches) actually require that
-		 * we descend left when this happens.  -inf is treated as a possible
-		 * match for omitted scankey attribute(s).  This is needed by page
-		 * deletion, which must re-find leaf pages that are targets for
-		 * deletion using their high keys.
+		 * We take a symmetrical approach in our handling of this boundary
+		 * case during backward scan searches (with the same goal in mind).
+		 * Here we treat -inf as a possible match for omitted scankey suffix
+		 * attribute(s).  The searcher isn't really interested in matches with
+		 * a heap TID value of -inf, though; it's actually interested in any
+		 * non-pivot tuple match to the immediate left of the ('foo', -inf)
+		 * pivot.  The first leaf page match returned by _bt_binsrch might be
+		 * ('fog', '(4223,7)'), for example.  In other words, backward scan
+		 * search doesn't have to descend right because it is only interested
+		 * in values strictly < 'foo'.
+		 *
+		 * Note: We can exploit invariants for the >= 'foo' and < 'foo' cases
+		 * like this because pivot tuples usually have -inf sentinel values.
+		 * This isn't possible for the <= 'foo' case because suffix truncation
+		 * will never make truncated key attributes have +inf sentinel values.
+		 * When nextkey and backward are both true, our behavior here cannot
+		 * and will not alter how _bt_search descends the tree.  In other
+		 * words, when scan is interested in <= 'foo' matches we must assume
+		 * that leaf pages on both sides of ('foo', -inf) pivot are relevant.
 		 *
 		 * Note: the heap TID part of the test ensures that scankey is being
-		 * compared to a pivot tuple with one or more truncated key
-		 * attributes.
-		 *
-		 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
-		 * left here, since they have no heap TID attribute (and cannot have
-		 * any -inf key values in any case, since truncation can only remove
-		 * non-key attributes).  !heapkeyspace searches must always be
-		 * prepared to deal with matches on both sides of the pivot once the
-		 * leaf level is reached.
+		 * compared to a pivot tuple with one or more truncated -inf key
+		 * attributes.  The heap TID attribute is the last key attribute in
+		 * every index, of course, but other than that it isn't special.
 		 */
-		if (key->heapkeyspace && !key->pivotsearch &&
-			key->keysz == ntupatts && heapTid == NULL)
+		if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
+			key->heapkeyspace)
 			return 1;
 
 		/* All provided scankey arguments found to be equal */
@@ -865,12 +893,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	BTStack		stack;
 	OffsetNumber offnum;
 	StrategyNumber strat;
-	bool		nextkey;
-	bool		goback;
 	BTScanInsertData inskey;
 	ScanKey		startKeys[INDEX_MAX_KEYS];
 	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
-	int			keysCount = 0;
+	int			keysz = 0;
 	int			i;
 	bool		status;
 	StrategyNumber strat_total;
@@ -1005,7 +1031,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 					 ScanDirectionIsBackward(dir)))
 				{
 					/* Yes, so build the key in notnullkeys[keysCount] */
-					chosen = &notnullkeys[keysCount];
+					chosen = &notnullkeys[keysz];
 					ScanKeyEntryInitialize(chosen,
 										   (SK_SEARCHNOTNULL | SK_ISNULL |
 											(impliesNN->sk_flags &
@@ -1026,7 +1052,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 				 */
 				if (chosen == NULL)
 					break;
-				startKeys[keysCount++] = chosen;
+				startKeys[keysz++] = chosen;
 
 				/*
 				 * Adjust strat_total, and quit if we have stored a > or <
@@ -1100,7 +1126,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * the tree.  Walk down that edge to the first or last key, and scan from
 	 * there.
 	 */
-	if (keysCount == 0)
+	if (keysz == 0)
 	{
 		bool		match;
 
@@ -1122,8 +1148,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * identified by startKeys[].  (Remaining insertion scankey fields are
 	 * initialized after initial-positioning strategy is finalized.)
 	 */
-	Assert(keysCount <= INDEX_MAX_KEYS);
-	for (i = 0; i < keysCount; i++)
+	Assert(keysz <= INDEX_MAX_KEYS);
+	for (i = 0; i < keysz; i++)
 	{
 		ScanKey		cur = startKeys[i];
 
@@ -1164,7 +1190,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * did use has to be treated as just a ">=" or "<=" condition, and
 			 * so we'd better adjust strat_total accordingly.
 			 */
-			if (i == keysCount - 1)
+			if (i == keysz - 1)
 			{
 				bool		used_all_subkeys = false;
 
@@ -1173,16 +1199,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 				{
 					subkey++;
 					Assert(subkey->sk_flags & SK_ROW_MEMBER);
-					if (subkey->sk_attno != keysCount + 1)
+					if (subkey->sk_attno != keysz + 1)
 						break;	/* out-of-sequence, can't use it */
 					if (subkey->sk_strategy != cur->sk_strategy)
 						break;	/* wrong direction, can't use it */
 					if (subkey->sk_flags & SK_ISNULL)
 						break;	/* can't use null keys */
-					Assert(keysCount < INDEX_MAX_KEYS);
-					memcpy(inskey.scankeys + keysCount, subkey,
+					Assert(keysz < INDEX_MAX_KEYS);
+					memcpy(inskey.scankeys + keysz, subkey,
 						   sizeof(ScanKeyData));
-					keysCount++;
+					keysz++;
 					if (subkey->sk_flags & SK_ROW_END)
 					{
 						used_all_subkeys = true;
@@ -1263,40 +1289,26 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/*----------
 	 * Examine the selected initial-positioning strategy to determine exactly
 	 * where we need to start the scan, and set flag variables to control the
-	 * code below.
-	 *
-	 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
-	 * item >= scan key.  If nextkey = true, they will locate the first
-	 * item > scan key.
-	 *
-	 * If goback = true, we will then step back one item, while if
-	 * goback = false, we will start the scan on the located item.
+	 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
+	 * page _bt_search returns).
 	 *----------
 	 */
+	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
+	inskey.anynullkeys = false; /* unused */
+	inskey.scantid = NULL;
+	inskey.keysz = keysz;
 	switch (strat_total)
 	{
 		case BTLessStrategyNumber:
 
-			/*
-			 * Find first item >= scankey, then back up one to arrive at last
-			 * item < scankey.  (Note: this positioning strategy is only used
-			 * for a backward scan, so that is always the correct starting
-			 * position.)
-			 */
-			nextkey = false;
-			goback = true;
+			inskey.nextkey = false;
+			inskey.backward = true;
 			break;
 
 		case BTLessEqualStrategyNumber:
 
-			/*
-			 * Find first item > scankey, then back up one to arrive at last
-			 * item <= scankey.  (Note: this positioning strategy is only used
-			 * for a backward scan, so that is always the correct starting
-			 * position.)
-			 */
-			nextkey = true;
-			goback = true;
+			inskey.nextkey = true;
+			inskey.backward = true;
 			break;
 
 		case BTEqualStrategyNumber:
@@ -1308,41 +1320,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			if (ScanDirectionIsBackward(dir))
 			{
 				/*
-				 * This is the same as the <= strategy.  We will check at the
-				 * end whether the found item is actually =.
+				 * This is the same as the <= strategy
 				 */
-				nextkey = true;
-				goback = true;
+				inskey.nextkey = true;
+				inskey.backward = true;
 			}
 			else
 			{
 				/*
-				 * This is the same as the >= strategy.  We will check at the
-				 * end whether the found item is actually =.
+				 * This is the same as the >= strategy
 				 */
-				nextkey = false;
-				goback = false;
+				inskey.nextkey = false;
+				inskey.backward = false;
 			}
 			break;
 
 		case BTGreaterEqualStrategyNumber:
 
 			/*
-			 * Find first item >= scankey.  (This is only used for forward
-			 * scans.)
+			 * Find first item >= scankey
 			 */
-			nextkey = false;
-			goback = false;
+			inskey.nextkey = false;
+			inskey.backward = false;
 			break;
 
 		case BTGreaterStrategyNumber:
 
 			/*
-			 * Find first item > scankey.  (This is only used for forward
-			 * scans.)
+			 * Find first item > scankey
 			 */
-			nextkey = true;
-			goback = false;
+			inskey.nextkey = true;
+			inskey.backward = false;
 			break;
 
 		default:
@@ -1351,18 +1359,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			return false;
 	}
 
-	/* Initialize remaining insertion scan key fields */
-	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
-	inskey.anynullkeys = false; /* unused */
-	inskey.nextkey = nextkey;
-	inskey.pivotsearch = false;
-	inskey.scantid = NULL;
-	inskey.keysz = keysCount;
-
 	/*
 	 * Use the manufactured insertion scan key to descend the tree and
 	 * position ourselves on the target leaf page.
 	 */
+	Assert(ScanDirectionIsBackward(dir) == inskey.backward);
 	stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
 
 	/* don't need to keep the stack around... */
@@ -1404,34 +1405,29 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 
 	/* position to the precise item on the page */
 	offnum = _bt_binsrch(rel, &inskey, buf);
-
-	/*
-	 * If nextkey = false, we are positioned at the first item >= scan key, or
-	 * possibly at the end of a page on which all the existing items are less
-	 * than the scan key and we know that everything on later pages is greater
-	 * than or equal to scan key.
-	 *
-	 * If nextkey = true, we are positioned at the first item > scan key, or
-	 * possibly at the end of a page on which all the existing items are less
-	 * than or equal to the scan key and we know that everything on later
-	 * pages is greater than scan key.
-	 *
-	 * The actually desired starting point is either this item or the prior
-	 * one, or in the end-of-page case it's the first item on the next page or
-	 * the last item on this page.  Adjust the starting offset if needed. (If
-	 * this results in an offset before the first item or after the last one,
-	 * _bt_readpage will report no items found, and then we'll step to the
-	 * next page as needed.)
-	 */
-	if (goback)
-		offnum = OffsetNumberPrev(offnum);
-
-	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
 
 	/*
 	 * Now load data from the first page of the scan.
+	 *
+	 * If inskey.nextkey = false and inskey.backward = false, offnum is
+	 * positioned at the first non-pivot tuple >= inskey.scankeys.
+	 *
+	 * If inskey.nextkey = false and inskey.backward = true, offnum is
+	 * positioned at the last non-pivot tuple < inskey.scankeys.
+	 *
+	 * If inskey.nextkey = true and inskey.backward = false, offnum is
+	 * positioned at the first non-pivot tuple > inskey.scankeys.
+	 *
+	 * If inskey.nextkey = true and inskey.backward = true, offnum is
+	 * positioned at the last non-pivot tuple <= inskey.scankeys.
+	 *
+	 * Note: if there isn't an existing non-pivot tuple that is fully equal to
+	 * inskey.scankeys according to _bt_compare, then offnum is the would-be
+	 * offset -- the offset that btinsert would have to use when inserting a
+	 * matching non-pivot tuple onto the page, backfilled with -inf values for
+	 * omitted suffix key attributes (+inf values in the backward scan case).
 	 */
 	if (!_bt_readpage(scan, dir, offnum))
 	{
@@ -1445,7 +1441,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	}
 	else
 	{
-		/* Drop the lock, and maybe the pin, on the current page */
+		/* We have at least one item to return as scan's first item */
 		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 	}
 
@@ -1522,6 +1518,11 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
  * that there can be no more matching tuples in the current scan direction.
  *
+ * Some callers pass us an offnum returned by _bt_binsrch, which can be an
+ * "invalid" offnum such as "maxoff+1" in certain corner cases.  We deal with
+ * that by only checking the keys from the offnum tuple after we're sure that
+ * it points to a valid non-pivot tuple (which is a good idea anyway).
+ *
  * In the case of a parallel scan, caller must have called _bt_parallel_seize
  * prior to calling this function; this function will invoke
  * _bt_parallel_release before returning.
@@ -1615,6 +1616,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			}
 
 			itup = (IndexTuple) PageGetItem(page, iid);
+			Assert(!BTreeTupleIsPivot(itup));
 
 			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
 			{
@@ -1723,6 +1725,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 				tuple_alive = true;
 
 			itup = (IndexTuple) PageGetItem(page, iid);
+			Assert(!BTreeTupleIsPivot(itup));
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
 										 &continuescan);
@@ -1971,7 +1974,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	if (!_bt_readnextpage(scan, blkno, dir))
 		return false;
 
-	/* Drop the lock, and maybe the pin, on the current page */
+	/* We have at least one item to return as scan's next item */
 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 
 	return true;
@@ -2172,7 +2175,7 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 	if (!_bt_readnextpage(scan, blkno, dir))
 		return false;
 
-	/* Drop the lock, and maybe the pin, on the current page */
+	/* We have at least one item to return as scan's next item */
 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 
 	return true;
@@ -2461,7 +2464,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	}
 	else
 	{
-		/* Drop the lock, and maybe the pin, on the current page */
+		/* We have at least one item to return as scan's first item */
 		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 	}
 
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4d..941594918 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -62,14 +62,6 @@ static int	_bt_keep_natts(Relation rel, IndexTuple lastleft,
  *		Build an insertion scan key that contains comparison data from itup
  *		as well as comparator routines appropriate to the key datatypes.
  *
- *		When itup is a non-pivot tuple, the returned insertion scan key is
- *		suitable for finding a place for it to go on the leaf level.  Pivot
- *		tuples can be used to re-find leaf page with matching high key, but
- *		then caller needs to set scan key's pivotsearch field to true.  This
- *		allows caller to search for a leaf page with a matching high key,
- *		which is usually to the left of the first leaf page a non-pivot match
- *		might appear on.
- *
  *		The result is intended for use with _bt_compare() and _bt_truncate().
  *		Callers that don't need to fill out the insertion scankey arguments
  *		(e.g. they use an ad-hoc comparison routine, or only need a scankey
@@ -120,8 +112,8 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 		key->allequalimage = false;
 	}
 	key->anynullkeys = false;	/* initial assumption */
-	key->nextkey = false;
-	key->pivotsearch = false;
+	key->nextkey = false;		/* usual case, required by btinsert */
+	key->backward = false;		/* usual case, required by btinsert */
 	key->keysz = Min(indnkeyatts, tupnatts);
 	key->scantid = key->heapkeyspace && itup ?
 		BTreeTupleGetHeapTID(itup) : NULL;
@@ -725,7 +717,11 @@ _bt_restore_array_keys(IndexScanDesc scan)
  * failure of either key is indeed enough to stop the scan.  (In general, when
  * inequality keys are present, the initial-positioning code only promises to
  * position before the first possible match, not exactly at the first match,
- * for a forward scan; or after the last match for a backward scan.)
+ * for a forward scan; or after the last match for a backward scan.  Note,
+ * however, that there are various optimizations in the initial-positioning
+ * code that maximize the likelihood that _bt_search will return the leaf page
+ * whose key space makes it precisely the first/last leaf page where a match
+ * might be found.)
  *
  * As a byproduct of this work, we can detect contradictory quals such
  * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index dbb83d80f..071998bc0 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -2779,7 +2779,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 	ItemId		itemid;
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->backward);
 
 	/* Verify line pointer before checking tuple */
 	itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
@@ -2841,7 +2841,7 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->backward);
 
 	cmp = _bt_compare(state->rel, key, state->target, upperbound);
 
@@ -2864,7 +2864,7 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->backward);
 
 	cmp = _bt_compare(state->rel, key, state->target, lowerbound);
 
@@ -2902,7 +2902,7 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
 	ItemId		itemid;
 	int32		cmp;
 
-	Assert(key->pivotsearch);
+	Assert(!key->nextkey && key->backward);
 
 	/* Verify line pointer before checking tuple */
 	itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
@@ -3128,9 +3128,9 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
  * For example, invariant_g_offset() might miss a cross-page invariant failure
  * on an internal level if the scankey built from the first item on the
  * target's right sibling page happened to be equal to (not greater than) the
- * last item on target page.  The !pivotsearch tiebreaker in _bt_compare()
- * might otherwise cause amcheck to assume (rather than actually verify) that
- * the scankey is greater.
+ * last item on target page.  The !backward tiebreaker in _bt_compare() might
+ * otherwise cause amcheck to assume (rather than actually verify) that the
+ * scankey is greater.
  */
 static inline BTScanInsert
 bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)
@@ -3138,7 +3138,7 @@ bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)
 	BTScanInsert skey;
 
 	skey = _bt_mkscankey(rel, itup);
-	skey->pivotsearch = true;
+	skey->backward = true;
 
 	return skey;
 }
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index 93ed5e8cc..4dcb9067a 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -142,6 +142,54 @@ SELECT b.*
   4500 | 2080851358
 (1 row)
 
+--
+-- Add coverage for optimization of backwards scan index descents
+--
+-- Here we expect _bt_search to descend straight to a leaf page containing a
+-- non-pivot tuple with the value '47', which comes last (after 11 similar
+-- non-pivot tuples).  Query execution should only need to visit a single
+-- leaf page here, even though we happen to be interested in a tuple that
+-- the nbtree implementation reasons about as a boundary case.
+--
+-- Test case relies on tenk1_hundred index having a leaf page whose high key
+-- is '(48, -inf)'.  We use a low cardinality index to make our test case less
+-- sensitive to implementation details that may change in the future.
+set enable_seqscan to false;
+set enable_indexscan to true;
+set enable_bitmapscan to false;
+explain (costs off)
+select hundred, twenty from tenk1 where hundred < 48 order by hundred desc limit 1;
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Limit
+   ->  Index Scan Backward using tenk1_hundred on tenk1
+         Index Cond: (hundred < 48)
+(3 rows)
+
+select hundred, twenty from tenk1 where hundred < 48 order by hundred desc limit 1;
+ hundred | twenty 
+---------+--------
+      47 |      7
+(1 row)
+
+-- This variant of the query need only return a single tuple located to the immediate
+-- right of the '(48, -inf)' high key.  It also only needs to scan one single
+-- leaf page (the right sibling of the page scanned by the last test case):
+explain (costs off)
+select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Limit
+   ->  Index Scan Backward using tenk1_hundred on tenk1
+         Index Cond: (hundred <= 48)
+(3 rows)
+
+select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
+ hundred | twenty 
+---------+--------
+      48 |      8
+(1 row)
+
 --
 -- Check correct optimization of LIKE (special index operator support)
 -- for both indexscan and bitmapscan cases
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
index 239f4a475..2075b45a6 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -110,6 +110,32 @@ SELECT b.*
    FROM bt_f8_heap b
    WHERE b.seqno = '4500'::float8;
 
+--
+-- Add coverage for optimization of backwards scan index descents
+--
+-- Here we expect _bt_search to descend straight to a leaf page containing a
+-- non-pivot tuple with the value '47', which comes last (after 11 similar
+-- non-pivot tuples).  Query execution should only need to visit a single
+-- leaf page here, even though we happen to be interested in a tuple that
+-- the nbtree implementation reasons about as a boundary case.
+--
+-- Test case relies on tenk1_hundred index having a leaf page whose high key
+-- is '(48, -inf)'.  We use a low cardinality index to make our test case less
+-- sensitive to implementation details that may change in the future.
+set enable_seqscan to false;
+set enable_indexscan to true;
+set enable_bitmapscan to false;
+explain (costs off)
+select hundred, twenty from tenk1 where hundred < 48 order by hundred desc limit 1;
+select hundred, twenty from tenk1 where hundred < 48 order by hundred desc limit 1;
+
+-- This variant of the query need only return a single tuple located to the immediate
+-- right of the '(48, -inf)' high key.  It also only needs to scan one single
+-- leaf page (the right sibling of the page scanned by the last test case):
+explain (costs off)
+select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
+select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
+
 --
 -- Check correct optimization of LIKE (special index operator support)
 -- for both indexscan and bitmapscan cases
-- 
2.40.1

