From ceeebf4636d2c4b1b570ba1333632940acaf5945 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Tue, 6 Aug 2024 22:05:51 +0200
Subject: [PATCH v16] btree: Implement dynamic prefix truncation

Because tuples in the btree are ordered, if some prefix of the
scan attributes on both sides of the compared tuple are equal
to the scankey, then the current tuple that is being compared
must also contain the same prefix that equals the scankey.

By applying this knowledge we can save a lot of attribute
compares while we descend a tree: We only have to establish
that a tuple left and right of the midpoint during binary
search share a prefix, and then we can use that to skip
comparing that prefix for further compares in the binary
search phase of index descent.

Due to concurrent page splits and deletes, the key range of a
page may move while we travel down the btree.  This means we
can't reuse left- and right-key prefixes we established on the
parent page while we descend the btree: the data on the page
may not match the parent page's separator keys anymore
(see [0]).

So, we can't always descend the btree while keeping the
discovered prefixes.  However, on each page we only see a
snapshot of the page, so we can use dynamic prefix truncation
at the page level.  This still improves the worst-case number
of compare operations during btree descent from  depth *
ceil(log(tuples_per_page)) * natts  to  depth *
(ceil(log(tuples_per_page)) + 3 * natts).

In passing, move _bt_moveright from btree.h to nbtsearch.c:
there was no user of the function left outside nbtsearch.c,
and there doesn't seem to be a use case for external usage.
This also allows the compiler to optimize a bit more than it
could previously.

[0] https://www.postgresql.org/message-id/CAH2-Wzn_NAyK4pR0HRWO0StwHmxjP5qyu+X8vppt030XpqrO6w@mail.gmail.com
---
 contrib/amcheck/verify_nbtree.c       |  24 ++++--
 src/backend/access/nbtree/README      |  37 ++++++++++
 src/backend/access/nbtree/nbtinsert.c |  41 ++++++++---
 src/backend/access/nbtree/nbtsearch.c | 102 +++++++++++++++++++++++---
 src/include/access/nbtree.h           |   9 +--
 5 files changed, 178 insertions(+), 35 deletions(-)

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 7cfb136763..c7dc6725a4 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -1798,6 +1798,8 @@ bt_target_page_check(BtreeCheckState *state)
 		if (state->checkunique && state->indexinfo->ii_Unique &&
 			P_ISLEAF(topaque) && OffsetNumberNext(offset) <= max)
 		{
+			int		sprefix = 0;
+
 			/* Save current scankey tid */
 			scantid = skey->scantid;
 
@@ -1817,7 +1819,7 @@ bt_target_page_check(BtreeCheckState *state)
 			 * bt_entry_unique_check() call if it was postponed.
 			 */
 			if (_bt_compare(state->rel, skey, state->target,
-							OffsetNumberNext(offset)) != 0 || skey->anynullkeys)
+							OffsetNumberNext(offset), &sprefix) != 0 || skey->anynullkeys)
 			{
 				lVis.blkno = InvalidBlockNumber;
 				lVis.offset = InvalidOffsetNumber;
@@ -1900,6 +1902,7 @@ bt_target_page_check(BtreeCheckState *state)
 				rightkey && P_ISLEAF(topaque) && !P_RIGHTMOST(topaque))
 			{
 				BlockNumber rightblock_number = topaque->btpo_next;
+				int		sprefix = 0;
 
 				elog(DEBUG2, "check cross page unique condition");
 
@@ -1911,7 +1914,7 @@ bt_target_page_check(BtreeCheckState *state)
 				rightkey->scantid = NULL;
 
 				/* The first key on the next page is the same */
-				if (_bt_compare(state->rel, rightkey, state->target, max) == 0 &&
+				if (_bt_compare(state->rel, rightkey, state->target, max, &sprefix) == 0 &&
 					!rightkey->anynullkeys)
 				{
 					Page		rightpage;
@@ -3174,6 +3177,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		BTInsertStateData insertstate;
 		OffsetNumber offnum;
 		Page		page;
+		int			sprefix = 0;
 
 		insertstate.itup = itup;
 		insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
@@ -3183,13 +3187,13 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		insertstate.buf = lbuf;
 
 		/* Get matching tuple on leaf page */
-		offnum = _bt_binsrch_insert(state->rel, &insertstate);
+		offnum = _bt_binsrch_insert(state->rel, &insertstate, 1);
 		/* Compare first >= matching item on leaf page, if any */
 		page = BufferGetPage(lbuf);
 		/* Should match on first heap TID when tuple has a posting list */
 		if (offnum <= PageGetMaxOffsetNumber(page) &&
 			insertstate.postingoff <= 0 &&
-			_bt_compare(state->rel, key, page, offnum) == 0)
+			_bt_compare(state->rel, key, page, offnum, &sprefix) == 0)
 			exists = true;
 		_bt_relbuf(state->rel, lbuf);
 	}
@@ -3251,6 +3255,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	ItemId		itemid;
 	int32		cmp;
+	int			sprefix = 0;
 
 	Assert(!key->nextkey && key->backward);
 
@@ -3261,7 +3266,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 	if (!key->heapkeyspace)
 		return invariant_leq_offset(state, key, upperbound);
 
-	cmp = _bt_compare(state->rel, key, state->target, upperbound);
+	cmp = _bt_compare(state->rel, key, state->target, upperbound, &sprefix);
 
 	/*
 	 * _bt_compare() is capable of determining that a scankey with a
@@ -3313,10 +3318,11 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
 					 OffsetNumber upperbound)
 {
 	int32		cmp;
+	int			sprefix = 0;
 
 	Assert(!key->nextkey && key->backward);
 
-	cmp = _bt_compare(state->rel, key, state->target, upperbound);
+	cmp = _bt_compare(state->rel, key, state->target, upperbound, &sprefix);
 
 	return cmp <= 0;
 }
@@ -3336,10 +3342,11 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
 				   OffsetNumber lowerbound)
 {
 	int32		cmp;
+	int			sprefix = 0;
 
 	Assert(!key->nextkey && key->backward);
 
-	cmp = _bt_compare(state->rel, key, state->target, lowerbound);
+	cmp = _bt_compare(state->rel, key, state->target, lowerbound, &sprefix);
 
 	/* pg_upgrade'd indexes may legally have equal sibling tuples */
 	if (!key->heapkeyspace)
@@ -3374,13 +3381,14 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	ItemId		itemid;
 	int32		cmp;
+	int			sprefix = 0;
 
 	Assert(!key->nextkey && key->backward);
 
 	/* Verify line pointer before checking tuple */
 	itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
 								  upperbound);
-	cmp = _bt_compare(state->rel, key, nontarget, upperbound);
+	cmp = _bt_compare(state->rel, key, nontarget, upperbound, &sprefix);
 
 	/* pg_upgrade'd indexes may legally have equal sibling tuples */
 	if (!key->heapkeyspace)
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 52e646c7f7..9966d3b051 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -901,6 +901,43 @@ large groups of duplicates, maximizing space utilization.  Note also that
 deduplication more efficient.  Deduplication can be performed infrequently,
 without merging together existing posting list tuples too often.
 
+Notes about dynamic prefix truncation
+-------------------------------------
+
+When we do a binary search on a sorted set (such as a BTree), we know that a
+tuple will be smaller than its right neighbour, and larger than its left
+neighbour.  If we know that both the left and right neighbour share a prefix
+with the scan key, we deduce that the current tuple must also share this
+prefix with the scan key, which allows us to skip this shared prefix during
+tuple compare operations when we descend the btree.
+
+However, the BTree is not a static sorted list: It is a page-based dynamic
+data structure which changes over time.  Because of this, we cannot just reuse
+the established prefix from the parent page: concurrent page deletions may
+have moved in new tuples from the keyspace left of the left separator of this
+page's downlink on the parent page, and page splits may have moved tuples
+that were previously in this page's indicated keyspace away from the page.
+This means that we have to re-establish the prefix that is shared with the
+scan key at every level of the btree: the page we're about to visit could
+contain only values that would sort left of the parent page's left downlink,
+i.e. outside the range we've built a prefix for.
+
+NOTE: This optimization is most effective on indexes where a prefix of key
+columns has many duplicate values.  Compare operations on index tuples stop at
+the first non-equal attribute, so it is quite unlikely that an index on
+(unique_values, 1, 1, 1) will ever compare the latter columns during index
+descent(_bt_search), while one on (1, 1, 1, unique_values) will essentially
+always have to compare all attributes to hit an attribute that isn't equal to
+that of the scan key's (assuming the scan key is in the index's keyspace).
+The dynamic prefix truncation allows PostgreSQL to skip most compare
+operations on the (1, 1, 1 ...) prefix, but it cannot improve the
+(unique, ...) case because that has no "wasteful" prefix that we can skip.
+
+With the above optimizations, dynamic prefix truncation improves the worst
+case complexity of indexing from O(tree_height * natts * log(tups_per_page))
+to O(tree_height * (3*natts + log(tups_per_page))) attribute compare
+operations, which can improve performance significantly.
+
 Notes about deduplication
 -------------------------
 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 7e8902e48c..62c87e72d8 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -328,6 +328,7 @@ _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
 		{
 			Page		page;
 			BTPageOpaque opaque;
+			int			hikey_sprefix = 0;
 
 			_bt_checkpage(rel, insertstate->buf);
 			page = BufferGetPage(insertstate->buf);
@@ -346,7 +347,8 @@ _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
 				!P_IGNORE(opaque) &&
 				PageGetFreeSpace(page) > insertstate->itemsz &&
 				PageGetMaxOffsetNumber(page) >= P_HIKEY &&
-				_bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+				_bt_compare(rel, insertstate->itup_key, page, P_HIKEY,
+							&hikey_sprefix) > 0)
 			{
 				/*
 				 * Caller can use the fastpath optimization because cached
@@ -440,7 +442,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	 * in the fastpath below, but also in the _bt_findinsertloc() call later.
 	 */
 	Assert(!insertstate->bounds_valid);
-	offset = _bt_binsrch_insert(rel, insertstate);
+	offset = _bt_binsrch_insert(rel, insertstate, 0);
 
 	/*
 	 * Scan over all equal tuples, looking for live conflicts.
@@ -450,6 +452,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	Assert(itup_key->scantid == NULL);
 	for (;;)
 	{
+		int			sprefix = 0;
+
 		/*
 		 * Each iteration of the loop processes one heap TID, not one index
 		 * tuple.  Current offset number for page isn't usually advanced on
@@ -485,7 +489,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				Assert(insertstate->bounds_valid);
 				Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
 				Assert(insertstate->low <= insertstate->stricthigh);
-				Assert(_bt_compare(rel, itup_key, page, offset) < 0);
+				Assert(_bt_compare(rel, itup_key, page, offset, &sprefix) < 0);
 				break;
 			}
 
@@ -510,7 +514,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				if (!inposting)
 				{
 					/* Plain tuple, or first TID in posting list tuple */
-					if (_bt_compare(rel, itup_key, page, offset) != 0)
+					if (_bt_compare(rel, itup_key, page, offset, &sprefix) != 0)
 						break;	/* we're past all the equal tuples */
 
 					/* Advanced curitup */
@@ -720,11 +724,12 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 		else
 		{
 			int			highkeycmp;
+			sprefix = 0;
 
 			/* If scankey == hikey we gotta check the next page too */
 			if (P_RIGHTMOST(opaque))
 				break;
-			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
+			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY, &sprefix);
 			Assert(highkeycmp <= 0);
 			if (highkeycmp != 0)
 				break;
@@ -823,6 +828,7 @@ _bt_findinsertloc(Relation rel,
 	Page		page = BufferGetPage(insertstate->buf);
 	BTPageOpaque opaque;
 	OffsetNumber newitemoff;
+	int			hikeysprefix = 0;
 
 	opaque = BTPageGetOpaque(page);
 
@@ -867,6 +873,8 @@ _bt_findinsertloc(Relation rel,
 
 			for (;;)
 			{
+				int		sprefix = 0;
+
 				/*
 				 * Does the new tuple belong on this page?
 				 *
@@ -884,8 +892,11 @@ _bt_findinsertloc(Relation rel,
 
 				/* Test '<=', not '!=', since scantid is set now */
 				if (P_RIGHTMOST(opaque) ||
-					_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+					_bt_compare(rel, itup_key, page, P_HIKEY, &sprefix) <= 0)
+				{
+					hikeysprefix = sprefix;
 					break;
+				}
 
 				_bt_stepright(rel, heapRel, insertstate, stack);
 				/* Update local state after stepping right */
@@ -937,6 +948,8 @@ _bt_findinsertloc(Relation rel,
 		 */
 		while (PageGetFreeSpace(page) < insertstate->itemsz)
 		{
+			int		sprefix = 0;
+
 			/*
 			 * Before considering moving right, see if we can obtain enough
 			 * space by erasing LP_DEAD items
@@ -967,9 +980,12 @@ _bt_findinsertloc(Relation rel,
 				break;
 
 			if (P_RIGHTMOST(opaque) ||
-				_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+				_bt_compare(rel, itup_key, page, P_HIKEY, &sprefix) != 0 ||
 				pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
+			{
+				hikeysprefix = sprefix;
 				break;
+			}
 
 			_bt_stepright(rel, heapRel, insertstate, stack);
 			/* Update local state after stepping right */
@@ -982,10 +998,13 @@ _bt_findinsertloc(Relation rel,
 	 * We should now be on the correct page.  Find the offset within the page
 	 * for the new tuple. (Possibly reusing earlier search bounds.)
 	 */
-	Assert(P_RIGHTMOST(opaque) ||
-		   _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
+	{
+		int			sprefix PG_USED_FOR_ASSERTS_ONLY = 0;
+		Assert(P_RIGHTMOST(opaque) ||
+			   _bt_compare(rel, itup_key, page, P_HIKEY, &sprefix) <= 0);
+	}
 
-	newitemoff = _bt_binsrch_insert(rel, insertstate);
+	newitemoff = _bt_binsrch_insert(rel, insertstate, hikeysprefix);
 
 	if (insertstate->postingoff == -1)
 	{
@@ -1004,7 +1023,7 @@ _bt_findinsertloc(Relation rel,
 		 */
 		Assert(!insertstate->bounds_valid);
 		insertstate->postingoff = 0;
-		newitemoff = _bt_binsrch_insert(rel, insertstate);
+		newitemoff = _bt_binsrch_insert(rel, insertstate, hikeysprefix);
 		Assert(insertstate->postingoff == 0);
 	}
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 57bcfc7e4c..1abb1f8231 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -27,6 +27,9 @@
 
 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
+							Buffer buf, bool forupdate, BTStack stack,
+							int access, int *pagehikeysprefix);
 static int	_bt_binsrch_posting(BTScanInsert key, Page page,
 								OffsetNumber offnum);
 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
@@ -98,6 +101,7 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 {
 	BTStack		stack_in = NULL;
 	int			page_access = BT_READ;
+	int			hikeysprefix = 0;
 
 	/* heaprel must be set whenever _bt_allocbuf is reachable */
 	Assert(access == BT_READ || access == BT_WRITE);
@@ -134,7 +138,7 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 		 * opportunity to finish splits of internal pages too.
 		 */
 		*bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
-							  stack_in, page_access);
+							  stack_in, page_access, &hikeysprefix);
 
 		/* if this is a leaf page, we're done */
 		page = BufferGetPage(*bufP);
@@ -185,6 +189,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 	 */
 	if (access == BT_WRITE && page_access == BT_READ)
 	{
+		hikeysprefix = 0;
+
 		/* trade in our read lock for a write lock */
 		_bt_unlockbuf(rel, *bufP);
 		_bt_lockbuf(rel, *bufP, BT_WRITE);
@@ -194,7 +200,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
 		 * but before we acquired a write lock.  If it has, we may need to
 		 * move right to its new sibling.  Do that.
 		 */
-		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
+		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE,
+							  &hikeysprefix);
 	}
 
 	return stack_in;
@@ -230,6 +237,9 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
  * On entry, we have the buffer pinned and a lock of the type specified by
  * 'access'.  If we move right, we release the buffer and lock and acquire
  * the same on the right sibling.  Return value is the buffer we stop at.
+ *
+ * On exit, we've updated *pagehikeysprefix with the shared prefix that
+ * the returned buffer's page's highkey shares with the search key.
  */
 Buffer
 _bt_moveright(Relation rel,
@@ -238,7 +248,8 @@ _bt_moveright(Relation rel,
 			  Buffer buf,
 			  bool forupdate,
 			  BTStack stack,
-			  int access)
+			  int access,
+			  int *pagehikeysprefix)
 {
 	Page		page;
 	BTPageOpaque opaque;
@@ -262,16 +273,34 @@ _bt_moveright(Relation rel,
 	 *
 	 * We also have to move right if we followed a link that brought us to a
 	 * dead page.
+	 *
+	 * Note: It is important that *pagehikeysprefix is set to an accurate
+	 * value at return.  Rightmost pages have no prefix, thus set it to 0.  In
+	 * all other cases, it must be updated with the prefix result of this
+	 * page's P_HIKEY's full _bt_compare.
 	 */
 	cmpval = key->nextkey ? 0 : 1;
 
 	for (;;)
 	{
+		/*
+		 * We explicitly don't reuse the prefix argument of this function,
+		 * as we may need to move multiple times, and we don't want to
+		 * overwrite the value stored in *prefix if we could reuse it.
+		 * Additionally, its value will be useless for any of our own
+		 * _bt_compare calls due to potential concurrent page splits. 
+		 */
+		int			hikeysprefix = 0;
+
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 
 		if (P_RIGHTMOST(opaque))
+		{
+			/* set the compare column when breaking out of the loop */
+			*pagehikeysprefix = 0;
 			break;
+		}
 
 		/*
 		 * Finish any incomplete splits we encounter along the way.
@@ -297,14 +326,19 @@ _bt_moveright(Relation rel,
 			continue;
 		}
 
-		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
+		if (P_IGNORE(opaque) ||
+			_bt_compare(rel, key, page, P_HIKEY, &hikeysprefix) >= cmpval)
 		{
-			/* step right one page */
+			/* set the compare column when breaking out of the loop */
 			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
 			continue;
 		}
 		else
+		{
+			/* make sure to communicate the established prefix */
+			*pagehikeysprefix = hikeysprefix;
 			break;
+		}
 	}
 
 	if (P_IGNORE(opaque))
@@ -344,6 +378,13 @@ _bt_binsrch(Relation rel,
 				high;
 	int32		result,
 				cmpval;
+	/*
+	 * Prefix bounds, for the high/low offset's compare columns.
+	 * "highkeyprefix" is the value for this page's high key (if any) or 1
+	 * (no established shared prefix)
+	 */
+	AttrNumber	highkeyprefix = 0,
+				lowkeyprefix = 0;
 
 	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
@@ -376,6 +417,10 @@ _bt_binsrch(Relation rel,
 	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
 	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
 	 *
+	 * We maintain highkeyprefix and lowkeyprefix to keep track of prefixes
+	 * that tuples share with the scan key, potentially allowing us to skip a
+	 * prefix in the midpoint comparison.
+	 *
 	 * We can fall out when high == low.
 	 */
 	high++;						/* establish the loop invariant for high */
@@ -385,15 +430,22 @@ _bt_binsrch(Relation rel,
 	while (high > low)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
+		int			prefix = Min(highkeyprefix, lowkeyprefix); /* update prefix bounds */
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare(rel, key, page, mid, &prefix);
 
 		if (result >= cmpval)
+		{
 			low = mid + 1;
+			lowkeyprefix = prefix;
+		}
 		else
+		{
 			high = mid;
+			highkeyprefix = prefix;
+		}
 	}
 
 	/*
@@ -465,7 +517,8 @@ _bt_binsrch(Relation rel,
  * list split).
  */
 OffsetNumber
-_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
+_bt_binsrch_insert(Relation rel, BTInsertState insertstate,
+				   int highkeyprefix)
 {
 	BTScanInsert key = insertstate->itup_key;
 	Page		page;
@@ -475,6 +528,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 				stricthigh;
 	int32		result,
 				cmpval;
+	int			lowkeyprefix = 0;
 
 	page = BufferGetPage(insertstate->buf);
 	opaque = BTPageGetOpaque(page);
@@ -525,16 +579,22 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 	while (high > low)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
+		int			prefix = Min(highkeyprefix, lowkeyprefix);
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare(rel, key, page, mid, &prefix);
 
 		if (result >= cmpval)
+		{
 			low = mid + 1;
+			lowkeyprefix = prefix;
+		}
 		else
 		{
 			high = mid;
+			highkeyprefix = prefix;
+
 			if (result != 0)
 				stricthigh = high;
 		}
@@ -669,6 +729,13 @@ _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
  * matching TID in the posting tuple, which caller must handle
  * themselves (e.g., by splitting the posting list tuple).
  *
+ * NOTE: The "sprefix" argument must refer to the number of attributes of
+ * the index tuple of which the caller knows that are equal to the scankey's
+ * attributes. This means 0 for "no known matching attributes", up to the
+ * number of key attributes if the caller knows that all key attributes of
+ * the index tuple match those of the scan key.
+ * See also backend/access/nbtree/README for details.
+ *
  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
  * "minus infinity": this routine will always claim it is less than the
  * scankey.  The actual key value stored is explicitly truncated to 0
@@ -682,7 +749,8 @@ int32
 _bt_compare(Relation rel,
 			BTScanInsert key,
 			Page page,
-			OffsetNumber offnum)
+			OffsetNumber offnum,
+			int *sprefix)
 {
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	BTPageOpaque opaque = BTPageGetOpaque(page);
@@ -722,8 +790,11 @@ _bt_compare(Relation rel,
 	ncmpkey = Min(ntupatts, key->keysz);
 	Assert(key->heapkeyspace || ncmpkey == key->keysz);
 	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
-	scankey = key->scankeys;
-	for (int i = 1; i <= ncmpkey; i++)
+
+	scankey = key->scankeys + *sprefix;
+
+	/* get the first non-equal attribute number */
+	for (int i = *sprefix + 1; i <= ncmpkey; i++)
 	{
 		Datum		datum;
 		bool		isNull;
@@ -767,11 +838,20 @@ _bt_compare(Relation rel,
 
 		/* if the keys are unequal, return the difference */
 		if (result != 0)
+		{
+			*sprefix = i - 1;
 			return result;
+		}
 
 		scankey++;
 	}
 
+	/*
+	 * All tuple attributes are equal to the scan key, only later attributes
+	 * could potentially not equal the scan key.
+	 */
+	*sprefix = ntupatts;
+
 	/*
 	 * All non-truncated attributes (other than heap TID) were found to be
 	 * equal.  Treat truncated attributes as minus infinity when scankey has a
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7493043348..95a5b46b3e 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1272,11 +1272,10 @@ extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
  */
 extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key,
 						  Buffer *bufP, int access);
-extern Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
-							Buffer buf, bool forupdate, BTStack stack,
-							int access);
-extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
-extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
+extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate,
+									   int highkeyprefix);
+extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page,
+						 OffsetNumber offnum, int *sprefix);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
-- 
2.40.1

