From a3aad32e679d75ec98f05ea7fa24ebc6109d434e Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Fri, 16 Sep 2022 17:38:32 +0200
Subject: [PATCH v7] Implement dynamic prefix compression in nbtree

Because tuples are ordered on the page, if some prefix of the
scan attributes on both sides of the compared tuple are equal
to the scankey, then the current tuple that is being compared
must also have those prefixing attributes that equal the
scankey.

We cannot generally propagate this information to _binsrch on
lower pages, as this downstream page may have concurrently split
and/or have merged with its deleted left neighbour (see [0]),
which moves the keyspace of the linked page. We thus can only
trust the current state of this current page for this optimization,
which means we must validate this state each time we open the page.

Although this limits the overall applicability of the
performance improvement, it still allows for a nice performance
improvement in most cases where initial columns have many
duplicate values and a compare function that is not cheap.

As an exception to the above rule, most of the time a page's
highkey is equal to the right separator on the parent page due to
how btree splits are done. By storing this right separator from
the parent page and then validating that the highkey of the child
page contains the exact same data, we can restore the right prefix
bound without having to call the relatively expensive _bt_compare.

In the worst-case scenario of a concurrent page split, we'd still
have to validate the full key, but that doesn't happen very often,
so we pay only one branch and a memcmp() to validate whether the
child page's highkey still matches the parent's right separator.
---
 contrib/amcheck/verify_nbtree.c       |  17 ++--
 src/backend/access/nbtree/README      |  42 +++++++++
 src/backend/access/nbtree/nbtinsert.c |  34 +++++---
 src/backend/access/nbtree/nbtsearch.c | 119 +++++++++++++++++++++++---
 src/include/access/nbtree.h           |  10 ++-
 5 files changed, 188 insertions(+), 34 deletions(-)

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 9021d156eb..041ff5464e 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -2701,6 +2701,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		BTInsertStateData insertstate;
 		OffsetNumber offnum;
 		Page		page;
+		AttrNumber	cmpcol = 1;
 
 		insertstate.itup = itup;
 		insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
@@ -2710,13 +2711,13 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
 		insertstate.buf = lbuf;
 
 		/* Get matching tuple on leaf page */
-		offnum = _bt_binsrch_insert(state->rel, &insertstate);
+		offnum = _bt_binsrch_insert(state->rel, &insertstate, 1);
 		/* Compare first >= matching item on leaf page, if any */
 		page = BufferGetPage(lbuf);
 		/* Should match on first heap TID when tuple has a posting list */
 		if (offnum <= PageGetMaxOffsetNumber(page) &&
 			insertstate.postingoff <= 0 &&
-			_bt_compare(state->rel, key, page, offnum) == 0)
+			_bt_compare(state->rel, key, page, offnum, &cmpcol) == 0)
 			exists = true;
 		_bt_relbuf(state->rel, lbuf);
 	}
@@ -2778,6 +2779,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	ItemId		itemid;
 	int32		cmp;
+	AttrNumber	cmpcol = 1;
 
 	Assert(key->pivotsearch);
 
@@ -2788,7 +2790,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
 	if (!key->heapkeyspace)
 		return invariant_leq_offset(state, key, upperbound);
 
-	cmp = _bt_compare(state->rel, key, state->target, upperbound);
+	cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol);
 
 	/*
 	 * _bt_compare() is capable of determining that a scankey with a
@@ -2840,10 +2842,11 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
 					 OffsetNumber upperbound)
 {
 	int32		cmp;
+	AttrNumber	cmpcol = 1;
 
 	Assert(key->pivotsearch);
 
-	cmp = _bt_compare(state->rel, key, state->target, upperbound);
+	cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol);
 
 	return cmp <= 0;
 }
@@ -2863,10 +2866,11 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
 				   OffsetNumber lowerbound)
 {
 	int32		cmp;
+	AttrNumber	cmpcol = 1;
 
 	Assert(key->pivotsearch);
 
-	cmp = _bt_compare(state->rel, key, state->target, lowerbound);
+	cmp = _bt_compare(state->rel, key, state->target, lowerbound, &cmpcol);
 
 	/* pg_upgrade'd indexes may legally have equal sibling tuples */
 	if (!key->heapkeyspace)
@@ -2901,13 +2905,14 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
 {
 	ItemId		itemid;
 	int32		cmp;
+	AttrNumber	cmpcol = 1;
 
 	Assert(key->pivotsearch);
 
 	/* Verify line pointer before checking tuple */
 	itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
 								  upperbound);
-	cmp = _bt_compare(state->rel, key, nontarget, upperbound);
+	cmp = _bt_compare(state->rel, key, nontarget, upperbound, &cmpcol);
 
 	/* pg_upgrade'd indexes may legally have equal sibling tuples */
 	if (!key->heapkeyspace)
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 5529afc1fe..5df29a692e 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -901,6 +901,48 @@ large groups of duplicates, maximizing space utilization.  Note also that
 deduplication more efficient.  Deduplication can be performed infrequently,
 without merging together existing posting list tuples too often.
 
+Notes about dynamic prefix truncation
+-------------------------------------
+
+Because NBTrees have a sorted keyspace, when we have determined that some
+prefixing columns of tuples on both sides of the tuple that is being
+compared are equal to the scankey, then the current tuple must also share
+this prefix with the scankey. This allows us to skip comparing those columns,
+saving the indirect function calls in the compare operation.
+
+We can only use this constraint if we have proven this information while we
+hold a pin on the page, so this is only useful on the page level: Concurrent
+page deletions and splits may have moved the keyspace of the page referenced
+by a parent page to the right. If we re-used high- and low-column-prefixes,
+we would not be able to detect a change of keyspace from e.g. [2,3) to [1,2),
+and subsequently return invalid results. This race condition can only be
+prevented by re-establishing the prefix-equal-columns for each page.
+
+There is positive news, though: A page split will put a binary copy of the
+page's highkey in the parent page. This means that we usually can reuse
+the compare result of the parent page's downlink's right sibling when we
+discover that their representations are binary-equal. In general this will
+be the case, as only during concurrent page splits and deletes may the
+downlink not point to the page with the correct highkey bound (_bt_moveright
+only rarely actually moves right).
+
+To implement this, we copy the downlink's right differentiator key into a
+temporary buffer, which is then compared against the child page's highkey.
+If they match, we reuse the compare result (plus prefix) we had for it from
+the parent page; if not, we need to do a full _bt_compare. Because memcpy +
+memcmp is cheap compared to _bt_compare, and because it's quite unlikely
+that we guess wrong, this speeds up our _bt_moveright code (at the cost of
+some stack memory in _bt_search and some overhead on a wrong prediction).
+
+Now that we have prefix bounds on the highest value of a page, the
+_bt_binsrch procedure will use this result as a rightmost prefix compare,
+and for each step in the binary search (that does not compare less than the
+insert key) improve the equal-prefix bounds.
+
+Using the above optimization, we now (on average) only need 2 full key
+compares per page, as opposed to ceil(log2(ntupsperpage)) + 1, a significant
+improvement.
+
 Notes about deduplication
 -------------------------
 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index f6f4af8bfe..36e2d8ffed 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -326,6 +326,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
 		{
 			Page		page;
 			BTPageOpaque opaque;
+			AttrNumber	cmpcol = 1;
 
 			_bt_checkpage(rel, insertstate->buf);
 			page = BufferGetPage(insertstate->buf);
@@ -344,7 +345,8 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
 				!P_IGNORE(opaque) &&
 				PageGetFreeSpace(page) > insertstate->itemsz &&
 				PageGetMaxOffsetNumber(page) >= P_HIKEY &&
-				_bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+				_bt_compare(rel, insertstate->itup_key, page, P_HIKEY,
+							&cmpcol) > 0)
 			{
 				/*
 				 * Caller can use the fastpath optimization because cached
@@ -438,7 +440,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	 * in the fastpath below, but also in the _bt_findinsertloc() call later.
 	 */
 	Assert(!insertstate->bounds_valid);
-	offset = _bt_binsrch_insert(rel, insertstate);
+	offset = _bt_binsrch_insert(rel, insertstate, 1);
 
 	/*
 	 * Scan over all equal tuples, looking for live conflicts.
@@ -448,6 +450,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	Assert(itup_key->scantid == NULL);
 	for (;;)
 	{
+		AttrNumber	cmpcol = 1;
+
 		/*
 		 * Each iteration of the loop processes one heap TID, not one index
 		 * tuple.  Current offset number for page isn't usually advanced on
@@ -483,7 +487,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				Assert(insertstate->bounds_valid);
 				Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
 				Assert(insertstate->low <= insertstate->stricthigh);
-				Assert(_bt_compare(rel, itup_key, page, offset) < 0);
+				Assert(_bt_compare(rel, itup_key, page, offset, &cmpcol) < 0);
 				break;
 			}
 
@@ -508,7 +512,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				if (!inposting)
 				{
 					/* Plain tuple, or first TID in posting list tuple */
-					if (_bt_compare(rel, itup_key, page, offset) != 0)
+					if (_bt_compare(rel, itup_key, page, offset, &cmpcol) != 0)
 						break;	/* we're past all the equal tuples */
 
 					/* Advanced curitup */
@@ -718,11 +722,12 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 		else
 		{
 			int			highkeycmp;
+			cmpcol = 1;
 
 			/* If scankey == hikey we gotta check the next page too */
 			if (P_RIGHTMOST(opaque))
 				break;
-			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
+			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol);
 			Assert(highkeycmp <= 0);
 			if (highkeycmp != 0)
 				break;
@@ -865,6 +870,8 @@ _bt_findinsertloc(Relation rel,
 
 			for (;;)
 			{
+				AttrNumber	cmpcol = 1;
+
 				/*
 				 * Does the new tuple belong on this page?
 				 *
@@ -882,7 +889,7 @@ _bt_findinsertloc(Relation rel,
 
 				/* Test '<=', not '!=', since scantid is set now */
 				if (P_RIGHTMOST(opaque) ||
-					_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+					_bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0)
 					break;
 
 				_bt_stepright(rel, insertstate, stack);
@@ -935,6 +942,8 @@ _bt_findinsertloc(Relation rel,
 		 */
 		while (PageGetFreeSpace(page) < insertstate->itemsz)
 		{
+			AttrNumber	cmpcol = 1;
+
 			/*
 			 * Before considering moving right, see if we can obtain enough
 			 * space by erasing LP_DEAD items
@@ -965,7 +974,7 @@ _bt_findinsertloc(Relation rel,
 				break;
 
 			if (P_RIGHTMOST(opaque) ||
-				_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+				_bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) != 0 ||
 				pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
 				break;
 
@@ -980,10 +989,13 @@ _bt_findinsertloc(Relation rel,
 	 * We should now be on the correct page.  Find the offset within the page
 	 * for the new tuple. (Possibly reusing earlier search bounds.)
 	 */
-	Assert(P_RIGHTMOST(opaque) ||
-		   _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
+	{
+		AttrNumber	cmpcol PG_USED_FOR_ASSERTS_ONLY = 1;
+		Assert(P_RIGHTMOST(opaque) ||
+			   _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0);
+	}
 
-	newitemoff = _bt_binsrch_insert(rel, insertstate);
+	newitemoff = _bt_binsrch_insert(rel, insertstate, 1);
 
 	if (insertstate->postingoff == -1)
 	{
@@ -1002,7 +1014,7 @@ _bt_findinsertloc(Relation rel,
 		 */
 		Assert(!insertstate->bounds_valid);
 		insertstate->postingoff = 0;
-		newitemoff = _bt_binsrch_insert(rel, insertstate);
+		newitemoff = _bt_binsrch_insert(rel, insertstate, 1);
 		Assert(insertstate->postingoff == 0);
 	}
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index c74543bfde..67c8705c54 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,8 @@
 
 
 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
-static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf,
+								AttrNumber *highkeycmpcol);
 static int	_bt_binsrch_posting(BTScanInsert key, Page page,
 								OffsetNumber offnum);
 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
@@ -98,6 +99,8 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 {
 	BTStack		stack_in = NULL;
 	int			page_access = BT_READ;
+	char		tupdatabuf[BLCKSZ / 3];
+	AttrNumber	highkeycmpcol = 1;
 
 	/* Get the root page to start with */
 	*bufP = _bt_getroot(rel, access);
@@ -130,7 +133,8 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		 * opportunity to finish splits of internal pages too.
 		 */
 		*bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
-							  page_access, snapshot);
+							  page_access, snapshot, &highkeycmpcol,
+							  (char *) tupdatabuf);
 
 		/* if this is a leaf page, we're done */
 		page = BufferGetPage(*bufP);
@@ -142,12 +146,15 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		 * Find the appropriate pivot tuple on this page.  Its downlink points
 		 * to the child page that we're about to descend to.
 		 */
-		offnum = _bt_binsrch(rel, key, *bufP);
+		offnum = _bt_binsrch(rel, key, *bufP, &highkeycmpcol);
 		itemid = PageGetItemId(page, offnum);
 		itup = (IndexTuple) PageGetItem(page, itemid);
 		Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
 		child = BTreeTupleGetDownLink(itup);
 
+		Assert(IndexTupleSize(itup) < sizeof(tupdatabuf));
+		memcpy((char *) tupdatabuf, (char *) itup, IndexTupleSize(itup));
+
 		/*
 		 * We need to save the location of the pivot tuple we chose in a new
 		 * stack entry for this page/level.  If caller ends up splitting a
@@ -181,6 +188,8 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 	 */
 	if (access == BT_WRITE && page_access == BT_READ)
 	{
+		highkeycmpcol = 1;
+
 		/* trade in our read lock for a write lock */
 		_bt_unlockbuf(rel, *bufP);
 		_bt_lockbuf(rel, *bufP, BT_WRITE);
@@ -191,7 +200,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		 * move right to its new sibling.  Do that.
 		 */
 		*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
-							  snapshot);
+							  snapshot, &highkeycmpcol, (char *) tupdatabuf);
 	}
 
 	return stack_in;
@@ -239,12 +248,16 @@ _bt_moveright(Relation rel,
 			  bool forupdate,
 			  BTStack stack,
 			  int access,
-			  Snapshot snapshot)
+			  Snapshot snapshot,
+			  AttrNumber *comparecol,
+			  char *tupdatabuf)
 {
 	Page		page;
 	BTPageOpaque opaque;
 	int32		cmpval;
 
+	Assert(PointerIsValid(comparecol) && PointerIsValid(tupdatabuf));
+
 	/*
 	 * When nextkey = false (normal case): if the scan key that brought us to
 	 * this page is > the high key stored on the page, then the page has split
@@ -266,12 +279,17 @@ _bt_moveright(Relation rel,
 
 	for (;;)
 	{
+		AttrNumber	cmpcol = 1;
+
 		page = BufferGetPage(buf);
 		TestForOldSnapshot(snapshot, rel, page);
 		opaque = BTPageGetOpaque(page);
 
 		if (P_RIGHTMOST(opaque))
+		{
+			*comparecol = 1;
 			break;
+		}
 
 		/*
 		 * Finish any incomplete splits we encounter along the way.
@@ -297,14 +315,55 @@ _bt_moveright(Relation rel,
 			continue;
 		}
 
-		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
+		/*
+		 * tupdatabuf is filled with the right separator of the parent node.
+		 * This allows us to do a binary equality check between the parent
+		 * node's right separator (which is < key) and this page's P_HIKEY.
+		 * If they equal, we can reuse the result of the parent node's
+		 * rightkey compare, which means we can potentially save a full key
+		 * compare (which includes indirect calls to attribute comparison
+		 * functions).
+		 *
+		 * Without this, we'd on average use 3 full key compares per page before
+		 * we achieve full dynamic prefix bounds, but with this optimization
+		 * that is only 2.
+		 *
+		 * 3 compares: 1 for the highkey (rightmost), and on average 2 before
+		 * we move right in the binary search on the page; this average equals
+		 * SUM(1/2 ^ x) for x from 0 to log(n items), which tends to 2.
+		 */
+		if (!P_IGNORE(opaque) && *comparecol > 1)
+		{
+			IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY));
+			IndexTuple buftuple = (IndexTuple) tupdatabuf;
+			if (IndexTupleSize(itup) == IndexTupleSize(buftuple))
+			{
+				char *dataptr = (char *) itup;
+
+				if (memcmp(dataptr + sizeof(IndexTupleData),
+						   tupdatabuf + sizeof(IndexTupleData),
+						   IndexTupleSize(itup) - sizeof(IndexTupleData)) == 0)
+					break;
+			} else {
+				*comparecol = 1;
+			}
+		} else {
+			*comparecol = 1;
+		}
+
+		if (P_IGNORE(opaque) ||
+				_bt_compare(rel, key, page, P_HIKEY, &cmpcol) >= cmpval)
 		{
+			*comparecol = 1;
 			/* step right one page */
 			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
 			continue;
 		}
 		else
+		{
+			*comparecol = cmpcol;
 			break;
+		}
 	}
 
 	if (P_IGNORE(opaque))
@@ -337,7 +396,8 @@ _bt_moveright(Relation rel,
 static OffsetNumber
 _bt_binsrch(Relation rel,
 			BTScanInsert key,
-			Buffer buf)
+			Buffer buf,
+			AttrNumber *highkeycmpcol)
 {
 	Page		page;
 	BTPageOpaque opaque;
@@ -345,6 +405,8 @@ _bt_binsrch(Relation rel,
 				high;
 	int32		result,
 				cmpval;
+	AttrNumber	highcmpcol = *highkeycmpcol,
+				lowcmpcol = 1;
 
 	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
@@ -386,16 +448,25 @@ _bt_binsrch(Relation rel,
 	while (high > low)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
+		AttrNumber cmpcol = Min(highcmpcol, lowcmpcol);
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare(rel, key, page, mid, &cmpcol);
 
 		if (result >= cmpval)
+		{
 			low = mid + 1;
+			lowcmpcol = cmpcol;
+		}
 		else
+		{
 			high = mid;
+			highcmpcol = cmpcol;
+		}
 	}
+	
+	*highkeycmpcol = highcmpcol;
 
 	/*
 	 * At this point we have high == low, but be careful: they could point
@@ -439,7 +510,8 @@ _bt_binsrch(Relation rel,
  * list split).
  */
 OffsetNumber
-_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
+_bt_binsrch_insert(Relation rel, BTInsertState insertstate,
+				   AttrNumber highcmpcol)
 {
 	BTScanInsert key = insertstate->itup_key;
 	Page		page;
@@ -449,6 +521,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 				stricthigh;
 	int32		result,
 				cmpval;
+	AttrNumber	lowcmpcol = 1;
 
 	page = BufferGetPage(insertstate->buf);
 	opaque = BTPageGetOpaque(page);
@@ -499,16 +572,22 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 	while (high > low)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
+		AttrNumber	cmpcol = Min(highcmpcol, lowcmpcol);
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare(rel, key, page, mid, &cmpcol);
 
 		if (result >= cmpval)
+		{
 			low = mid + 1;
+			lowcmpcol = cmpcol;
+		}
 		else
 		{
 			high = mid;
+			highcmpcol = cmpcol;
+
 			if (result != 0)
 				stricthigh = high;
 		}
@@ -656,7 +735,8 @@ int32
 _bt_compare(Relation rel,
 			BTScanInsert key,
 			Page page,
-			OffsetNumber offnum)
+			OffsetNumber offnum,
+			AttrNumber *comparecol)
 {
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	BTPageOpaque opaque = BTPageGetOpaque(page);
@@ -696,8 +776,9 @@ _bt_compare(Relation rel,
 	ncmpkey = Min(ntupatts, key->keysz);
 	Assert(key->heapkeyspace || ncmpkey == key->keysz);
 	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
-	scankey = key->scankeys;
-	for (int i = 1; i <= ncmpkey; i++)
+
+	scankey = key->scankeys + ((*comparecol) - 1);
+	for (int i = *comparecol; i <= ncmpkey; i++)
 	{
 		Datum		datum;
 		bool		isNull;
@@ -741,11 +822,20 @@ _bt_compare(Relation rel,
 
 		/* if the keys are unequal, return the difference */
 		if (result != 0)
+		{
+			*comparecol = i;
 			return result;
+		}
 
 		scankey++;
 	}
 
+	/*
+	 * All tuple attributes are equal to the scan key, only later attributes
+	 * could potentially not equal the scan key.
+	 */
+	*comparecol = ntupatts + 1;
+
 	/*
 	 * All non-truncated attributes (other than heap TID) were found to be
 	 * equal.  Treat truncated attributes as minus infinity when scankey has a
@@ -876,6 +966,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	StrategyNumber strat_total;
 	BTScanPosItem *currItem;
 	BlockNumber blkno;
+	AttrNumber	cmpcol = 1;
 
 	Assert(!BTScanPosIsValid(so->currPos));
 
@@ -1392,7 +1483,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	_bt_initialize_more_data(so, dir);
 
 	/* position to the precise item on the page */
-	offnum = _bt_binsrch(rel, &inskey, buf);
+	offnum = _bt_binsrch(rel, &inskey, buf, &cmpcol);
 
 	/*
 	 * If nextkey = false, we are positioned at the first item >= scan key, or
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 8e4f6864e5..ddc34a7a9e 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1225,9 +1225,13 @@ extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
 extern BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP,
 						  int access, Snapshot snapshot);
 extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf,
-							bool forupdate, BTStack stack, int access, Snapshot snapshot);
-extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
-extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
+							bool forupdate, BTStack stack, int access,
+							Snapshot snapshot, AttrNumber *comparecol,
+							char *tupdatabuf);
+extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate,
+									   AttrNumber highcmpcol);
+extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page,
+						 OffsetNumber offnum, AttrNumber *comparecol);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
-- 
2.30.2

