From 9cef26a6ea695544d439f27be1c61f2af5b47530 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 18 Jun 2025 19:32:57 -0400
Subject: [PATCH v3 2/2] Simplify and improve row compare NULL handling.

Improve _bt_first's handling of NULL row comparison members in such a
way as to make it consistent with _bt_check_rowcompare's approach.  In
other words, make sure that code that starts primitive index scans that
involve row compares fully agrees with code that ends primitive index
scans -- even in the presence of a NULL row compare member.  This makes
row comparisons more similar to scalar inequality keys, obviating the
need for _bt_set_startikey to consider row comparisons directly.

_bt_first will now avoid uselessly scanning earlier index tuples, that
cannot possibly contain matching tuples due to the use of a NULL row
compare member.  The goal isn't to improve performance; the goal is to
make _bt_first agree with _bt_check_rowcompare about where primitive
index scans should start and end.

Also go back to marking row compare members after the first as required
when safe.  This is more or less a straight revert of 2016 bugfix commit
a298a1e0.  We can address the complaint that led to that commit by being
more careful about the conditions where NULLs can end the scan in the
row compare path (for row members beyond the first one, which can once
again be marked required when safe).

Follow-up to bugfix commit 5f4d98d4, which taught _bt_set_startikey to
avoid the use of forcenonrequired mode iff it reached a row compare key.
Now _bt_set_startikey never needs to avoid the use of nonrequired mode,
which is simpler, more consistent, and more robust.
---
 src/backend/access/nbtree/nbtpreprocesskeys.c |  19 +-
 src/backend/access/nbtree/nbtsearch.c         | 115 ++++++----
 src/backend/access/nbtree/nbtutils.c          | 210 ++++++++++--------
 3 files changed, 201 insertions(+), 143 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index 855b32e83..58c735c0b 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -777,12 +777,25 @@ _bt_mark_scankey_required(ScanKey skey)
 	if (skey->sk_flags & SK_ROW_HEADER)
 	{
 		ScanKey		subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+		AttrNumber	attno = skey->sk_attno;
 
 		/* First subkey should be same column/operator as the header */
-		Assert(subkey->sk_flags & SK_ROW_MEMBER);
-		Assert(subkey->sk_attno == skey->sk_attno);
+		Assert(subkey->sk_attno == attno);
 		Assert(subkey->sk_strategy == skey->sk_strategy);
-		subkey->sk_flags |= addflags;
+
+		for (;;)
+		{
+			Assert(subkey->sk_flags & SK_ROW_MEMBER);
+			if (subkey->sk_attno != attno)
+				break;			/* non-adjacent key, so not required */
+			if (subkey->sk_strategy != skey->sk_strategy)
+				break;			/* wrong direction, so not required */
+			subkey->sk_flags |= addflags;
+			if (subkey->sk_flags & SK_ROW_END)
+				break;
+			subkey++;
+			attno++;
+		}
 	}
 }
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index a40b0cf81..7d8417425 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1257,6 +1257,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * Row comparison header: look to the first row member instead
 			 */
 			ScanKey		subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
+			bool		keep_strat_total = false;
+			bool		tighten_strat_total = false;
 
 			/*
 			 * Cannot be a NULL in the first row member: _bt_preprocess_keys
@@ -1267,6 +1269,14 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			Assert(subkey->sk_attno == cur->sk_attno);
 			Assert(!(subkey->sk_flags & SK_ISNULL));
 
+			/*
+			 * A row comparison key can only become an initial position key
+			 * when it was marked required by preprocessing, which means that
+			 * it must be the final key we picked as a positioning key
+			 */
+			Assert(i == keysz - 1);
+			Assert(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
+
 			/*
 			 * The member scankeys are already in insertion format (ie, they
 			 * have sk_func = 3-way-comparison function)
@@ -1274,58 +1284,73 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
 
 			/*
-			 * If the row comparison is the last positioning key we accepted,
-			 * try to add additional keys from the lower-order row members.
-			 * (If we accepted independent conditions on additional index
-			 * columns, we use those instead --- doesn't seem worth trying to
-			 * determine which is more restrictive.)  Note that this is OK
-			 * even if the row comparison is of ">" or "<" type, because the
-			 * condition applied to all but the last row member is effectively
-			 * ">=" or "<=", and so the extra keys don't break the positioning
-			 * scheme.  But, by the same token, if we aren't able to use all
-			 * the row members, then the part of the row comparison that we
-			 * did use has to be treated as just a ">=" or "<=" condition, and
-			 * so we'd better adjust strat_total accordingly.
+			 * If a "column gap" appears between row compare members, then the
+			 * part of the row comparison that we use has to be treated as
+			 * just a ">=" or "<=" condition. and so we'd better adjust
+			 * strat_total accordingly.
+			 *
+			 * We're able to use a _more_ restrictive strategy when we
+			 * encounter a NULL row compare member (which is unsatisfiable).
+			 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
+			 * insertion scan key "a > 1".  All rows where "a = 1" have to
+			 * perform a NULL row member comparison (or would, if we didn't
+			 * use the more restrictive ">" strategy), which is guranteed to
+			 * return false/return NULL.
 			 */
-			if (i == keysz - 1)
+			Assert(!(subkey->sk_flags & SK_ROW_END));
+			for (;;)
 			{
-				bool		used_all_subkeys = false;
-
-				Assert(!(subkey->sk_flags & SK_ROW_END));
-				for (;;)
+				subkey++;
+				Assert(subkey->sk_flags & SK_ROW_MEMBER);
+				if (subkey->sk_attno != keysz + 1)
+					break;		/* out-of-sequence, can't use it */
+				if (subkey->sk_strategy != cur->sk_strategy)
+					break;		/* wrong direction, can't use it */
+				if (subkey->sk_flags & SK_ISNULL)
 				{
-					subkey++;
-					Assert(subkey->sk_flags & SK_ROW_MEMBER);
-					if (subkey->sk_attno != keysz + 1)
-						break;	/* out-of-sequence, can't use it */
-					if (subkey->sk_strategy != cur->sk_strategy)
-						break;	/* wrong direction, can't use it */
-					if (subkey->sk_flags & SK_ISNULL)
-						break;	/* can't use null keys */
-					Assert(keysz < INDEX_MAX_KEYS);
-					memcpy(inskey.scankeys + keysz, subkey,
-						   sizeof(ScanKeyData));
-					keysz++;
-					if (subkey->sk_flags & SK_ROW_END)
-					{
-						used_all_subkeys = true;
-						break;
-					}
+					/* Unsatisfiable NULL row member */
+					keep_strat_total = true;
+					tighten_strat_total = true;
+					break;
 				}
-				if (!used_all_subkeys)
+				Assert(keysz < INDEX_MAX_KEYS);
+				memcpy(inskey.scankeys + keysz, subkey,
+					   sizeof(ScanKeyData));
+				keysz++;
+				if (subkey->sk_flags & SK_ROW_END)
 				{
-					switch (strat_total)
-					{
-						case BTLessStrategyNumber:
-							strat_total = BTLessEqualStrategyNumber;
-							break;
-						case BTGreaterStrategyNumber:
-							strat_total = BTGreaterEqualStrategyNumber;
-							break;
-					}
+					keep_strat_total = true;
+					break;
 				}
-				break;			/* done with outer loop */
 			}
+			if (!keep_strat_total)
+			{
+				/* Use less restrictive strategy (and fewer keys) */
+				Assert(!tighten_strat_total);
+				switch (strat_total)
+				{
+					case BTLessStrategyNumber:
+						strat_total = BTLessEqualStrategyNumber;
+						break;
+					case BTGreaterStrategyNumber:
+						strat_total = BTGreaterEqualStrategyNumber;
+						break;
+				}
+			}
+			if (tighten_strat_total)
+			{
+				/* Use more restrictive strategy (and fewer keys) */
+				switch (strat_total)
+				{
+					case BTLessEqualStrategyNumber:
+						strat_total = BTLessStrategyNumber;
+						break;
+					case BTGreaterEqualStrategyNumber:
+						strat_total = BTGreaterStrategyNumber;
+						break;
+				}
+			}
+			break;				/* done building insertion scan key */
 		}
 		else
 		{
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index eb6dbfda3..a5ff5f708 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2445,29 +2445,11 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
 			/*
 			 * RowCompare inequality.
 			 *
-			 * Only the first subkey from a RowCompare can ever be marked
-			 * required (that happens when the row header is marked required).
 			 * There is no simple, general way for us to transitively deduce
 			 * whether or not every tuple on the page satisfies a RowCompare
 			 * key based only on firsttup and lasttup -- so we just give up.
 			 */
-			if (!start_past_saop_eq && !so->skipScan)
-				break;			/* unsafe to go further */
-
-			/*
-			 * We have to be even more careful with RowCompares that come
-			 * after an array: we assume it's unsafe to even bypass the array.
-			 * Calling _bt_start_array_keys to recover the scan's arrays
-			 * following use of forcenonrequired mode isn't compatible with
-			 * _bt_check_rowcompare's continuescan=false behavior with NULL
-			 * row compare members.  _bt_advance_array_keys must not make a
-			 * decision on the basis of a key not being satisfied in the
-			 * opposite-to-scan direction until the scan reaches a leaf page
-			 * where the same key begins to be satisfied in scan direction.
-			 * The _bt_first !used_all_subkeys behavior makes this limitation
-			 * hard to work around some other way.
-			 */
-			return;				/* completely unsafe to set pstate.startikey */
+			break;			/* unsafe */
 		}
 		if (key->sk_strategy != BTEqualStrategyNumber)
 		{
@@ -2964,87 +2946,17 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 
 		Assert(subkey->sk_flags & SK_ROW_MEMBER);
 
-		if (subkey->sk_attno > tupnatts)
-		{
-			/*
-			 * This attribute is truncated (must be high key).  The value for
-			 * this attribute in the first non-pivot tuple on the page to the
-			 * right could be any possible value.  Assume that truncated
-			 * attribute passes the qual.
-			 */
-			Assert(BTreeTupleIsPivot(tuple));
-			cmpresult = 0;
-			if (subkey->sk_flags & SK_ROW_END)
-				break;
-			subkey++;
-			continue;
-		}
-
-		datum = index_getattr(tuple,
-							  subkey->sk_attno,
-							  tupdesc,
-							  &isNull);
-
-		if (isNull)
-		{
-			if (forcenonrequired)
-			{
-				/* treating scan's keys as non-required */
-			}
-			else if (subkey->sk_flags & SK_BT_NULLS_FIRST)
-			{
-				/*
-				 * Since NULLs are sorted before non-NULLs, we know we have
-				 * reached the lower limit of the range of values for this
-				 * index attr.  On a backward scan, we can stop if this qual
-				 * is one of the "must match" subset.  We can stop regardless
-				 * of whether the qual is > or <, so long as it's required,
-				 * because it's not possible for any future tuples to pass. On
-				 * a forward scan, however, we must keep going, because we may
-				 * have initially positioned to the start of the index.
-				 * (_bt_advance_array_keys also relies on this behavior during
-				 * forward scans.)
-				 */
-				if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
-					ScanDirectionIsBackward(dir))
-					*continuescan = false;
-			}
-			else
-			{
-				/*
-				 * Since NULLs are sorted after non-NULLs, we know we have
-				 * reached the upper limit of the range of values for this
-				 * index attr.  On a forward scan, we can stop if this qual is
-				 * one of the "must match" subset.  We can stop regardless of
-				 * whether the qual is > or <, so long as it's required,
-				 * because it's not possible for any future tuples to pass. On
-				 * a backward scan, however, we must keep going, because we
-				 * may have initially positioned to the end of the index.
-				 * (_bt_advance_array_keys also relies on this behavior during
-				 * backward scans.)
-				 */
-				if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
-					ScanDirectionIsForward(dir))
-					*continuescan = false;
-			}
-
-			/*
-			 * In any case, this indextuple doesn't match the qual.
-			 */
-			return false;
-		}
-
+		/* When a NULL row member is compared, the row never matches */
 		if (subkey->sk_flags & SK_ISNULL)
 		{
 			/*
-			 * Unlike the simple-scankey case, this isn't a disallowed case
-			 * (except when it's the first row element that has the NULL arg).
-			 * But it can never match.  If all the earlier row comparison
-			 * columns are required for the scan direction, we can stop the
-			 * scan, because there can't be another tuple that will succeed.
+			 * Cannot be a NULL in the first row member: _bt_preprocess_keys
+			 * would've marked the qual as unsatisfiable, ending the scan
+			 * before any tuples needed to be read (just like it will with
+			 * simple scalar inequalities with a NULL arg)
 			 */
 			Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
-			subkey--;
+
 			if (forcenonrequired)
 			{
 				/* treating scan's keys as non-required */
@@ -3058,6 +2970,114 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
+		if (subkey->sk_attno > tupnatts)
+		{
+			/*
+			 * This attribute is truncated (must be high key).  The value for
+			 * this attribute in the first non-pivot tuple on the page to the
+			 * right could be any possible value.  Assume that truncated
+			 * attribute passes the qual.
+			 */
+			Assert(BTreeTupleIsPivot(tuple));
+			return true;
+		}
+
+		datum = index_getattr(tuple,
+							  subkey->sk_attno,
+							  tupdesc,
+							  &isNull);
+
+		if (isNull)
+		{
+			int			reqflags;
+
+			if (forcenonrequired)
+			{
+				/* treating scan's keys as non-required */
+			}
+			else if (subkey->sk_flags & SK_BT_NULLS_FIRST)
+			{
+				/*
+				 * Since NULLs are sorted before non-NULLs, we know we have
+				 * reached the lower limit of the range of values for this
+				 * index attr.  On a backward scan, we can stop if this qual
+				 * is one of the "must match" subset.  However, on a forwards
+				 * scan, we must keep going, because we may have initially
+				 * positioned to the start of the index.
+				 *
+				 * All required NULLS FIRST > row members can use NULL tuple
+				 * values to end backwards scans, just like with other values.
+				 * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a
+				 * backwards scan upon reaching the index's rightmost "a = 9"
+				 * tuple whose "b" column contains a NULL (if not sooner).
+				 * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42.
+				 */
+				reqflags = SK_BT_REQBKWD;
+
+				/*
+				 * When a most significant required NULLS FIRST < row compare
+				 * member sees NULL tuple values during a backwards scan, it
+				 * signals the end of matches for the whole row compare/scan.
+				 * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a
+				 * backwards scan upon reaching the rightmost tuple whose "a"
+				 * column has a NULL.  The "a" NULL value is "<" 9, and yet
+				 * our < row compare will still end the scan.  (This isn't
+				 * safe with later/lower-order row members.  Notice that it
+				 * can only happen with an "a" NULL some time after the scan
+				 * completely stops needing to use its "b" and "c" members.)
+				 */
+				if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+					reqflags |= SK_BT_REQFWD;	/* safe, first row member */
+
+				if ((subkey->sk_flags & reqflags) &&
+					ScanDirectionIsBackward(dir))
+					*continuescan = false;
+			}
+			else
+			{
+				/*
+				 * Since NULLs are sorted after non-NULLs, we know we have
+				 * reached the upper limit of the range of values for this
+				 * index attr.  On a forward scan, we can stop if this qual is
+				 * one of the "must match" subset.  However, on a backward
+				 * scan, we must keep going, because we may have initially
+				 * positioned to the end of the index.
+				 *
+				 * All required NULLS LAST < row members can use NULL tuple
+				 * values to end forwards scans, just like with other values.
+				 * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a
+				 * forwards scan upon reaching the index's leftmost "a = 9"
+				 * tuple whose "b" column contains a NULL (if not sooner).
+				 * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42.
+				 */
+				reqflags = SK_BT_REQFWD;
+
+				/*
+				 * When a most significant required NULLS LAST > row compare
+				 * member sees NULL tuple values during a forwards scan, it
+				 * signals the end of matches for the whole row compare/scan.
+				 * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a
+				 * forwards scan upon reaching the leftmost tuple whose "a"
+				 * column has a NULL.  The "a" NULL value is ">" 9, and yet
+				 * our > row compare will end the scan.  (This isn't safe with
+				 * later/lower-order row members.  Notice that it can only
+				 * happen with an "a" NULL some time after the scan completely
+				 * stops needing to use its "b" and "c" members.)
+				 */
+				if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+					reqflags |= SK_BT_REQBKWD;	/* safe, first row member */
+
+				if ((subkey->sk_flags & reqflags) &&
+					ScanDirectionIsForward(dir))
+					*continuescan = false;
+			}
+
+			/*
+			 * In any case, this indextuple doesn't match the qual.
+			 */
+			return false;
+		}
+
 		/* Perform the test --- three-way comparison not bool operator */
 		cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
 													subkey->sk_collation,
-- 
2.50.0