From 234cf6f13f43bebdba2da7f2933278d6c2d2de68 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 18 Jun 2025 19:32:57 -0400
Subject: [PATCH v1] Improve _bt_first row compare NULL member handling.

Improve _bt_first's handling of NULL row comparison members in such a
way as to make it consistent with _bt_check_rowcompare's approach.  In
other words, make sure that code that starts primitive index scans that
involve row compares fully agrees with code that ends primitive index
scans -- even in the presence of a NULL row compare member.  This makes
row comparisons more similar to scalar inequality keys, obviating the
need for _bt_set_startikey to consider row comparisons directly.

_bt_first will now avoid uselessly scanning earlier index tuples, that
cannot possibly contain matching tuples due to the use of a NULL row
compare member.  The goal isn't to improve performance; the goal is to
make _bt_first agree with _bt_check_rowcompare about where primitive
index scans should start and end.

It would have been possible to make these two functions agree in the
same way by removing (rather than adding) an optimization: we could have
prevented _bt_check_rowcompare from terminating the scan at the point
that it compare an unsatisfiable NULL row compare member.  However, that
approach would likely lead to complaints about performance regressions.

Follow-up to bugfix commit 5f4d98d4, which taught _bt_set_startikey to
avoid the use of forcenonrequired mode iff it reached a row compare key.
Now _bt_set_startikey never needs to avoid the use of nonrequired mode,
which is simpler, more consistent, and more robust.
---
 src/backend/access/nbtree/nbtsearch.c | 41 +++++++++--
 src/backend/access/nbtree/nbtutils.c  | 98 ++++++++++++++-------------
 2 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 36544ecfd..80d727bb1 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1288,14 +1288,23 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * even if the row comparison is of ">" or "<" type, because the
 			 * condition applied to all but the last row member is effectively
 			 * ">=" or "<=", and so the extra keys don't break the positioning
-			 * scheme.  But, by the same token, if we aren't able to use all
-			 * the row members, then the part of the row comparison that we
+			 * scheme.  But, by the same token, if a "column gap" appears
+			 * between members, then the part of the row comparison that we
 			 * did use has to be treated as just a ">=" or "<=" condition, and
 			 * so we'd better adjust strat_total accordingly.
+			 *
+			 * We're able to use a _more_ restrictive strategy when we
+			 * encounter a NULL row compare member (which is unsatisfiable).
+			 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
+			 * insertion scan key "a > 1".  All rows where "a = 1" have to
+			 * perform a NULL row member comparison (or would, if we didn't
+			 * use the more restrictive ">" strategy), which is guranteed to
+			 * return false/return NULL.
 			 */
 			if (i == keysz - 1)
 			{
-				bool		used_all_subkeys = false;
+				bool		keep_strat_total = false;
+				bool		tighten_strat_total = false;
 
 				Assert(!(subkey->sk_flags & SK_ROW_END));
 				for (;;)
@@ -1307,19 +1316,26 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 					if (subkey->sk_strategy != cur->sk_strategy)
 						break;	/* wrong direction, can't use it */
 					if (subkey->sk_flags & SK_ISNULL)
-						break;	/* can't use null keys */
+					{
+						/* Unsatisfiable NULL row member */
+						keep_strat_total = true;
+						tighten_strat_total = true;
+						break;
+					}
 					Assert(keysz < INDEX_MAX_KEYS);
 					memcpy(inskey.scankeys + keysz, subkey,
 						   sizeof(ScanKeyData));
 					keysz++;
 					if (subkey->sk_flags & SK_ROW_END)
 					{
-						used_all_subkeys = true;
+						keep_strat_total = true;
 						break;
 					}
 				}
-				if (!used_all_subkeys)
+				if (!keep_strat_total)
 				{
+					/* Use less restrictive strategy (and fewer keys) */
+					Assert(!tighten_strat_total);
 					switch (strat_total)
 					{
 						case BTLessStrategyNumber:
@@ -1330,6 +1346,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 							break;
 					}
 				}
+				if (tighten_strat_total)
+				{
+					/* Use more restrictive strategy (and fewer keys) */
+					switch (strat_total)
+					{
+						case BTLessEqualStrategyNumber:
+							strat_total = BTLessStrategyNumber;
+							break;
+						case BTGreaterEqualStrategyNumber:
+							strat_total = BTGreaterStrategyNumber;
+							break;
+					}
+				}
 				break;			/* done with outer loop */
 			}
 		}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index c71d1b6f2..457786885 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2565,23 +2565,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
 			 * whether or not every tuple on the page satisfies a RowCompare
 			 * key based only on firsttup and lasttup -- so we just give up.
 			 */
-			if (!start_past_saop_eq && !so->skipScan)
-				break;			/* unsafe to go further */
-
-			/*
-			 * We have to be even more careful with RowCompares that come
-			 * after an array: we assume it's unsafe to even bypass the array.
-			 * Calling _bt_start_array_keys to recover the scan's arrays
-			 * following use of forcenonrequired mode isn't compatible with
-			 * _bt_check_rowcompare's continuescan=false behavior with NULL
-			 * row compare members.  _bt_advance_array_keys must not make a
-			 * decision on the basis of a key not being satisfied in the
-			 * opposite-to-scan direction until the scan reaches a leaf page
-			 * where the same key begins to be satisfied in scan direction.
-			 * The _bt_first !used_all_subkeys behavior makes this limitation
-			 * hard to work around some other way.
-			 */
-			return;				/* completely unsafe to set pstate.startikey */
+			break;				/* unsafe */
 		}
 		if (key->sk_strategy != BTEqualStrategyNumber)
 		{
@@ -3078,6 +3062,56 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 
 		Assert(subkey->sk_flags & SK_ROW_MEMBER);
 
+		/*
+		 * Unlike the simple-scankey case, NULL row members aren't disallowed
+		 * (except when it's the first row element that has the NULL arg,
+		 * where preprocessing recognizes the scan's qual as unsatisfiable).
+		 * But it can never match any rows.
+		 *
+		 * If the first row member's column is marked required, and this row
+		 * comparison is on the very next column, we can stop the scan; there
+		 * can't be another tuple that will succeed.
+		 */
+		if (subkey->sk_flags & SK_ISNULL)
+		{
+			AttrNumber	isnull_attno = subkey->sk_attno;
+
+			/* can't be the first row member (preprocessing catches this) */
+			Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
+
+			subkey--;
+			if (forcenonrequired)
+			{
+				/* treating scan's keys as non-required */
+			}
+			else if (subkey->sk_attno != isnull_attno - 1)
+			{
+				/*
+				 * There's an index column gap between the NULL row member,
+				 * and the next most significant row member.  Cannot end scan.
+				 *
+				 * The presence of this gap doesn't actually imply that there
+				 * really can be another tuple that will succeed; there can't.
+				 * This isn't really about the _current_ primitive index scan.
+				 *
+				 * When we set continuescan=false, it had better be safe for a
+				 * scan with a more-significant-than-rowcompare array key to
+				 * reposition itself to some later leaf page, in the usual way
+				 * (by starting the next primitive index scan).  But there'd
+				 * be no way for _bt_first to actually locate some later leaf
+				 * page if we were to set continuescan=false here -- we'd run
+				 * the risk of getting stuck in an unending cycle.
+				 */
+			}
+			else if ((subkey->sk_flags & SK_BT_REQFWD) &&
+					 ScanDirectionIsForward(dir))
+				*continuescan = false;
+			else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+					 ScanDirectionIsBackward(dir))
+				*continuescan = false;
+			return false;
+		}
+
 		if (subkey->sk_attno > tupnatts)
 		{
 			/*
@@ -3087,11 +3121,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 			 * attribute passes the qual.
 			 */
 			Assert(BTreeTupleIsPivot(tuple));
-			cmpresult = 0;
-			if (subkey->sk_flags & SK_ROW_END)
-				break;
-			subkey++;
-			continue;
+			return true;
 		}
 
 		datum = index_getattr(tuple,
@@ -3148,30 +3178,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		if (subkey->sk_flags & SK_ISNULL)
-		{
-			/*
-			 * Unlike the simple-scankey case, this isn't a disallowed case
-			 * (except when it's the first row element that has the NULL arg).
-			 * But it can never match.  If all the earlier row comparison
-			 * columns are required for the scan direction, we can stop the
-			 * scan, because there can't be another tuple that will succeed.
-			 */
-			Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
-			subkey--;
-			if (forcenonrequired)
-			{
-				/* treating scan's keys as non-required */
-			}
-			else if ((subkey->sk_flags & SK_BT_REQFWD) &&
-					 ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
-				*continuescan = false;
-			return false;
-		}
-
 		/* Perform the test --- three-way comparison not bool operator */
 		cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
 													subkey->sk_collation,
-- 
2.49.0

