From c45d6550629bfec7e5f263f484c29b1571aba7ea Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 19 Sep 2021 16:14:43 -0700
Subject: [PATCH v1] Fix "single value strategy" index deletion bug.

---
 src/include/access/nbtree.h           |  2 +-
 src/backend/access/nbtree/nbtdedup.c  | 31 ++++++++++++++-------------
 src/backend/access/nbtree/nbtinsert.c |  2 +-
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 42c66fac5..16480f93f 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1155,7 +1155,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
  */
 extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel,
 						   IndexTuple newitem, Size newitemsz,
-						   bool checkingunique);
+						   bool bottomupfail);
 extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
 								 Size newitemsz);
 extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
index 271994b08..b6e890816 100644
--- a/src/backend/access/nbtree/nbtdedup.c
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -34,14 +34,18 @@ static bool _bt_posting_valid(IndexTuple posting);
  *
  * The general approach taken here is to perform as much deduplication as
  * possible to free as much space as possible.  Note, however, that "single
- * value" strategy is sometimes used for !checkingunique callers, in which
- * case deduplication will leave a few tuples untouched at the end of the
- * page.  The general idea is to prepare the page for an anticipated page
- * split that uses nbtsplitloc.c's "single value" strategy to determine a
- * split point.  (There is no reason to deduplicate items that will end up on
- * the right half of the page after the anticipated page split; better to
- * handle those if and when the anticipated right half page gets its own
- * deduplication pass, following further inserts of duplicates.)
+ * value" strategy is used for !bottomupfail callers when the page is full of
+ * tuples of a single value.  This strategy makes deduplication leave behind a
+ * few untouched tuples at the end of the page, preparing the page for an
+ * anticipated page split that uses nbtsplitloc.c's own single value strategy.
+ * We delay merging the untouched tuples until _after_ the page splits.
+ *
+ * We only consider applying single value strategy when a page split already
+ * appears to be inevitable, which is why we exclude bottomupfail calls.
+ * _bt_bottomupdel_pass() expects to be able to trigger "version-oriented"
+ * deduplication passes by reporting "failure" to its caller.  This is usually
+ * just a failure to free a significant amount of free space all at once,
+ * which by itself is no reason to give up and split the page.
  *
  * The page will have to be split if we cannot successfully free at least
  * newitemsz (we also need space for newitem's line pointer, which isn't
@@ -52,7 +56,7 @@ static bool _bt_posting_valid(IndexTuple posting);
  */
 void
 _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
-			   Size newitemsz, bool checkingunique)
+			   Size newitemsz, bool bottomupfail)
 {
 	OffsetNumber offnum,
 				minoff,
@@ -98,7 +102,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
 	maxoff = PageGetMaxOffsetNumber(page);
 
 	/* Determine if "single value" strategy should be used */
-	if (!checkingunique)
+	if (!bottomupfail)
 		singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
 
 	/*
@@ -767,11 +771,8 @@ _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
  *
  * Note: We deliberately don't bother checking if the high key is a distinct
  * value (prior to the TID tiebreaker column) before proceeding, unlike
- * nbtsplitloc.c.  Its single value strategy only gets applied on the
- * rightmost page of duplicates of the same value (other leaf pages full of
- * duplicates will get a simple 50:50 page split instead of splitting towards
- * the end of the page).  There is little point in making the same distinction
- * here.
+ * nbtsplitloc.c.  It seems better to err in the direction of not applying the
+ * strategy.
  */
 static bool
 _bt_do_singleval(Relation rel, Page page, BTDedupState state,
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 6ac205c98..7355e1dba 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -2748,7 +2748,7 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
 	/* Perform deduplication pass (when enabled and index-is-allequalimage) */
 	if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
 		_bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
-					   insertstate->itemsz, checkingunique);
+					   insertstate->itemsz, (indexUnchanged || uniquedup));
 }
 
 /*
-- 
2.27.0

