From 4b93191614e76d703b77eb80faec8259f91374fc Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 30 Jun 2020 16:29:27 -0700
Subject: [PATCH] Non-opportunistically delete B-Tree items.

Repurpose deduplication infrastructure to delete items in unique indexes
at the point where we'd usually have to split the page, even when they
don't have their LP_DEAD bits set.  Testing has shown that this is
almost completely effective at preventing "version bloat" from non-HOT
updates in unique indexes, provided there are no long-running
transactions.

Note that INCLUDE indexes support the optimization.
---
 src/include/access/nbtree.h           |   2 +-
 src/backend/access/nbtree/README      |  13 +-
 src/backend/access/nbtree/nbtdedup.c  | 332 +++++++++++++++++++++++++-
 src/backend/access/nbtree/nbtinsert.c |   9 +-
 4 files changed, 349 insertions(+), 7 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 79506c748b..3c6b3c1cf4 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1029,7 +1029,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
  */
 extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 							   IndexTuple newitem, Size newitemsz,
-							   bool checkingunique);
+							   bool checkingunique, bool allequalimage);
 extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
 									OffsetNumber baseoff);
 extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 216d419841..27d1ef2669 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -800,7 +800,18 @@ Deduplication in unique indexes helps to prevent these pathological page
 splits.  Storing duplicates in a space efficient manner is not the goal,
 since in the long run there won't be any duplicates anyway.  Rather, we're
 buying time for standard garbage collection mechanisms to run before a
-page split is needed.
+page split is needed.  Also, the deduplication pass performs a targeted
+form of opportunistic deletion in the case of unique indexes.
+
+The deduplication code first opportunistically deletes whatever duplicates
+happen to be present on the unique index's page, since in general
+some of them are likely to already be dead to everybody.  This mechanism
+is quite similar to on-the-fly deletion of LP_DEAD index tuples, which will
+already have failed to prevent a page split by the time deduplication is
+considered.  The main difference is that the tuples that get deleted are
+not opportunistically marked LP_DEAD by transactions that had to read the
+tuples in any case.  The implementation must weigh the need to avoid a
+page split against the extra work performed with a buffer lock held.
 
 Unique index leaf pages only get a deduplication pass when an insertion
 (that might have to split the page) observed an existing duplicate on the
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
index f6be865b17..16123bf820 100644
--- a/src/backend/access/nbtree/nbtdedup.c
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -16,6 +16,8 @@
 
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
+#include "access/tableam.h"
+#include "catalog/catalog.h"
 #include "miscadmin.h"
 #include "utils/rel.h"
 
@@ -23,6 +25,14 @@ static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
 							 OffsetNumber minoff, IndexTuple newitem);
 static void _bt_singleval_fillfactor(Page page, BTDedupState state,
 									 Size newitemsz);
+static bool _bt_dedup_vacuum_one_page(Relation rel, Buffer buffer,
+									  Relation heapRel, Size itemsz);
+static void _bt_dedup_vacuum_finish_pending(BTDedupState state);
+static bool _bt_dedup_delete_item(Relation heapRel,
+								  Snapshot SnapshotNonVacuumable,
+								  IndexTuple itup, int *heapaccesses);
+static int	_bt_intervalcmp(const void *arg1, const void *arg2);
+static int	_bt_offsetnumbercmp(const void *arg1, const void *arg2);
 #ifdef USE_ASSERT_CHECKING
 static bool _bt_posting_valid(IndexTuple posting);
 #endif
@@ -54,7 +64,8 @@ static bool _bt_posting_valid(IndexTuple posting);
  */
 void
 _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
-				   IndexTuple newitem, Size newitemsz, bool checkingunique)
+				   IndexTuple newitem, Size newitemsz, bool checkingunique,
+				   bool allequalimage)
 {
 	OffsetNumber offnum,
 				minoff,
@@ -108,6 +119,30 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 		maxoff = PageGetMaxOffsetNumber(page);
 	}
 
+	/*
+	 * Before we actually deduplicate, see if we can non-opportunistically
+	 * free some duplicates in the special checkingunique case
+	 */
+	if (checkingunique)
+	{
+		if (_bt_dedup_vacuum_one_page(rel, buf, heapRel, newitemsz))
+			return;
+
+		/*
+		 * Reconsider number of items on page, in case
+		 * _bt_dedup_vacuum_one_page() managed to delete an item or two
+		 */
+		minoff = P_FIRSTDATAKEY(opaque);
+		maxoff = PageGetMaxOffsetNumber(page);
+	}
+
+	/*
+	 * Can't actually deduplicate when the index isn't allequalimage, so give
+	 * up and let caller split the page (we only got this far to see if we
+	 * could delete items non-opportunistically)
+	 */
+	if (!allequalimage)
+		return;
+
 	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
 	newitemsz += sizeof(ItemIdData);
 
@@ -809,6 +844,301 @@ _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
 	return nposting;
 }
 
+/*
+ * See if duplicate tuples in a unique index are eligible to be deleted,
+ * even though they don't have their LP_DEAD bit set already.  Concentrate
+ * on groups of duplicates, since we're most likely to be successful there.
+ * Give up if we have to access more than a few heap pages before we can
+ * free enough space to avoid a page split.
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
+ *
+ * FIXME: Be less eager with tuples that contain NULLs, since they can be
+ * duplicates without that signaling anything about version churn.  Just
+ * because we're checkingunique (which implies that incoming newitem isn't
+ * a NULL) doesn't mean there aren't lots of other NULLs on the page.
+ *
+ * FIXME: Skip over duplicates of the incoming item, which will surely have
+ * been considered within _bt_check_unique().
+ */
+static bool
+_bt_dedup_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel,
+						  Size itemsz)
+{
+	OffsetNumber deletable[MaxIndexTuplesPerPage];
+	int			heapaccesses = 0;
+	int			ndeletable = 0;
+	OffsetNumber offnum,
+				minoff,
+				maxoff;
+	Size		freespace;
+	Page		page = BufferGetPage(buffer);
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	bool		finished = false;
+	bool		success = false;
+	BTDedupState state;
+	SnapshotData SnapshotNonVacuumable;
+	TransactionId OldestXmin;
+	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
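+	/*
+	 * Repurpose the deduplication machinery: build intervals of duplicates
+	 * on the page, without actually forming any posting list tuples
+	 */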
+	state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+	state->deduplicate = true;
+	state->nmaxitems = 0;
+	/* Final "posting list" size should not restrict anything */
+	state->maxpostingsize = BLCKSZ;
+	state->base = NULL;
+	state->baseoff = InvalidOffsetNumber;
+	state->basetupsize = 0;
+	state->htids = palloc(state->maxpostingsize);
+	state->nhtids = 0;
+	state->nitems = 0;
+	state->phystupsize = 0;
+	state->nintervals = 0;
+
+	minoff = P_FIRSTDATAKEY(opaque);
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (offnum = minoff;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemid = PageGetItemId(page, offnum);
+		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+		Assert(!ItemIdIsDead(itemid));
+
+		if (offnum == minoff)
+		{
+			_bt_dedup_start_pending(state, itup, offnum);
+		}
+		else if (state->deduplicate &&
+				 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+				 _bt_dedup_save_htid(state, itup))
+		{
+			/*
+			 * Tuple is equal to base tuple of pending interval; its heap
+			 * TID(s) have been saved in state
+			 */
+		}
+		else
+		{
+			_bt_dedup_vacuum_finish_pending(state);
+
+			/* itup starts new pending posting list */
+			_bt_dedup_start_pending(state, itup, offnum);
+		}
+	}
+	/* Handle the last item */
+	_bt_dedup_vacuum_finish_pending(state);
+
+	if (state->nintervals == 0)
+	{
+		pfree(state->htids);
+		pfree(state);
+		return false;
+	}
+
+	/* Process intervals in desc nitems order, then in asc page offset order */
+	qsort(state->intervals, state->nintervals, sizeof(BTDedupInterval),
+		  _bt_intervalcmp);
+
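+	/*
+	 * Set up a snapshot for the heap accesses below: heap tuples that cannot
+	 * be seen by SnapshotNonVacuumable are dead to every current and future
+	 * transaction, so their index entries are safe to delete
+	 */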
+	if (IsCatalogRelation(rel) ||
+		RelationIsAccessibleInLogicalDecoding(rel))
+		OldestXmin = RecentGlobalXmin;
+	else
+		OldestXmin =
+			TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, rel);
+
+	InitNonVacuumableSnapshot(SnapshotNonVacuumable, OldestXmin);
+
+	freespace = PageGetFreeSpace(page);
+	for (int i = 0; i < state->nintervals; i++)
+	{
+		BTDedupInterval interval = state->intervals[i];
+		bool		killed_in_interval = false;
+
+		Assert(interval.nitems > 0);
+		/* Iterate through tuples of given value */
+		for (int j = 0; j < interval.nitems; j++)
+		{
+			OffsetNumber ioff = interval.baseoff + j;
+			ItemId		itemid = PageGetItemId(page, ioff);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+			if (killed_in_interval && j == 1 && interval.nitems == 2)
+			{
+				/*
+				 * If we already killed the first item in the interval, and
+				 * there are only two items, assume that the remaining item
+				 * isn't going to be another kill -- move on to next interval
+				 */
+				break;
+			}
+
+			if (_bt_dedup_delete_item(heapRel, &SnapshotNonVacuumable, itup,
+									  &heapaccesses))
+			{
+				/* Delete item */
+				ItemIdMarkDead(itemid);
+				MarkBufferDirtyHint(buffer, true);
+				deletable[ndeletable++] = ioff;
+				freespace += ItemIdGetLength(itemid);
+				killed_in_interval = true;
+
+				if (freespace >= itemsz)
+				{
+					/*
+					 * We successfully avoided a page split, so we're done.
+					 *
+					 * XXX: We should probably find a way of doing a bit more
+					 * work in passing.  That should probably be accomplished
+					 * by pushing much of this logic down into heapam, which
+					 * can judge when it's worth going a bit further than
+					 * strictly needed.
+					 */
+					success = true;
+					finished = true;
+					break;
+				}
+			}
+
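+			/*
+			 * Give up if we've had to access more than a few heap pages;
+			 * this is extra work performed with a buffer lock held
+			 */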
+			if (heapaccesses >= 10)
+			{
+				finished = true;
+				break;
+			}
+		}
+
+		if (finished)
+			break;
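+		/* Don't visit an excessive number of duplicate intervals */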
+		if (i >= 50)
+			break;
+	}
+
+	if (ndeletable > 0)
+	{
+		/* Have to give array to _bt_delitems_delete in asc order */
+		qsort(deletable, ndeletable, sizeof(OffsetNumber),
+			  _bt_offsetnumbercmp);
+		_bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+	}
+
+	pfree(state->htids);
+	pfree(state);
+
+	return success;
+}
+
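+/*
+ * Finish off pending interval of duplicates.  Unlike the deduplication
+ * equivalent, nothing is written to the page here: we just record intervals
+ * with two or more items, so that the caller can visit the duplicates that
+ * are most likely to be dead.
+ */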
+static void
+_bt_dedup_vacuum_finish_pending(BTDedupState state)
+{
+	Assert(state->nitems > 0);
+	Assert(state->nitems <= state->nhtids);
+	Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+	if (state->nitems == 1)
+	{
+		/* Don't merge */
+	}
+	else
+	{
+		/* Save final number of items for this interval of duplicates */
+		state->intervals[state->nintervals].nitems = state->nitems;
+		/* Increment nintervals, since we recorded a new interval */
+		state->nintervals++;
+	}
+
+	/* Reset state for next pending posting list */
+	state->nhtids = 0;
+	state->nitems = 0;
+	state->phystupsize = 0;
+}
+
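+/*
+ * Check whether a single index tuple (plain or posting list) points only to
+ * heap tuples that are dead to all current and future transactions.  Returns
+ * true when every heap TID's HOT chain turns out to be entirely dead, false
+ * as soon as any TID might still be visible to somebody.  *heapaccesses is
+ * incremented once per heap fetch performed.
+ */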
+static bool
+_bt_dedup_delete_item(Relation heapRel, Snapshot SnapshotNonVacuumable,
+					  IndexTuple itup, int *heapaccesses)
+{
+	bool		all_dead;
+	ItemPointerData htid;
+
+	all_dead = false;
+
+	if (!BTreeTupleIsPosting(itup))
+	{
+		Assert(ItemPointerIsValid(&itup->t_tid));
+
+		(*heapaccesses)++;
+		htid = itup->t_tid;
+		if (table_index_fetch_tuple_check(heapRel, &htid,
+										  SnapshotNonVacuumable, &all_dead))
+			return false;
+
+		if (!all_dead)
+			return false;
+	}
+	else
+	{
+		Assert(_bt_posting_valid(itup));
+
+		for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+		{
+			(*heapaccesses)++;
+			htid = *BTreeTupleGetPostingN(itup, i);
+			if (table_index_fetch_tuple_check(heapRel,
+											  &htid,
+											  SnapshotNonVacuumable,
+											  &all_dead))
+				return false;
+
+			if (!all_dead)
+				return false;
+		}
+	}
+
+	/* All tuples in HOT chain are vacuumable */
+	return true;
+}
+
+/*
+ * qsort-style comparator used by _bt_dedup_vacuum_one_page()
+ */
+static int
+_bt_intervalcmp(const void *arg1, const void *arg2)
+{
+	BTDedupInterval *inter1 = (BTDedupInterval *) arg1;
+	BTDedupInterval *inter2 = (BTDedupInterval *) arg2;
+
+	/* Note: This sorts intervals in descending nitems order */
+	if (inter1->nitems > inter2->nitems)
+		return -1;
+	if (inter1->nitems < inter2->nitems)
+		return 1;
+
+	/* Now tiebreak on page offset number order, asc */
+	if (inter1->baseoff > inter2->baseoff)
+		return 1;
+	if (inter1->baseoff < inter2->baseoff)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * qsort-style comparator used by _bt_dedup_vacuum_one_page()
+ */
+static int
+_bt_offsetnumbercmp(const void *arg1, const void *arg2)
+{
+	OffsetNumber off1 = *(OffsetNumber *) arg1;
+	OffsetNumber off2 = *(OffsetNumber *) arg2;
+
+	if (off1 > off2)
+		return 1;
+	if (off1 < off2)
+		return -1;
+
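+	/* Page offset numbers are unique, so equal elements should be impossible */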
+	Assert(false);
+
+	return 0;
+}
+
 /*
  * Verify posting list invariants for "posting", which must be a posting list
  * tuple.  Used within assertions.
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index b86c122763..17549ba24f 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -874,7 +874,9 @@ _bt_findinsertloc(Relation rel,
 		 * If the target page is full, see if we can obtain enough space by
 		 * erasing LP_DEAD items.  If that fails to free enough space, see if
 		 * we can avoid a page split by performing a deduplication pass over
-		 * the page.
+		 * the page.  If it's a unique index, we will try to delete whatever
+		 * duplicates happen to be on the page, which might be enough to avoid
+		 * a page split.
 		 *
 		 * We only perform a deduplication pass for a checkingunique caller
 		 * when the incoming item is a duplicate of an existing item on the
@@ -893,13 +895,12 @@ _bt_findinsertloc(Relation rel,
 				uniquedup = true;
 			}
 
-			if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
-				(!checkingunique || uniquedup) &&
+			if (BTGetDeduplicateItems(rel) && (!checkingunique || uniquedup) &&
 				PageGetFreeSpace(page) < insertstate->itemsz)
 			{
 				_bt_dedup_one_page(rel, insertstate->buf, heapRel,
 								   insertstate->itup, insertstate->itemsz,
-								   checkingunique);
+								   checkingunique, itup_key->allequalimage);
 				insertstate->bounds_valid = false;
 			}
 		}
-- 
2.25.1

