commit 0f1b12bc433f0db580d2af0c893a208f7105e4b0
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Thu Dec 6 01:04:33 2018 +0300

    Improve checking for missing parent links
    
    Instead of collecting lossy bitmap, traverse from one downlink to subsequent
    using rightlinks.  Intermediate pages are candidates to have lost parent
    downlinks.

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 55a3a4bbe0c..e9b1b779453 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -93,14 +93,19 @@ typedef struct BtreeCheckState
 	/* Target page's LSN */
 	XLogRecPtr	targetlsn;
 
+	/*
+	 * Rightlink and incomplete split flag of previous block referenced by
+	 * downlink.
+	 */
+	BlockNumber prevrightlink;
+	bool		previncompletesplit;
+
 	/*
 	 * Mutable state, for optional heapallindexed verification:
 	 */
 
 	/* Bloom filter fingerprints B-Tree index */
 	bloom_filter *filter;
-	/* Bloom filter fingerprints downlink blocks within tree */
-	bloom_filter *downlinkfilter;
 	/* Right half of incomplete split marker */
 	bool		rightsplit;
 	/* Debug counter */
@@ -137,7 +142,10 @@ static void bt_target_page_check(BtreeCheckState *state);
 static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
 static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
 							  BlockNumber childblock);
-static void bt_downlink_missing_check(BtreeCheckState *state);
+static void bt_downlink_connectivity_check(BtreeCheckState *state,
+										   BlockNumber downlink, Page loaded_page, uint32 parent_level);
+static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
+									  BlockNumber targetblock, Page target);
 static void bt_tuple_present_callback(Relation index, HeapTuple htup,
 									  Datum *values, bool *isnull,
 									  bool tupleIsAlive, void *checkstate);
@@ -432,20 +440,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 						 errmsg("index \"%s\" cannot be verified using transaction snapshot",
 								RelationGetRelationName(rel))));
 		}
-		else
-		{
-			/*
-			 * Extra readonly downlink check.
-			 *
-			 * In readonly case, we know that there cannot be a concurrent
-			 * page split or a concurrent page deletion, which gives us the
-			 * opportunity to verify that every non-ignorable page had a
-			 * downlink one level up.  We must be tolerant of interrupted page
-			 * splits and page deletions, though.  This is taken care of in
-			 * bt_downlink_missing_check().
-			 */
-			state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
-		}
 	}
 
 	Assert(!state->rootdescend || state->readonly);
@@ -523,16 +517,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 		IndexInfo  *indexinfo = BuildIndexInfo(state->rel);
 		TableScanDesc scan;
 
-		/* Report on extra downlink checks performed in readonly case */
-		if (state->readonly)
-		{
-			ereport(DEBUG1,
-					(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
-									 RelationGetRelationName(rel),
-									 100.0 * bloom_prop_bits_set(state->downlinkfilter))));
-			bloom_free(state->downlinkfilter);
-		}
-
 		/*
 		 * Create our own scan for table_index_build_scan(), rather than
 		 * getting it to do so for us.  This is required so that we can
@@ -635,6 +619,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
 		 level.istruerootlevel ?
 		 " (true root level)" : level.level == 0 ? " (leaf level)" : "");
 
+	state->prevrightlink = InvalidBlockNumber;
+	state->previncompletesplit = false;
+
 	do
 	{
 		/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
@@ -940,22 +927,24 @@ bt_target_page_check(BtreeCheckState *state)
 										(uint32) state->targetlsn)));
 		}
 
-		/* Fingerprint downlink blocks in heapallindexed + readonly case */
-		if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
-		{
-			BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
-
-			bloom_add_element(state->downlinkfilter,
-							  (unsigned char *) &childblock,
-							  sizeof(BlockNumber));
-		}
-
 		/*
 		 * Don't try to generate scankey using "negative infinity" item on
 		 * internal pages. They are always truncated to zero attributes.
 		 */
 		if (offset_is_negative_infinity(topaque, offset))
+		{
+			/*
+			 * Initialize downlink connectivity check if needed.
+			 */
+			if (!P_ISLEAF(topaque) && state->readonly)
+			{
+				bt_downlink_connectivity_check(state,
+											   BTreeInnerTupleGetDownLink(itup),
+											   NULL,
+											   topaque->btpo.level);
+			}
 			continue;
+		}
 
 		/*
 		 * Readonly callers may optionally verify that non-pivot tuples can
@@ -1239,12 +1228,15 @@ bt_target_page_check(BtreeCheckState *state)
 	}
 
 	/*
-	 * * Check if page has a downlink in parent *
-	 *
-	 * This can only be checked in heapallindexed + readonly case.
+	 * If we traversed the whole level to the rightmost page, there might be
+	 * missing downlinks for the pages to the right of rightmost downlink.
+	 * Check for them.
 	 */
-	if (state->heapallindexed && state->readonly)
-		bt_downlink_missing_check(state);
+	if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
+	{
+		bt_downlink_connectivity_check(state, P_NONE,
+									   NULL, topaque->btpo.level);
+	}
 }
 
 /*
@@ -1461,6 +1453,115 @@ bt_right_page_check_scankey(BtreeCheckState *state)
 	return bt_mkscankey_pivotsearch(state->rel, firstitup);
 }
 
+/*
+ * Check connectivity of downlinks.  Traverse rightlinks from previous downlink
+ * to the current one.  Check that there are no intermediate pages with missing
+ * downlinks.
+ *
+ * If 'loaded_page' is given, it's assumed to be contents of 'downlink'.
+ */
+static void
+bt_downlink_connectivity_check(BtreeCheckState *state,
+							   BlockNumber downlink,
+							   Page loaded_page,
+							   uint32 parent_level)
+{
+	BlockNumber blkno = state->prevrightlink;
+	Page		page;
+	BTPageOpaque opaque;
+	bool		rightsplit = state->previncompletesplit;
+	bool		first = true;
+
+	/*
+	 * If no previos rightlink is memorized, get it from current downlink for
+	 * future usage.
+	 */
+	if (!BlockNumberIsValid(blkno))
+	{
+		Assert(BlockNumberIsValid(downlink) && downlink != P_NONE);
+
+		if (loaded_page)
+			page = loaded_page;
+		else
+			page = palloc_btree_page(state, downlink);
+
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		state->prevrightlink = opaque->btpo_next;
+		state->previncompletesplit = P_INCOMPLETE_SPLIT(opaque);
+		return;
+	}
+
+	while (true)
+	{
+		/*
+		 * Did we traverse the whole tree level and this is check for pages to
+		 * the right of rightmost downlink?
+		 */
+		if (blkno == P_NONE && downlink == P_NONE)
+		{
+			state->prevrightlink = InvalidBlockNumber;
+			state->previncompletesplit = false;
+			return;
+		}
+
+		/* Did we traverse the whole tree level and don't find next downlink? */
+		if (blkno == P_NONE)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
+							state->prevrightlink, downlink,
+							RelationGetRelationName(state->rel))));
+
+		/* Load page contents */
+		if (blkno == downlink && loaded_page)
+			page = loaded_page;
+		else
+			page = palloc_btree_page(state, blkno);
+
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+		/* Check level for non-ignorable page */
+		if (!P_IGNORE(opaque) && opaque->btpo.level != parent_level - 1)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("block found while traversing rightlinks from downlink of index \"%s\" has invalid level",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
+										blkno, parent_level - 1, opaque->btpo.level)));
+
+		/* Try to detect circular links */
+		if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("circular link chain found in block %u of index \"%s\"",
+							blkno, RelationGetRelationName(state->rel))));
+
+		if (!first && !P_IGNORE(opaque))
+		{
+			/* blkno probably has missing parent downlink */
+			bt_downlink_missing_check(state, rightsplit, blkno, page);
+		}
+
+		rightsplit = P_INCOMPLETE_SPLIT(opaque);
+
+		/* Exit if we already found next downlink */
+		if (blkno == downlink)
+		{
+			state->prevrightlink = opaque->btpo_next;
+			state->previncompletesplit = rightsplit;
+			return;
+		}
+
+		/* Traverse to the next page using rightlink */
+		blkno = opaque->btpo_next;
+
+		/* Free page contents if it's allocated by us */
+		if (page != loaded_page)
+			pfree(page);
+		first = false;
+	}
+}
+
 /*
  * Checks one of target's downlink against its child page.
  *
@@ -1478,6 +1579,7 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
 	OffsetNumber maxoffset;
 	Page		child;
 	BTPageOpaque copaque;
+	BTPageOpaque topaque;
 
 	/*
 	 * Caller must have ShareLock on target relation, because of
@@ -1527,10 +1629,18 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
 	 * Check all items, rather than checking just the first and trusting that
 	 * the operator class obeys the transitive law.
 	 */
+	topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
 	child = palloc_btree_page(state, childblock);
 	copaque = (BTPageOpaque) PageGetSpecialPointer(child);
 	maxoffset = PageGetMaxOffsetNumber(child);
 
+	/*
+	 * Since we've already loaded the child block, combine this check with
+	 * check for downlink connectivity.
+	 */
+	bt_downlink_connectivity_check(state, childblock,
+								   child, topaque->btpo.level);
+
 	/*
 	 * Since there cannot be a concurrent VACUUM operation in readonly mode,
 	 * and since a page has no links within other pages (siblings and parent)
@@ -1625,23 +1735,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
  * be concerned about concurrent page splits or page deletions.
  */
 static void
-bt_downlink_missing_check(BtreeCheckState *state)
+bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
+						  BlockNumber targetblock, Page target)
 {
-	BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
+	BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(target);
 	ItemId		itemid;
 	IndexTuple	itup;
 	Page		child;
 	BTPageOpaque copaque;
 	uint32		level;
 	BlockNumber childblk;
+	XLogRecPtr	targetlsn;
 
-	Assert(state->heapallindexed && state->readonly);
+	Assert(state->readonly);
 	Assert(!P_IGNORE(topaque));
 
 	/* No next level up with downlinks to fingerprint from the true root */
 	if (P_ISROOT(topaque))
 		return;
 
+	targetlsn = PageGetLSN(target);
+
 	/*
 	 * Incomplete (interrupted) page splits can account for the lack of a
 	 * downlink.  Some inserting transaction should eventually complete the
@@ -1663,26 +1777,20 @@ bt_downlink_missing_check(BtreeCheckState *state)
 	 * by design, so it shouldn't be taken to indicate corruption.  See
 	 * _bt_pagedel() for full details.
 	 */
-	if (state->rightsplit)
+	if (rightsplit)
 	{
 		ereport(DEBUG1,
 				(errcode(ERRCODE_NO_DATA),
 				 errmsg("harmless interrupted page split detected in index %s",
 						RelationGetRelationName(state->rel)),
 				 errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
-									state->targetblock, topaque->btpo.level,
+									targetblock, topaque->btpo.level,
 									topaque->btpo_prev,
-									(uint32) (state->targetlsn >> 32),
-									(uint32) state->targetlsn)));
+									(uint32) (targetlsn >> 32),
+									(uint32) targetlsn)));
 		return;
 	}
 
-	/* Target's downlink is typically present in parent/fingerprinted */
-	if (!bloom_lacks_element(state->downlinkfilter,
-							 (unsigned char *) &state->targetblock,
-							 sizeof(BlockNumber)))
-		return;
-
 	/*
 	 * Target is probably the "top parent" of a multi-level page deletion.
 	 * We'll need to descend the subtree to make sure that descendant pages
@@ -1699,9 +1807,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
 				 errmsg("leaf index block lacks downlink in index \"%s\"",
 						RelationGetRelationName(state->rel)),
 				 errdetail_internal("Block=%u page lsn=%X/%X.",
-									state->targetblock,
-									(uint32) (state->targetlsn >> 32),
-									(uint32) state->targetlsn)));
+									targetblock,
+									(uint32) (targetlsn >> 32),
+									(uint32) targetlsn)));
 
 	/* Descend from the target page, which is an internal page */
 	elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
@@ -1729,7 +1837,7 @@ bt_downlink_missing_check(BtreeCheckState *state)
 					 errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
 									 RelationGetRelationName(state->rel)),
 					 errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
-										state->targetblock, childblk,
+										targetblock, childblk,
 										level - 1, copaque->btpo.level)));
 
 		level = copaque->btpo.level;
@@ -1767,9 +1875,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
 				 errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
 								 RelationGetRelationName(state->rel)),
 				 errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
-									state->targetblock, childblk,
-									(uint32) (state->targetlsn >> 32),
-									(uint32) state->targetlsn)));
+									targetblock, childblk,
+									(uint32) (targetlsn >> 32),
+									(uint32) targetlsn)));
 
 	/*
 	 * Iff leaf page is half-dead, its high key top parent link should point
@@ -1785,7 +1893,7 @@ bt_downlink_missing_check(BtreeCheckState *state)
 	{
 		itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
 		itup = (IndexTuple) PageGetItem(child, itemid);
-		if (BTreeTupleGetTopParent(itup) == state->targetblock)
+		if (BTreeTupleGetTopParent(itup) == targetblock)
 			return;
 	}
 
@@ -1794,9 +1902,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
 			 errmsg("internal index block lacks downlink in index \"%s\"",
 					RelationGetRelationName(state->rel)),
 			 errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
-								state->targetblock, topaque->btpo.level,
-								(uint32) (state->targetlsn >> 32),
-								(uint32) state->targetlsn)));
+								targetblock, topaque->btpo.level,
+								(uint32) (targetlsn >> 32),
+								(uint32) targetlsn)));
 }
 
 /*
