On 30.07.2020 16:40, Anastasia Lubennikova wrote:
While testing this fix, Alexander Lakhin spotted another problem.

After a few runs, it will fail with "ERROR: corrupted BRIN index: inconsistent range map"

The problem is caused by a race in page locking in brinGetTupleForHeapBlock [1]:

(1) bitmapscan locks revmap->rm_currBuf, finds the address of the tuple on a regular page "page", and then unlocks revmap->rm_currBuf;
(2) in another transaction, desummarize locks both revmap->rm_currBuf and "page", cleans up the tuple, and unlocks both buffers;
(3) bitmapscan locks the buffer containing "page", attempts to access the tuple, and fails to find it.
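
To make the window easier to see, here is a minimal standalone sketch of that interleaving. This is not PostgreSQL code: the pthread rwlocks and the revmap_offset/page_item_used variables merely stand in for the buffer content locks on revmap->rm_currBuf and on the regular page, and the three steps are executed in one thread in exactly the order above.

/* race_sketch.c -- illustrative only; build with: cc race_sketch.c -lpthread */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t revmap_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t page_lock = PTHREAD_RWLOCK_INITIALIZER;

static int  revmap_offset = 3;              /* revmap entry: tuple is at offset 3 */
static bool page_item_used[8] = { [3] = true };

int
main(void)
{
	int		off;

	/* (1) bitmapscan: read the tuple address under the revmap lock, then release it */
	pthread_rwlock_rdlock(&revmap_lock);
	off = revmap_offset;
	pthread_rwlock_unlock(&revmap_lock);

	/* (2) desummarize runs in the window: takes both locks, clears the entry and the tuple */
	pthread_rwlock_wrlock(&revmap_lock);
	pthread_rwlock_wrlock(&page_lock);
	revmap_offset = 0;
	page_item_used[off] = false;
	pthread_rwlock_unlock(&page_lock);
	pthread_rwlock_unlock(&revmap_lock);

	/* (3) bitmapscan: lock the regular page and look up the now-stale offset */
	pthread_rwlock_rdlock(&page_lock);
	if (!page_item_used[off])
		printf("offset %d is gone: \"inconsistent range map\" would be raised\n", off);
	pthread_rwlock_unlock(&page_lock);

	return 0;
}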


At first, I tried to fix it by holding the lock on revmap->rm_currBuf until we had locked the regular page, but that causes a deadlock with brinsummarize(), which can easily be reproduced with the same test as above. Is there any rule about the order in which revmap pages and regular pages must be locked in BRIN? I haven't found anything about it in the README.

As an alternative, we can leave the locking as is and add a recheck before throwing the error.

Here are the updated patches for both problems.

1) brin_summarize_fix_REL_12_v2.patch fixes
"failed to find parent tuple for heap-only tuple at (50661,130) in table "tbl""

This patch checks that we only access initialized entries of the root_offsets[] array. If necessary, the map is built again. One recheck is enough here, since concurrent pruning is not possible. (A simplified standalone sketch of this bound-and-recheck approach follows the patch descriptions below.)

2) brin_pagelock_fix_REL_12_v1.patch fixes
"ERROR: corrupted BRIN index: inconsistent range map"

This patch adds a recheck, as suggested in the previous message.
I am not sure whether one recheck is enough to eliminate the race completely, but the problem can no longer be reproduced.
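
To illustrate the approach of the first patch, here is a simplified standalone sketch. It is not the actual heapam code: build_root_map, MAX_OFFSET and UNUSED_SENTINEL are made-up stand-ins for heap_get_root_tuples, MaxOffsetNumber and the (MaxOffsetNumber + 1) marker. The map builder reports how far the map is valid, and the caller rebuilds it when it hits an offset beyond that bound or one marked as unused.

/* root_map_sketch.c -- simplified illustration, not the actual heapam code */
#include <stdint.h>
#include <stdio.h>

#define MAX_OFFSET		2048				/* stand-in for MaxOffsetNumber */
#define UNUSED_SENTINEL	(MAX_OFFSET + 1)	/* stand-in for (MaxOffsetNumber + 1) */

typedef uint16_t OffsetNo;					/* 2049 still fits in 16 bits */

/*
 * Pretend to build the root-offset map for a page that currently holds
 * page_max items, and return the highest offset the map covers.
 */
static OffsetNo
build_root_map(OffsetNo *map, OffsetNo page_max)
{
	OffsetNo	off;

	for (off = 1; off <= page_max; off++)
		map[off - 1] = off;					/* every item is its own root here */
	return page_max;
}

int
main(void)
{
	OffsetNo	map[MAX_OFFSET];
	OffsetNo	map_size;
	OffsetNo	target = 12;				/* tuple inserted after the map was built */

	map_size = build_root_map(map, 10);		/* the page had 10 items at that time */

	/* Caller-side recheck: rebuild the map if the entry might be stale. */
	if (target > map_size || map[target - 1] == UNUSED_SENTINEL)
		map_size = build_root_map(map, 12);	/* the page has grown meanwhile */

	printf("root of %d is %d (map covers %d items)\n",
		   target, map[target - 1], map_size);
	return 0;
}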

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

commit 0b1fbfb34236882591c5ac6665a407d9780f4017
Author: anastasia <a.lubennik...@postgrespro.ru>
Date:   Wed Aug 5 15:56:10 2020 +0300

    fix race in BRIN page locking

diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
index 7dcb1cd73f..89724c3bd4 100644
--- a/src/backend/access/brin/brin_revmap.c
+++ b/src/backend/access/brin/brin_revmap.c
@@ -207,6 +207,7 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
 	ItemId		lp;
 	BrinTuple  *tup;
 	ItemPointerData previptr;
+	bool		in_retry = false;
 
 	/* normalize the heap block number to be the first page in the range */
 	heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
@@ -283,9 +284,23 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
 		if (BRIN_IS_REGULAR_PAGE(page))
 		{
 			if (*off > PageGetMaxOffsetNumber(page))
-				ereport(ERROR,
-						(errcode(ERRCODE_INDEX_CORRUPTED),
-						 errmsg_internal("corrupted BRIN index: inconsistent range map")));
+			{
+				if (in_retry)
+					ereport(ERROR,
+							(errcode(ERRCODE_INDEX_CORRUPTED),
+							 errmsg_internal("corrupted BRIN index: inconsistent range map")));
+				else
+				{
+					/*
+					 * Assume that the revmap was updated concurrently
+					 * and retry.  Retry only once, to avoid looping
+					 * forever if the index really is corrupted.
+					 */
+					LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+					in_retry = true;
+					continue;
+				}
+			}
 			lp = PageGetItemId(page, *off);
 			if (ItemIdIsUsed(lp))
 			{
commit 16f0387dbeba3570836b506241bf0a1820de3390
Author: anastasia <a.lubennik...@postgrespro.ru>
Date:   Mon Aug 10 20:07:22 2020 +0300

    brin_summarize_fix_REL_12_v2.patch

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 267a6ee25a..c651d8b04e 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1159,6 +1159,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 	BlockNumber previous_blkno = InvalidBlockNumber;
 	BlockNumber root_blkno = InvalidBlockNumber;
 	OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+	OffsetNumber root_offsets_size = 0;
 
 	/*
 	 * sanity checks
@@ -1324,6 +1325,11 @@ heapam_index_build_range_scan(Relation heapRelation,
 		 * buffer continuously while visiting the page, so no pruning
 		 * operation can occur either.
 		 *
+		 * It is essential, though, to check the root_offsets_size bound
+		 * before accessing the array, because concurrently inserted HOT tuples
+		 * don't have a valid cached root offset and we need to build the map
+		 * once again for them.
+		 *
 		 * Also, although our opinions about tuple liveness could change while
 		 * we scan the page (due to concurrent transaction commits/aborts),
 		 * the chain root locations won't, so this info doesn't need to be
@@ -1338,7 +1344,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 			Page		page = BufferGetPage(hscan->rs_cbuf);
 
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
-			heap_get_root_tuples(page, root_offsets);
+			root_offsets_size = heap_get_root_tuples(page, root_offsets);
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 
 			root_blkno = hscan->rs_cblock;
@@ -1625,6 +1631,25 @@ heapam_index_build_range_scan(Relation heapRelation,
 
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			/*
+			 * Since we do not hold the buffer lock, a concurrent insertion
+			 * may have happened after the map was built.  If so, rebuild
+			 * the map to find the root offset of the new tuple.
+			 * (MaxOffsetNumber + 1) is a special value marking entries that
+			 * were unused or dead when the map was built.
+			 */
+			if (root_offsets_size < offnum ||
+				root_offsets[offnum - 1] == (MaxOffsetNumber+1))
+			{
+				Page	page = BufferGetPage(hscan->rs_cbuf);
+
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+				root_offsets_size = heap_get_root_tuples(page, root_offsets);
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
+			Assert(root_offsets_size >= offnum);
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 256df4de10..3810481df7 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -732,7 +732,9 @@ heap_page_prune_execute(Buffer buffer,
  * root_offsets[k - 1] = j.
  *
  * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
- * We zero out all unused entries.
+ * The return value is the highest offset number for which the map holds a
+ * valid entry; the caller must never access entries beyond this bound.
+ * Unused and dead items are marked with the special value (MaxOffsetNumber + 1).
  *
  * The function must be called with at least share lock on the buffer, to
  * prevent concurrent prune operations.
@@ -740,12 +742,13 @@ heap_page_prune_execute(Buffer buffer,
  * Note: The information collected here is valid only as long as the caller
  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
  * and reused by a completely unrelated tuple.
+ *
  */
-void
+OffsetNumber
 heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 {
-	OffsetNumber offnum,
-				maxoff;
+	OffsetNumber offnum, maxoff;
+	OffsetNumber last_valid_offnum = 0;
 
 	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
@@ -759,7 +762,14 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 
 		/* skip unused and dead items */
 		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
+		{
+			/*
+			 * Mark skipped (unused or dead) items with the special value
+			 * (MaxOffsetNumber + 1) so that callers can tell them apart.
+			 */
+			root_offsets[offnum - 1] = (MaxOffsetNumber+1);
 			continue;
+		}
 
 		if (ItemIdIsNormal(lp))
 		{
@@ -778,6 +788,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			 * Remember it in the mapping.
 			 */
 			root_offsets[offnum - 1] = offnum;
+			if (offnum > last_valid_offnum)
+				last_valid_offnum = offnum;
 
 			/* If it's not the start of a HOT-chain, we're done with it */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -820,6 +832,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 
 			/* Remember the root line pointer for this item */
 			root_offsets[nextoffnum - 1] = offnum;
+			if (nextoffnum > last_valid_offnum)
+				last_valid_offnum = nextoffnum;
 
 			/* Advance to next chain member, if any */
 			if (!HeapTupleHeaderIsHotUpdated(htup))
@@ -832,4 +846,6 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 		}
 	}
+
+	return last_valid_offnum;
 }
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index b31de38910..b7f570887e 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -180,7 +180,7 @@ extern void heap_page_prune_execute(Buffer buffer,
 									OffsetNumber *redirected, int nredirected,
 									OffsetNumber *nowdead, int ndead,
 									OffsetNumber *nowunused, int nunused);
-extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
+extern OffsetNumber heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/vacuumlazy.c */
 struct VacuumParams;

Reply via email to