One of our clients caught an error "failed to find parent tuple for heap-only tuple at (50661,130) in table "tbl'" in PostgreSQL v12.

Steps to reproduce (REL_12_STABLE):

1) Create table with primary key, create brin index, fill table with some initial data:

create table tbl (id int primary key, a int) with (fillfactor=50);
create index idx on tbl using brin (a) with (autosummarize=on);
insert into tbl select i, i from generate_series(0,100000) as i;

2) Run script test_brin.sql using pgbench:

 pgbench postgres -f ../review/brin_test.sql  -n -T 120

The script is a bit messy because I was trying to reproduce a problematic workload. Though I didn't manage to simplify it. The idea is that it inserts new values into the table to produce unindexed pages and also updates some values to trigger HOT-updates on these pages.

3) Open psql session and run brin_summarize_new_values

select brin_summarize_new_values('idx'::regclass::oid); \watch 2

Wait a bit. And in psql you will see the ERROR.

This error is caused by the problem with root_offsets array bounds. It occurs if a new HOT tuple was inserted after we've collected root_offsets, and thus we don't have root_offset for tuple's offnum. Concurrent insertions are possible, because brin_summarize_new_values() only holds ShareUpdateLock on table and no lock (only pin) on the page.

The draft fix is in the attachments. It saves root_offsets_size and checks that we only access valid fields.
Patch also adds some debug messages, just to ensure that problem was caught.

TODO:

- check if  heapam_index_validate_scan() has the same problem
- code cleanup
- test other PostgreSQL versions

[1] https://www.postgresql.org/message-id/flat/CA%2BTgmoYgwjmmjK24Qxb_vWAu8_Hh7gfVFcr3%2BR7ocdLvYOWJXg%40mail.gmail.com

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

commit 71bdcd268386c4240613b080b9ebd5ae935c1405
Author: anastasia <a.lubennik...@postgrespro.ru>
Date:   Thu Jul 23 17:55:16 2020 +0300

    WIP Fix root_offsets out of bound error in brin_summarize_new_values()

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index fc19f40a2e3..34f74a61ea3 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1176,6 +1176,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 	BlockNumber previous_blkno = InvalidBlockNumber;
 	BlockNumber root_blkno = InvalidBlockNumber;
 	OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+	OffsetNumber root_offsets_size = 0;
 
 	/*
 	 * sanity checks
@@ -1355,7 +1356,7 @@ heapam_index_build_range_scan(Relation heapRelation,
 			Page		page = BufferGetPage(hscan->rs_cbuf);
 
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
-			heap_get_root_tuples(page, root_offsets);
+			root_offsets_size = heap_get_root_tuples(page, root_offsets);
 			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 
 			root_blkno = hscan->rs_cblock;
@@ -1643,13 +1644,31 @@ heapam_index_build_range_scan(Relation heapRelation,
 			rootTuple = *heapTuple;
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
 
+			elog(LOG, "HeapTupleIsHeapOnly. Check Offset last_valid %d offnum %d", root_offsets_size, offnum);
+
+			if (root_offsets_size <= offnum)
+			{
+				Page		page = BufferGetPage(hscan->rs_cbuf);
+
+				ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+						errmsg_internal("DEBUG failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
+										ItemPointerGetBlockNumber(&heapTuple->t_self),
+										offnum,
+										RelationGetRelationName(heapRelation))));
+
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+				root_offsets_size = heap_get_root_tuples(page, root_offsets);
+				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+			}
+
 			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
 				ereport(ERROR,
-						(errcode(ERRCODE_DATA_CORRUPTED),
-						 errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
-										 ItemPointerGetBlockNumber(&heapTuple->t_self),
-										 offnum,
-										 RelationGetRelationName(heapRelation))));
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
+									ItemPointerGetBlockNumber(&heapTuple->t_self),
+									offnum,
+									RelationGetRelationName(heapRelation))));
 
 			ItemPointerSetOffsetNumber(&rootTuple.t_self,
 									   root_offsets[offnum - 1]);
@@ -1820,6 +1839,7 @@ heapam_index_validate_scan(Relation heapRelation,
 		if (HeapTupleIsHeapOnly(heapTuple))
 		{
 			root_offnum = root_offsets[root_offnum - 1];
+			// TODO add check of root_offsets_size like in heapam_index_build_range_scan()
 			if (!OffsetNumberIsValid(root_offnum))
 				ereport(ERROR,
 						(errcode(ERRCODE_DATA_CORRUPTED),
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 0efe3ce9995..9b2bff1b8fd 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -741,11 +741,11 @@ heap_page_prune_execute(Buffer buffer,
  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
  * and reused by a completely unrelated tuple.
  */
-void
+OffsetNumber
 heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 {
-	OffsetNumber offnum,
-				maxoff;
+	OffsetNumber offnum = 0;
+	OffsetNumber maxoff;
 
 	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
 
@@ -832,4 +832,5 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 		}
 	}
+	return offnum;
 }
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index dffb57bf11a..45f8622c6cb 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -181,7 +181,7 @@ extern void heap_page_prune_execute(Buffer buffer,
 									OffsetNumber *redirected, int nredirected,
 									OffsetNumber *nowdead, int ndead,
 									OffsetNumber *nowunused, int nunused);
-extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
+extern OffsetNumber heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/syncscan.c */
 extern void ss_report_location(Relation rel, BlockNumber location);

Attachment: brin_test.sql
Description: application/sql

Reply via email to