Hi,

While finalizing some fixes in BRIN, I decided to stress-test the
relevant part of the code to check if I missed something. Imagine a
simple script that builds BRIN indexes on random data, makes random
changes, and cross-checks the results with/without the index.

But instead, I almost immediately ran into an LWLock deadlock :-(

I've managed to reproduce this on PG13+, but I believe it's been there
since brinRevmapDesummarizeRange was introduced in PG10 - I just haven't
tried it on pre-13 releases.

The stress-test-2.sh script (in the attached .tgz) builds a table, fills
it with random data, and then runs a mix of updates and (de)summarization
DDL on random fractions of the index. The lockup is usually triggered
within a couple of minutes, but it might take longer (I guess it depends
on the parameters used to generate the random data, so it may take a
couple of runs to hit the right combination).


The root cause is that brin_doupdate and brinRevmapDesummarizeRange end
up locking buffers in different orders. Attached is also a .patch that
adds a bunch of LOG messages for buffer locking in BRIN code (it's for
PG13, but should work on newer releases too).

Here's a fairly typical example of the interaction between brin_doupdate
and brinRevmapDesummarizeRange:

brin_doupdate (from UPDATE query):

  LOG:  brin_doupdate: samepage 0
  LOG:  number of LWLocks held: 0
  LOG:  brin_getinsertbuffer: locking 898 lock 0x7f9a99a5af64
  LOG:  brin_getinsertbuffer: buffer locked
  LOG:  brin_getinsertbuffer B: locking 899 lock 0x7f9a99a5afa4
  LOG:  brin_getinsertbuffer B: buffer locked
  LOG:  number of LWLocks held: 2
  LOG:  lock 0 => 0x7f9a99a5af64
  LOG:  lock 1 => 0x7f9a99a5afa4
  LOG:  brin_doupdate: locking buffer for update
  LOG:  brinLockRevmapPageForUpdate: locking 158 lock 0x7f9a99a4f664

brinRevmapDesummarizeRange (from brin_desummarize_range):

  LOG:  starting brinRevmapDesummarizeRange
  LOG:  number of LWLocks held: 0
  LOG:  brinLockRevmapPageForUpdate: locking 158 lock 0x7f9a99a4f664
  LOG:  brinLockRevmapPageForUpdate: buffer locked
  LOG:  number of LWLocks held: 1
  LOG:  lock 0 => 0x7f9a99a4f664
  LOG:  brinRevmapDesummarizeRange: locking 898 lock 0x7f9a99a5af64

So brin_doupdate starts with no LWLocks held, locks buffers 898 and 899
(through brin_getinsertbuffer), and then tries to lock 158.

Meanwhile brinRevmapDesummarizeRange locks 158 first, and then tries to
lock 898.

So, an LWLock deadlock :-(

I've now seen a bunch of these traces, with only minor differences. For
example, brinRevmapDesummarizeRange might get stuck on the second buffer
locked by brin_getinsertbuffer (not the first one, as here).


I don't have a great idea how to fix this - I guess we need to ensure
the buffers are locked in the same order, but that seems tricky.
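
To make that a bit more concrete, here's a rough sketch (untested, and
the revmap_peek_entry / revmap_entry_matches helpers are made up) of what
"same order" might look like inside brinRevmapDesummarizeRange, reusing
the existing revmap/idxrel/heapBlk variables - lock the regular page
first and the revmap page last, the way brin_doupdate does, and re-check
the revmap entry once both locks are held:

  for (;;)
  {
      ItemPointerData iptr;
      Buffer          regBuf;
      Buffer          revmapBuf;

      /* read the revmap entry without keeping the revmap page locked */
      if (!revmap_peek_entry(revmap, heapBlk, &iptr))     /* hypothetical */
          return true;        /* nothing to desummarize */

      /* lock the regular page first, matching brin_doupdate */
      regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(&iptr));
      LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE);

      /* only now lock the revmap page */
      revmapBuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

      /* did the entry change while we held no lock? if so, retry */
      if (!revmap_entry_matches(revmap, heapBlk, &iptr))  /* hypothetical */
      {
          LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
          UnlockReleaseBuffer(regBuf);
          continue;
      }

      /* ... clear the revmap entry and the regular tuple, WAL-log ... */

      LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
      UnlockReleaseBuffer(regBuf);
      return true;
  }

The annoying part is the retry loop - because the revmap page is not kept
locked while reading the entry, the regular tuple may get moved elsewhere
in the meantime, so we have to be prepared to start over.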

Obviously, people don't call brin_desummarize_range() very often, which
likely explains the lack of reports.


regards

-- 
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 0becfde113..65e4ed76b4 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -689,7 +689,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 
 	meta = ReadBuffer(index, P_NEW);
 	Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
+	elog(LOG, "brinbuild: locking buffer %d lock %p", meta, BufferDescriptorGetContentLock2(meta));
 	LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
+	elog(LOG, "brinbuild: buffer locked");
 
 	brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
 					   BRIN_CURRENT_VERSION);
@@ -756,7 +758,9 @@ brinbuildempty(Relation index)
 	/* An empty BRIN index has a metapage only. */
 	metabuf =
 		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+	elog(LOG, "brinbuildempty: locking buffer %d lock %p", metabuf, BufferDescriptorGetContentLock2(metabuf));
 	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+	elog(LOG, "brinbuildempty: buffer locked");
 
 	/* Initialize and xlog metabuffer. */
 	START_CRIT_SECTION();
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index d5b19293a0..d4a118eb54 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -78,6 +78,10 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 		return false;			/* keep compiler quiet */
 	}
 
+	elog(LOG, "brin_doupdate: samepage %d", samepage);
+
+	DumpHeldLWLocks();
+
 	/* make sure the revmap is long enough to contain the entry we need */
 	brinRevmapExtend(revmap, heapBlk);
 
@@ -106,13 +110,18 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 	}
 	else
 	{
+		elog(LOG, "brin_doupdate: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf));
 		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+		elog(LOG, "brin_doupdate: buffer locked");
 		newbuf = InvalidBuffer;
 		extended = false;
 	}
 	oldpage = BufferGetPage(oldbuf);
 	oldlp = PageGetItemId(oldpage, oldoff);
 
+
+	DumpHeldLWLocks();
+
 	/*
 	 * Check that the old tuple wasn't updated concurrently: it might have
 	 * moved someplace else entirely, and for that matter the whole page
@@ -125,6 +134,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 		oldoff > PageGetMaxOffsetNumber(oldpage) ||
 		!ItemIdIsNormal(oldlp))
 	{
+		elog(LOG, "brin_doupdate A: unlock %d new %d", oldbuf, newbuf);
 		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 
 		/*
@@ -151,6 +161,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 	 */
 	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
 	{
+		elog(LOG, "brin_doupdate B: unlock %d new %d", oldbuf, newbuf);
 		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 		if (BufferIsValid(newbuf))
 		{
@@ -223,6 +234,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 		 * Not enough space, but caller said that there was. Tell them to
 		 * start over.
 		 */
+		elog(LOG, "brin_doupdate C: unlock %d", oldbuf);
 		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 		return false;
 	}
@@ -238,7 +250,9 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 		OffsetNumber newoff;
 		Size		freespace = 0;
 
+		elog(LOG, "brin_doupdate: locking buffer for update");
 		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+		elog(LOG, "brin_doupdate: buffer locked");
 
 		START_CRIT_SECTION();
 
@@ -303,6 +317,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 
 		END_CRIT_SECTION();
 
+		elog(LOG, "brin_doupdate D: unlock %d %d %d", revmapbuf, oldbuf, newbuf);
 		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
 		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 		UnlockReleaseBuffer(newbuf);
@@ -378,7 +393,9 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
 		 * revmap over the page we held a pin on, so we cannot assume that
 		 * it's still a regular page.
 		 */
+		elog(LOG, "brin_doinsert: locking buffer %d lock %p", *buffer, BufferDescriptorGetContentLock2(*buffer));
 		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+		elog(LOG, "brin_doinsert: buffer locked");
 		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
 		{
 			UnlockReleaseBuffer(*buffer);
@@ -641,7 +658,9 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
 		LockRelationForExtension(idxrel, ShareLock);
 		UnlockRelationForExtension(idxrel, ShareLock);
 
+		elog(LOG, "brin_page_cleanup: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		elog(LOG, "brin_page_cleanup: buffer locked");
 		if (PageIsNew(page))
 		{
 			brin_initialize_empty_new_buffer(idxrel, buf);
@@ -764,7 +783,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 		 */
 		if (BufferIsValid(oldbuf) && oldblk < newblk)
 		{
+			elog(LOG, "brin_getinsertbuffer: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf));
 			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+			elog(LOG, "brin_getinsertbuffer: buffer locked");
 			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
 			{
 				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
@@ -796,7 +817,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 			}
 		}
 
+		elog(LOG, "brin_getinsertbuffer B: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		elog(LOG, "brin_getinsertbuffer B: buffer locked");
 
 		if (extensionLockHeld)
 			UnlockRelationForExtension(irel, ExclusiveLock);
@@ -823,7 +846,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 			 */
 			if (BufferIsValid(oldbuf) && oldblk > newblk)
 			{
+				elog(LOG, "brin_getinsertbuffer C: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf));
 				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+				elog(LOG, "brin_getinsertbuffer C: buffer locked");
 				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
 			}
 
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
index 35746714a7..072d82514c 100644
--- a/src/backend/access/brin/brin_revmap.c
+++ b/src/backend/access/brin/brin_revmap.c
@@ -139,7 +139,9 @@ brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
 	Buffer		rmBuf;
 
 	rmBuf = revmap_get_buffer(revmap, heapBlk);
+	elog(LOG, "brinLockRevmapPageForUpdate: locking buffer %d lock %p", rmBuf, BufferDescriptorGetContentLock2(rmBuf));
 	LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE);
+	elog(LOG, "brinLockRevmapPageForUpdate: buffer locked");
 
 	return rmBuf;
 }
@@ -341,6 +343,9 @@ brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
 	OffsetNumber regOffset;
 	ItemId		lp;
 
+	elog(LOG, "starting brinRevmapDesummarizeRange");
+	DumpHeldLWLocks();
+
 	revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL);
 
 	revmapBlk = revmap_get_blkno(revmap, heapBlk);
@@ -368,8 +373,12 @@ brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
 		return true;
 	}
 
+	DumpHeldLWLocks();
+
 	regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr));
+	elog(LOG, "brinRevmapDesummarizeRange: locking buffer %d lock %p", regBuf, BufferDescriptorGetContentLock2(regBuf));
 	LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE);
+	elog(LOG, "brinRevmapDesummarizeRange: buffer locked");
 	regPg = BufferGetPage(regBuf);
 	/*
 	 * We're only removing data, not reading it, so there's no need to
@@ -544,7 +553,9 @@ revmap_physical_extend(BrinRevmap *revmap)
 	 * but note that we still need to grab the relation extension lock because
 	 * another backend can extend the index with regular BRIN pages.
 	 */
+	elog(LOG, "revmap_physical_extend: locking buffer %d lock %p", revmap->rm_metaBuf, BufferDescriptorGetContentLock2(revmap->rm_metaBuf));
 	LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
+	elog(LOG, "revmap_physical_extend: buffer locked");
 	metapage = BufferGetPage(revmap->rm_metaBuf);
 	metadata = (BrinMetaPageData *) PageGetContents(metapage);
 
@@ -564,7 +575,9 @@ revmap_physical_extend(BrinRevmap *revmap)
 	if (mapBlk < nblocks)
 	{
 		buf = ReadBuffer(irel, mapBlk);
+		elog(LOG, "revmap_physical_extend B: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		elog(LOG, "revmap_physical_extend B: buffer locked");
 		page = BufferGetPage(buf);
 	}
 	else
@@ -587,7 +600,9 @@ revmap_physical_extend(BrinRevmap *revmap)
 			ReleaseBuffer(buf);
 			return;
 		}
+		elog(LOG, "revmap_physical_extend C: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		elog(LOG, "revmap_physical_extend C: buffer locked");
 		page = BufferGetPage(buf);
 
 		if (needLock)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 69a1401654..0a9231f649 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3740,6 +3740,16 @@ LockBuffer(Buffer buffer, int mode)
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
 
+void *
+BufferDescriptorGetContentLock2(Buffer buffer)
+{
+	BufferDesc *buf;
+
+	buf = GetBufferDescriptor(buffer - 1);
+
+	return BufferDescriptorGetContentLock(buf);
+}
+
 /*
  * Acquire the content_lock for the buffer, but only if we don't have to wait.
  *
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 365d215f2b..8952226e1d 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -1984,3 +1984,11 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
 	}
 	return false;
 }
+
+void
+DumpHeldLWLocks(void)
+{
+	elog(LOG, "number of LWLocks held: %d", num_held_lwlocks);
+	for (int i = 0; i < num_held_lwlocks; i++)
+		elog(LOG, "lock %d => %p", i, held_lwlocks[i].lock);
+}
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8fa26..d1325154ab 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -168,6 +168,8 @@ extern PGDLLIMPORT int32 *LocalRefCount;
  */
 #define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer))
 
+extern void *BufferDescriptorGetContentLock2(Buffer buffer);
+
 /*
  * prototypes for functions in bufmgr.c
  */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index cdbfbed118..bd34994013 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -187,6 +187,8 @@ extern int	LWLockNewTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
+extern void DumpHeldLWLocks(void);
+
 /*
  * Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also,
  * we reserve additional tranche IDs for builtin tranches not included in

Attachment: stress-test.tgz