Hi, While finalizing some fixes in BRIN, I decided to stress-test the relevant part of the code to check if I missed something. Imagine a simple script that builds BRIN indexes on random data, does random changes and cross-checks the results with/without the index.
But instead of I almost immediately ran into a LWLock deadlock :-( I've managed to reproduce this on PG13+, but I believe it's there since the brinRevmapDesummarizeRange was introduced in PG10. I just haven't tried on pre-13 releases. The stress-test-2.sh script (attached .tgz) builds a table, fills it with random data and then runs a mix of updates and (de)summarization DDL of random fraction of the index. The lockup is usually triggered within a couple minutes, but might take longer (I guess it depends on parameters used to generate the random data, so it may take a couple runs to hit the right combination). The root cause is that brin_doupdate and brinRevmapDesummarizeRange end up locking buffers in different orders. Attached is also a .patch that adds a bunch of LOG messages for buffer locking in BRIN code (it's for PG13, but should work on newer releases too). Here's a fairly typical example of the interaction between brin_doupdate and brinRevmapDesummarizeRange: brin_doupdate (from UPDATE query): LOG: brin_doupdate: samepage 0 LOG: number of LWLocks held: 0 LOG: brin_getinsertbuffer: locking 898 lock 0x7f9a99a5af64 LOG: brin_getinsertbuffer: buffer locked LOG: brin_getinsertbuffer B: locking 899 lock 0x7f9a99a5afa4 LOG: brin_getinsertbuffer B: buffer locked LOG: number of LWLocks held: 2 LOG: lock 0 => 0x7f9a99a5af64 LOG: lock 1 => 0x7f9a99a5afa4 LOG: brin_doupdate: locking buffer for update LOG: brinLockRevmapPageForUpdate: locking 158 lock 0x7f9a99a4f664 brinRevmapDesummarizeRange (from brin_desummarize_range): LOG: starting brinRevmapDesummarizeRange LOG: number of LWLocks held: 0 LOG: brinLockRevmapPageForUpdate: locking 158 lock 0x7f9a99a4f664 LOG: brinLockRevmapPageForUpdate: buffer locked LOG: number of LWLocks held: 1 LOG: lock 0 => 0x7f9a99a4f664 LOG: brinRevmapDesummarizeRange: locking 898 lock 0x7f9a99a5af64 So, brin_doupdate starts with no LWLocks, and locks buffers 898, 899 (through getinsertbuffer). And then tries to lock 158. Meanwhile brinRevmapDesummarizeRange locks 158 first, and then tries to lock 898. So, a LWLock deadlock :-( I've now seen a bunch of these traces, with only minor differences. For example brinRevmapDesummarizeRange might gets stuck on the second buffer locked by getinsertbuffer (not the first one like here). I don't have a great idea how to fix this - I guess we need to ensure the buffers are locked in the same order, but that seems tricky. Obviously, people don't call brin_desummarize_range() very often, which likely explains the lack of reports. regards -- Tomas Vondra EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 0becfde113..65e4ed76b4 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -689,7 +689,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) meta = ReadBuffer(index, P_NEW); Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO); + elog(LOG, "brinbuild: locking buffer %d lock %p", meta, BufferDescriptorGetContentLock2(meta)); LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brinbuild: buffer locked"); brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index), BRIN_CURRENT_VERSION); @@ -756,7 +758,9 @@ brinbuildempty(Relation index) /* An empty BRIN index has a metapage only. */ metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + elog(LOG, "brinbuildempty: locking buffer %d lock %p", metabuf, BufferDescriptorGetContentLock2(metabuf)); LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brinbuildempty: buffer locked"); /* Initialize and xlog metabuffer. */ START_CRIT_SECTION(); diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index d5b19293a0..d4a118eb54 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -78,6 +78,10 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, return false; /* keep compiler quiet */ } + elog(LOG, "brin_doupdate: samepage %d", samepage); + + DumpHeldLWLocks(); + /* make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); @@ -106,13 +110,18 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, } else { + elog(LOG, "brin_doupdate: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf)); LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brin_doupdate: buffer locked"); newbuf = InvalidBuffer; extended = false; } oldpage = BufferGetPage(oldbuf); oldlp = PageGetItemId(oldpage, oldoff); + + DumpHeldLWLocks(); + /* * Check that the old tuple wasn't updated concurrently: it might have * moved someplace else entirely, and for that matter the whole page @@ -125,6 +134,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, oldoff > PageGetMaxOffsetNumber(oldpage) || !ItemIdIsNormal(oldlp)) { + elog(LOG, "brin_doupdate A: unlock %d new %d", oldbuf, newbuf); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* @@ -151,6 +161,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, */ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) { + elog(LOG, "brin_doupdate B: unlock %d new %d", oldbuf, newbuf); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) { @@ -223,6 +234,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, * Not enough space, but caller said that there was. Tell them to * start over. */ + elog(LOG, "brin_doupdate C: unlock %d", oldbuf); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return false; } @@ -238,7 +250,9 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, OffsetNumber newoff; Size freespace = 0; + elog(LOG, "brin_doupdate: locking buffer for update"); revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); + elog(LOG, "brin_doupdate: buffer locked"); START_CRIT_SECTION(); @@ -303,6 +317,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, END_CRIT_SECTION(); + elog(LOG, "brin_doupdate D: unlock %d %d %d", revmapbuf, oldbuf, newbuf); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); @@ -378,7 +393,9 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, * revmap over the page we held a pin on, so we cannot assume that * it's still a regular page. */ + elog(LOG, "brin_doinsert: locking buffer %d lock %p", *buffer, BufferDescriptorGetContentLock2(*buffer)); LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brin_doinsert: buffer locked"); if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz) { UnlockReleaseBuffer(*buffer); @@ -641,7 +658,9 @@ brin_page_cleanup(Relation idxrel, Buffer buf) LockRelationForExtension(idxrel, ShareLock); UnlockRelationForExtension(idxrel, ShareLock); + elog(LOG, "brin_page_cleanup: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf)); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brin_page_cleanup: buffer locked"); if (PageIsNew(page)) { brin_initialize_empty_new_buffer(idxrel, buf); @@ -764,7 +783,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, */ if (BufferIsValid(oldbuf) && oldblk < newblk) { + elog(LOG, "brin_getinsertbuffer: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf)); LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brin_getinsertbuffer: buffer locked"); if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); @@ -796,7 +817,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, } } + elog(LOG, "brin_getinsertbuffer B: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf)); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brin_getinsertbuffer B: buffer locked"); if (extensionLockHeld) UnlockRelationForExtension(irel, ExclusiveLock); @@ -823,7 +846,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, */ if (BufferIsValid(oldbuf) && oldblk > newblk) { + elog(LOG, "brin_getinsertbuffer C: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf)); LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brin_getinsertbuffer: buffer locked"); Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))); } diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index 35746714a7..072d82514c 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -139,7 +139,9 @@ brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk) Buffer rmBuf; rmBuf = revmap_get_buffer(revmap, heapBlk); + elog(LOG, "brinLockRevmapPageForUpdate: locking buffer %d lock %p", rmBuf, BufferDescriptorGetContentLock2(rmBuf)); LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brinLockRevmapPageForUpdate: buffer locked"); return rmBuf; } @@ -341,6 +343,9 @@ brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk) OffsetNumber regOffset; ItemId lp; + elog(LOG, "starting brinRevmapDesummarizeRange"); + DumpHeldLWLocks(); + revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL); revmapBlk = revmap_get_blkno(revmap, heapBlk); @@ -368,8 +373,12 @@ brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk) return true; } + DumpHeldLWLocks(); + regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr)); + elog(LOG, "brinRevmapDesummarizeRange: locking buffer %d lock %p", regBuf, BufferDescriptorGetContentLock2(regBuf)); LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "brinRevmapDesummarizeRange: buffer locked"); regPg = BufferGetPage(regBuf); /* * We're only removing data, not reading it, so there's no need to @@ -544,7 +553,9 @@ revmap_physical_extend(BrinRevmap *revmap) * but note that we still need to grab the relation extension lock because * another backend can extend the index with regular BRIN pages. */ + elog(LOG, "revmap_physical_extend: locking buffer %d lock %p", revmap->rm_metaBuf, BufferDescriptorGetContentLock2(revmap->rm_metaBuf)); LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "revmap_physical_extend: buffer locked"); metapage = BufferGetPage(revmap->rm_metaBuf); metadata = (BrinMetaPageData *) PageGetContents(metapage); @@ -564,7 +575,9 @@ revmap_physical_extend(BrinRevmap *revmap) if (mapBlk < nblocks) { buf = ReadBuffer(irel, mapBlk); + elog(LOG, "revmap_physical_extend B: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf)); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "revmap_physical_extend B: buffer locked"); page = BufferGetPage(buf); } else @@ -587,7 +600,9 @@ revmap_physical_extend(BrinRevmap *revmap) ReleaseBuffer(buf); return; } + elog(LOG, "revmap_physical_extend C: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf)); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + elog(LOG, "revmap_physical_extend C: buffer locked"); page = BufferGetPage(buf); if (needLock) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 69a1401654..0a9231f649 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3740,6 +3740,16 @@ LockBuffer(Buffer buffer, int mode) elog(ERROR, "unrecognized buffer lock mode: %d", mode); } +void * +BufferDescriptorGetContentLock2(Buffer buffer) +{ + BufferDesc *buf; + + buf = GetBufferDescriptor(buffer - 1); + + return BufferDescriptorGetContentLock(buf); +} + /* * Acquire the content_lock for the buffer, but only if we don't have to wait. * diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 365d215f2b..8952226e1d 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -1984,3 +1984,11 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode) } return false; } + +void +DumpHeldLWLocks(void) +{ + elog(LOG, "number of LWLocks held: %d", num_held_lwlocks); + for (int i = 0; i < num_held_lwlocks; i++) + elog(LOG, "lock %d => %p", i, held_lwlocks[i].lock); +} diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index ee91b8fa26..d1325154ab 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -168,6 +168,8 @@ extern PGDLLIMPORT int32 *LocalRefCount; */ #define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer)) +extern void *BufferDescriptorGetContentLock2(Buffer desc); + /* * prototypes for functions in bufmgr.c */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index cdbfbed118..bd34994013 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -187,6 +187,8 @@ extern int LWLockNewTrancheId(void); extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name); extern void LWLockInitialize(LWLock *lock, int tranche_id); +extern void DumpHeldLWLocks(void); + /* * Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also, * we reserve additional tranche IDs for builtin tranches not included in
stress-test.tgz
Description: application/compressed-tar