On 07.11.2013 22:49, Heikki Linnakangas wrote:
Gin page deletion fails to take into account that there might be a
search in-flight to the page that is deleted. If the page is reused for
something else, the search can get very confused.
...
The regular b-tree code solves this by stamping deleted pages with the
current XID, and only allowing them to be reused once that XID becomes
old enough (< RecentGlobalXmin). Another approach might be to grab a
cleanup-strength lock on the left and parent pages when deleting a page,
and to require a search to keep the pin on the page it's coming from, until
it has locked the next page.
I came up with the attached fix. In a nutshell, when walking along a
right-link, the new page is locked before releasing the lock on the old
one. Also, never delete the leftmost branch of a posting tree. I believe
these changes are sufficient to fix the problem, because of the way the
posting tree is searched:
All searches on a posting tree are "full searches", scanning the whole
tree from left to right. Insertions do seek to the middle of the tree,
but they are locked out during vacuum by holding a cleanup lock on the
posting tree root (even without this patch). Thanks to this property, a
search cannot step on a page that's being deleted by following a
downlink, if we refrain from deleting the leftmost page on a level, as
searches only descend down the leftmost branch.
It would be nice to improve that in master - holding an exclusive lock
on the root page is pretty horrible - but this is a nice back-patchable
patch. I'm not worried about the loss of concurrency caused by now having
to hold the lock on the previous page when stepping to the next page. In
searches, it's only a share-lock, and in insertions, it's very rare that
you have to step right.
The patch also adds some sanity checks to stepping right: the next page
should be of the same type as the current page, e.g. stepping right
should not go from leaf to non-leaf or vice versa.
- Heikki
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index 2a6be4b..a738d80 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -112,10 +112,8 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
/* rightmost page */
break;
+ stack->buffer = ginStepRight(stack->buffer, btree->index, access);
stack->blkno = rightlink;
- LockBuffer(stack->buffer, GIN_UNLOCK);
- stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
- LockBuffer(stack->buffer, access);
page = BufferGetPage(stack->buffer);
}
@@ -148,6 +146,41 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
}
}
+/*
+ * Step right from current page.
+ *
+ * The next page is locked first, before releasing the current page. This is
+ * crucial to protect from concurrent page deletion (see comment in
+ * ginDeletePage).
+ */
+Buffer
+ginStepRight(Buffer buffer, Relation index, int lockmode)
+{
+ Buffer nextbuffer;
+ Page page = BufferGetPage(buffer);
+ bool isLeaf = GinPageIsLeaf(page);
+ bool isData = GinPageIsData(page);
+ BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+
+ nextbuffer = ReadBuffer(index, blkno);
+ LockBuffer(nextbuffer, lockmode);
+ UnlockReleaseBuffer(buffer);
+
+ /* Sanity check that the page we stepped to is of similar kind. */
+ page = BufferGetPage(nextbuffer);
+ if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
+ elog(ERROR, "right sibling of GIN page is of different type");
+
+ /*
+ * Given the proper lock sequence above, we should never land on a
+ * deleted page.
+ */
+ if (GinPageIsDeleted(page))
+ elog(ERROR, "right sibling of GIN page was deleted");
+
+ return nextbuffer;
+}
+
void
freeGinBtreeStack(GinBtreeStack *stack)
{
@@ -235,12 +268,12 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack,
while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
{
blkno = GinPageGetOpaque(page)->rightlink;
- LockBuffer(buffer, GIN_UNLOCK);
- ReleaseBuffer(buffer);
if (blkno == InvalidBlockNumber)
+ {
+ UnlockReleaseBuffer(buffer);
break;
- buffer = ReadBuffer(btree->index, blkno);
- LockBuffer(buffer, GIN_EXCLUSIVE);
+ }
+ buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
}
@@ -439,23 +472,21 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
{
BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
- LockBuffer(parent->buffer, GIN_UNLOCK);
-
if (rightlink == InvalidBlockNumber)
{
/*
* rightmost page, but we don't find parent, we should use
* plain search...
*/
+ LockBuffer(parent->buffer, GIN_UNLOCK);
ginFindParents(btree, stack, rootBlkno);
parent = stack->parent;
Assert(parent != NULL);
break;
}
+ parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
parent->blkno = rightlink;
- parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno);
- LockBuffer(parent->buffer, GIN_EXCLUSIVE);
page = BufferGetPage(parent->buffer);
}
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index cb17d38..5531b49 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -105,16 +105,11 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
/*
* We scanned the whole page, so we should take right page
*/
- stack->blkno = GinPageGetOpaque(page)->rightlink;
-
if (GinPageRightMost(page))
return false; /* no more pages */
- LockBuffer(stack->buffer, GIN_UNLOCK);
- stack->buffer = ReleaseAndReadBuffer(stack->buffer,
- btree->index,
- stack->blkno);
- LockBuffer(stack->buffer, GIN_SHARE);
+ stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+ stack->blkno = BufferGetBlockNumber(stack->buffer);
stack->off = FirstOffsetNumber;
}
@@ -132,7 +127,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
GinPostingTreeScan *gdi;
Buffer buffer;
Page page;
- BlockNumber blkno;
/* Descend to the leftmost leaf page */
gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE);
@@ -162,10 +156,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
if (GinPageRightMost(page))
break; /* no more pages */
- blkno = GinPageGetOpaque(page)->rightlink;
- LockBuffer(buffer, GIN_UNLOCK);
- buffer = ReleaseAndReadBuffer(buffer, index, blkno);
- LockBuffer(buffer, GIN_SHARE);
+ buffer = ginStepRight(buffer, index, GIN_SHARE);
}
UnlockReleaseBuffer(buffer);
@@ -563,20 +554,18 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
blkno = GinPageGetOpaque(page)->rightlink;
- LockBuffer(entry->buffer, GIN_UNLOCK);
if (blkno == InvalidBlockNumber)
{
- ReleaseBuffer(entry->buffer);
+ UnlockReleaseBuffer(entry->buffer);
ItemPointerSetInvalid(&entry->curItem);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
- entry->buffer = ReleaseAndReadBuffer(entry->buffer,
- ginstate->index,
- blkno);
- LockBuffer(entry->buffer, GIN_SHARE);
+ entry->buffer = ginStepRight(entry->buffer,
+ ginstate->index,
+ GIN_SHARE);
page = BufferGetPage(entry->buffer);
entry->offset = InvalidOffsetNumber;
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index b84d3a3..c99ce81 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -237,6 +237,9 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
return hasVoidPage;
}
+/*
+ * Delete a posting tree page.
+ */
static void
ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno,
BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot)
@@ -246,39 +249,55 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
Buffer pBuffer;
Page page,
parentPage;
-
+ BlockNumber rightlink;
+
+ /*----------
+ * Lock the pages in the same order as a scan would, to avoid deadlocks:
+ * left, then right, then parent. The correctness of this hinges on three
+ * assumptions:
+ *
+ * 1. There are no concurrent insertions. (the caller holds a super-
+ * exclusive lock on the root page to prevent them)
+ *
+ * 2. When following the rightlink from one page to another, the next page
+ * is locked before releasing previous page. This prevents a search
+ * from following a right link to a deleted page.
+ *
+ * 3. Only full-searches are performed concurrently, starting from the
+ * leftmost leaf. Without this restriction, a search might visit the
+ * page we're about to delete by following its downlink. To protect
+ * from that, searches would need to lock the child page before
+ * releasing the parent, but that would cause a deadlock risk with
+ * insertions, which lock the child before the parent. This assumption
+ * holds for posting trees; only insertions descend down a non-leftmost
+ * branch. This does *not* hold for the entry tree, which is why
+ * we cannot support deleting entry tree pages at the moment.
+ *----------
+ */
+ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
+ RBM_NORMAL, gvs->strategy);
dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno,
RBM_NORMAL, gvs->strategy);
- if (leftBlkno != InvalidBlockNumber)
- lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
- RBM_NORMAL, gvs->strategy);
- else
- lBuffer = InvalidBuffer;
-
pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno,
RBM_NORMAL, gvs->strategy);
+ LockBuffer(lBuffer, GIN_EXCLUSIVE);
LockBuffer(dBuffer, GIN_EXCLUSIVE);
if (!isParentRoot) /* parent is already locked by
* LockBufferForCleanup() */
LockBuffer(pBuffer, GIN_EXCLUSIVE);
- if (leftBlkno != InvalidBlockNumber)
- LockBuffer(lBuffer, GIN_EXCLUSIVE);
START_CRIT_SECTION();
- if (leftBlkno != InvalidBlockNumber)
- {
- BlockNumber rightlink;
-
- page = BufferGetPage(dBuffer);
- rightlink = GinPageGetOpaque(page)->rightlink;
+ /* Unlink the page by changing left sibling's rightlink */
+ page = BufferGetPage(dBuffer);
+ rightlink = GinPageGetOpaque(page)->rightlink;
- page = BufferGetPage(lBuffer);
- GinPageGetOpaque(page)->rightlink = rightlink;
- }
+ page = BufferGetPage(lBuffer);
+ GinPageGetOpaque(page)->rightlink = rightlink;
+ /* Delete downlink from parent */
parentPage = BufferGetPage(pBuffer);
#ifdef USE_ASSERT_CHECKING
do
@@ -360,10 +379,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
if (!isParentRoot)
LockBuffer(pBuffer, GIN_UNLOCK);
ReleaseBuffer(pBuffer);
-
- if (leftBlkno != InvalidBlockNumber)
- UnlockReleaseBuffer(lBuffer);
-
+ UnlockReleaseBuffer(lBuffer);
UnlockReleaseBuffer(dBuffer);
END_CRIT_SECTION();
@@ -431,9 +447,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDel
if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
{
- if (!(me->leftBlkno == InvalidBlockNumber && GinPageRightMost(page)))
+ /* we never delete the left- or rightmost branch */
+ if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page))
{
- /* we never delete right most branch */
Assert(!isRoot);
if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
{
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index 1868b77..8e45ecf 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -513,6 +513,7 @@ typedef struct GinBtreeData
extern GinBtreeStack *ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno);
extern GinBtreeStack *ginFindLeafPage(GinBtree btree, GinBtreeStack *stack);
+extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode);
extern void freeGinBtreeStack(GinBtreeStack *stack);
extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
GinStatsData *buildStats);
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers