On 07.11.2013 22:49, Heikki Linnakangas wrote:
Gin page deletion fails to take into account that there might be a
search in-flight to the page that is deleted. If the page is reused for
something else, the search can get very confused.

...

The regular b-tree code solves this by stamping deleted pages with the
current XID, and only allowing them to be reused once that XID becomes
old enough (< RecentGlobalXmin). Another approach might be to grab a
cleanup-strength lock on the left and parent pages when deleting a page,
and requiring search to keep the pin on the page its coming from, until
it has locked the next page.

I came up with the attached fix. In a nutshell, when walking along a right-link, the new page is locked before releasing the lock on the old one. Also, never delete the leftmost branch of a posting tree. I believe these changes are sufficient to fix the problem, because of the way the posting tree is searched:

All searches on a posting tree are "full searches", scanning the whole tree from left to right. Insertions do seek to the middle of the tree, but they are locked out during vacuum by holding a cleanup lock on the posting tree root (even without this patch). Thanks to this property, a search cannot step on a page that's being deleted by following a downlink, if we refrain from deleting the leftmost page on a level, as searches only descend down the leftmost branch.

It would be nice to improve that in master - holding an exclusive lock on the root page is pretty horrible - but this is a nice back-patchable patch. I'm not worried about the loss of concurrency because we now have to hold the lock on previous page when stepping to next page. In searches, it's only a share-lock, and in insertions, it's very rare that you have to step right.

The patch also adds some sanity checks to stepping right: the next page should be of the same type as the current page, e.g stepping right should not go from leaf to non-leaf or vice versa.

- Heikki
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index 2a6be4b..a738d80 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -112,10 +112,8 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
 				/* rightmost page */
 				break;
 
+			stack->buffer = ginStepRight(stack->buffer, btree->index, access);
 			stack->blkno = rightlink;
-			LockBuffer(stack->buffer, GIN_UNLOCK);
-			stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
-			LockBuffer(stack->buffer, access);
 			page = BufferGetPage(stack->buffer);
 		}
 
@@ -148,6 +146,41 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
 	}
 }
 
+/*
+ * Step right from current page.
+ *
+ * The next page is locked first, before releasing the current page. This is
+ * crucial to protect from concurrent page deletion (see comment in
+ * ginDeletePage).
+ */
+Buffer
+ginStepRight(Buffer buffer, Relation index, int lockmode)
+{
+	Buffer		nextbuffer;
+	Page		page = BufferGetPage(buffer);
+	bool		isLeaf = GinPageIsLeaf(page);
+	bool		isData = GinPageIsData(page);
+	BlockNumber	blkno = GinPageGetOpaque(page)->rightlink;
+
+	nextbuffer = ReadBuffer(index, blkno);
+	LockBuffer(nextbuffer, lockmode);
+	UnlockReleaseBuffer(buffer);
+
+	/* Sanity check that the page we stepped to is of similar kind. */
+	page = BufferGetPage(nextbuffer);
+	if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
+		elog(ERROR, "right sibling of GIN page is of different type");
+
+	/*
+	 * Given the proper lock sequence above, we should never land on a
+	 * deleted page.
+	 */
+	if  (GinPageIsDeleted(page))
+		elog(ERROR, "right sibling of GIN page was deleted");
+
+	return nextbuffer;
+}
+
 void
 freeGinBtreeStack(GinBtreeStack *stack)
 {
@@ -235,12 +268,12 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack,
 		while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
 		{
 			blkno = GinPageGetOpaque(page)->rightlink;
-			LockBuffer(buffer, GIN_UNLOCK);
-			ReleaseBuffer(buffer);
 			if (blkno == InvalidBlockNumber)
+			{
+				UnlockReleaseBuffer(buffer);
 				break;
-			buffer = ReadBuffer(btree->index, blkno);
-			LockBuffer(buffer, GIN_EXCLUSIVE);
+			}
+			buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
 			page = BufferGetPage(buffer);
 		}
 
@@ -439,23 +472,21 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
 		{
 			BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
 
-			LockBuffer(parent->buffer, GIN_UNLOCK);
-
 			if (rightlink == InvalidBlockNumber)
 			{
 				/*
 				 * rightmost page, but we don't find parent, we should use
 				 * plain search...
 				 */
+				LockBuffer(parent->buffer, GIN_UNLOCK);
 				ginFindParents(btree, stack, rootBlkno);
 				parent = stack->parent;
 				Assert(parent != NULL);
 				break;
 			}
 
+			parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
 			parent->blkno = rightlink;
-			parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno);
-			LockBuffer(parent->buffer, GIN_EXCLUSIVE);
 			page = BufferGetPage(parent->buffer);
 		}
 
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index cb17d38..5531b49 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -105,16 +105,11 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
 		/*
 		 * We scanned the whole page, so we should take right page
 		 */
-		stack->blkno = GinPageGetOpaque(page)->rightlink;
-
 		if (GinPageRightMost(page))
 			return false;		/* no more pages */
 
-		LockBuffer(stack->buffer, GIN_UNLOCK);
-		stack->buffer = ReleaseAndReadBuffer(stack->buffer,
-											 btree->index,
-											 stack->blkno);
-		LockBuffer(stack->buffer, GIN_SHARE);
+		stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+		stack->blkno = BufferGetBlockNumber(stack->buffer);
 		stack->off = FirstOffsetNumber;
 	}
 
@@ -132,7 +127,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
 	GinPostingTreeScan *gdi;
 	Buffer		buffer;
 	Page		page;
-	BlockNumber blkno;
 
 	/* Descend to the leftmost leaf page */
 	gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE);
@@ -162,10 +156,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
 		if (GinPageRightMost(page))
 			break;				/* no more pages */
 
-		blkno = GinPageGetOpaque(page)->rightlink;
-		LockBuffer(buffer, GIN_UNLOCK);
-		buffer = ReleaseAndReadBuffer(buffer, index, blkno);
-		LockBuffer(buffer, GIN_SHARE);
+		buffer = ginStepRight(buffer, index, GIN_SHARE);
 	}
 
 	UnlockReleaseBuffer(buffer);
@@ -563,20 +554,18 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
 
 			blkno = GinPageGetOpaque(page)->rightlink;
 
-			LockBuffer(entry->buffer, GIN_UNLOCK);
 			if (blkno == InvalidBlockNumber)
 			{
-				ReleaseBuffer(entry->buffer);
+				UnlockReleaseBuffer(entry->buffer);
 				ItemPointerSetInvalid(&entry->curItem);
 				entry->buffer = InvalidBuffer;
 				entry->isFinished = TRUE;
 				return;
 			}
 
-			entry->buffer = ReleaseAndReadBuffer(entry->buffer,
-												 ginstate->index,
-												 blkno);
-			LockBuffer(entry->buffer, GIN_SHARE);
+			entry->buffer = ginStepRight(entry->buffer,
+										 ginstate->index,
+										 GIN_SHARE);
 			page = BufferGetPage(entry->buffer);
 
 			entry->offset = InvalidOffsetNumber;
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index b84d3a3..c99ce81 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -237,6 +237,9 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
 	return hasVoidPage;
 }
 
+/*
+ * Delete a posting tree page.
+ */
 static void
 ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno,
 			  BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot)
@@ -246,39 +249,55 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 	Buffer		pBuffer;
 	Page		page,
 				parentPage;
-
+	BlockNumber	rightlink;
+
+	/*----------
+	 * Lock the pages in the same order as a scan would, to avoid deadlocks:
+	 * left, then right, then parent. The correctness of this hinges on three
+	 * assumptions:
+	 *
+	 * 1. There are no concurrent insertions. (the caller holds a super-
+	 *    exclusive lock on the root page to prevent them)
+	 *
+	 * 2. When following the rightlink from one page to another, the next page
+	 *    is locked before releasing previous page. This prevents a search
+	 *    from following a right link to a deleted page.
+	 *
+	 * 3. Only full-searches are performed concurrently, starting from the
+	 *    leftmost leaf. Without this restriction, a search might visit the
+	 *    page we're about to delete by following its downlink. To protect
+	 *    from that, searches would need to lock the child page before
+	 *    releasing the parent, but that would cause a deadlock risk with
+	 *    insertions, which lock the child before the parent. This assumption
+	 *    holds for posting trees; only insertions descend down a non-leftmost
+	 *    branch. This does *not* hold for the entry tree, which is why
+	 *    we cannot support deleting entry tree pages at the moment.
+	 *----------
+	 */
+	lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
+								 RBM_NORMAL, gvs->strategy);
 	dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno,
 								 RBM_NORMAL, gvs->strategy);
 
-	if (leftBlkno != InvalidBlockNumber)
-		lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
-									 RBM_NORMAL, gvs->strategy);
-	else
-		lBuffer = InvalidBuffer;
-
 	pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno,
 								 RBM_NORMAL, gvs->strategy);
 
+	LockBuffer(lBuffer, GIN_EXCLUSIVE);
 	LockBuffer(dBuffer, GIN_EXCLUSIVE);
 	if (!isParentRoot)			/* parent is already locked by
 								 * LockBufferForCleanup() */
 		LockBuffer(pBuffer, GIN_EXCLUSIVE);
-	if (leftBlkno != InvalidBlockNumber)
-		LockBuffer(lBuffer, GIN_EXCLUSIVE);
 
 	START_CRIT_SECTION();
 
-	if (leftBlkno != InvalidBlockNumber)
-	{
-		BlockNumber rightlink;
-
-		page = BufferGetPage(dBuffer);
-		rightlink = GinPageGetOpaque(page)->rightlink;
+	/* Unlink the page by changing left sibling's rightlink */
+	page = BufferGetPage(dBuffer);
+	rightlink = GinPageGetOpaque(page)->rightlink;
 
-		page = BufferGetPage(lBuffer);
-		GinPageGetOpaque(page)->rightlink = rightlink;
-	}
+	page = BufferGetPage(lBuffer);
+	GinPageGetOpaque(page)->rightlink = rightlink;
 
+	/* Delete downlink from parent */
 	parentPage = BufferGetPage(pBuffer);
 #ifdef USE_ASSERT_CHECKING
 	do
@@ -360,10 +379,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 	if (!isParentRoot)
 		LockBuffer(pBuffer, GIN_UNLOCK);
 	ReleaseBuffer(pBuffer);
-
-	if (leftBlkno != InvalidBlockNumber)
-		UnlockReleaseBuffer(lBuffer);
-
+	UnlockReleaseBuffer(lBuffer);
 	UnlockReleaseBuffer(dBuffer);
 
 	END_CRIT_SECTION();
@@ -431,9 +447,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDel
 
 	if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
 	{
-		if (!(me->leftBlkno == InvalidBlockNumber && GinPageRightMost(page)))
+		/* we never delete the left- or rightmost branch */
+		if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page))
 		{
-			/* we never delete right most branch */
 			Assert(!isRoot);
 			if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
 			{
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index 1868b77..8e45ecf 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -513,6 +513,7 @@ typedef struct GinBtreeData
 
 extern GinBtreeStack *ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno);
 extern GinBtreeStack *ginFindLeafPage(GinBtree btree, GinBtreeStack *stack);
+extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode);
 extern void freeGinBtreeStack(GinBtreeStack *stack);
 extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
 			   GinStatsData *buildStats);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to