From 3f3a97174f0f8b97ffee5121607bd89c6583a2b7 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Thu, 23 Apr 2020 14:43:54 -0700
Subject: [PATCH v1 2/4] Add nbtree Valgrind buffer lock checks.

Teach nbtree to use Valgrind memcheck client requests to mark pages as
NOACCESS at the point the buffer content lock is released.  Callers
usually also drop their pin at the same time, though there are some
exceptions.  This instrumentation makes Valgrind detect any page access
that occurs without a buffer lock, which is fundamentally unsafe with
B-Tree index pages.  Buffer pins only prevent VACUUM from performing
concurrent recycling of heap TIDs from the pinned leaf page.  Our
assumption is that they don't make it safe to access the page in any way
whatsoever.

We provide nbtree specific wrappers for LockBuffer() and related
functions.  All nbtree code is required to use the wrappers.

These checks are a stricter version of the memcheck client requests in
bufmgr.c that detect buffer accesses that take place without a buffer
pin held.  The checks never make the buffer pin checking more lax.
---
 src/include/access/nbtree.h           |   4 +
 src/backend/access/nbtree/nbtinsert.c |   2 +-
 src/backend/access/nbtree/nbtpage.c   | 118 ++++++++++++++++++++++----
 src/backend/access/nbtree/nbtree.c    |   6 +-
 src/backend/access/nbtree/nbtsearch.c |  16 ++--
 src/backend/access/nbtree/nbtutils.c  |   4 +-
 6 files changed, 118 insertions(+), 32 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9a3acd26b7..284524a8f3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1072,6 +1072,10 @@ extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 							   BlockNumber blkno, int access);
 extern void _bt_relbuf(Relation rel, Buffer buf);
+extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
+extern void _bt_unlockbuf(Relation rel, Buffer buf);
+extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
+extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
 extern bool _bt_page_recyclable(Page page);
 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index a70b64d964..80c3a42bca 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -303,7 +303,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
 	{
 		/* Simulate a _bt_getbuf() call with conditional locking */
 		insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
-		if (ConditionalLockBuffer(insertstate->buf))
+		if (_bt_conditionallockbuf(rel, insertstate->buf))
 		{
 			Page		page;
 			BTPageOpaque lpageop;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 39b8f17f4b..aa163cb493 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -32,6 +32,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/memdebug.h"
 #include "utils/snapmgr.h"
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -191,8 +192,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 	}
 
 	/* trade in our read lock for a write lock */
-	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(metabuf, BT_WRITE);
+	_bt_unlockbuf(rel, metabuf);
+	_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 	START_CRIT_SECTION();
 
@@ -333,8 +334,8 @@ _bt_getroot(Relation rel, int access)
 		}
 
 		/* trade in our read lock for a write lock */
-		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-		LockBuffer(metabuf, BT_WRITE);
+		_bt_unlockbuf(rel, metabuf);
+		_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 		/*
 		 * Race condition:	if someone else initialized the metadata between
@@ -427,8 +428,8 @@ _bt_getroot(Relation rel, int access)
 		 * else accessing the new root page while it's unlocked, since no one
 		 * else knows where it is yet.
 		 */
-		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-		LockBuffer(rootbuf, BT_READ);
+		_bt_unlockbuf(rel, rootbuf);
+		_bt_lockbuf(rel, rootbuf, BT_READ);
 
 		/* okay, metadata is correct, release lock on it without caching */
 		_bt_relbuf(rel, metabuf);
@@ -790,7 +791,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	{
 		/* Read an existing block of the relation */
 		buf = ReadBuffer(rel, blkno);
-		LockBuffer(buf, access);
+		_bt_lockbuf(rel, buf, access);
 		_bt_checkpage(rel, buf);
 	}
 	else
@@ -830,7 +831,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 			if (blkno == InvalidBlockNumber)
 				break;
 			buf = ReadBuffer(rel, blkno);
-			if (ConditionalLockBuffer(buf))
+			if (_bt_conditionallockbuf(rel, buf))
 			{
 				page = BufferGetPage(buf);
 				if (_bt_page_recyclable(page))
@@ -883,7 +884,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 		buf = ReadBuffer(rel, P_NEW);
 
 		/* Acquire buffer lock on new page */
-		LockBuffer(buf, BT_WRITE);
+		_bt_lockbuf(rel, buf, BT_WRITE);
 
 		/*
 		 * Release the file-extension lock; it's now OK for someone else to
@@ -924,9 +925,10 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
 	Assert(blkno != P_NEW);
 	if (BufferIsValid(obuf))
-		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(rel, obuf);
 	buf = ReleaseAndReadBuffer(obuf, rel, blkno);
-	LockBuffer(buf, access);
+	_bt_lockbuf(rel, buf, access);
+
 	_bt_checkpage(rel, buf);
 	return buf;
 }
@@ -939,7 +941,87 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 void
 _bt_relbuf(Relation rel, Buffer buf)
 {
-	UnlockReleaseBuffer(buf);
+	_bt_unlockbuf(rel, buf);
+	ReleaseBuffer(buf);
+}
+
+/*
+ *	_bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin.  This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+	LockBuffer(buf, access);
+
+	/*
+	 * The page may have been marked NOACCESS when the buffer content lock
+	 * was last released (see _bt_unlockbuf()).  Now that we hold the lock
+	 * again, mark the whole page defined.
+	 */
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ *	_bt_unlockbuf() -- unlock a pinned buffer.
+ *
+ * Only lock is dropped.  Performs extra steps needed by Valgrind.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+	/*
+	 * Buffer is pinned and locked, which means that it is expected to be
+	 * defined and addressable in general.  Make sure of it explicitly here.
+	 */
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ *	_bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned buffer.
+ *
+ * Performs extra steps needed by Valgrind.
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+	if (!ConditionalLockBuffer(buf))
+		return false;
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	return true;
+}
+
+/*
+ *	_bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ *	lock.
+ *
+ * Performs extra steps needed by Valgrind.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+	/*
+	 * We don't need to mark the buffer NOACCESS here, since we're just
+	 * upgrading the caller's lock to a cleanup lock.  In passing, though,
+	 * check that buf's page is already marked defined to Valgrind.
+	 */
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	LockBufferForCleanup(buf);
 }
 
 /*
@@ -1603,7 +1685,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 				 * To avoid deadlocks, we'd better drop the leaf page lock
 				 * before going further.
 				 */
-				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				_bt_unlockbuf(rel, buf);
 
 				/*
 				 * Fetch the left sibling, to check that it's not marked with
@@ -1649,7 +1731,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 				 * Re-lock the leaf page, and start over, to re-check that the
 				 * page can still be deleted.
 				 */
-				LockBuffer(buf, BT_WRITE);
+				_bt_lockbuf(rel, buf, BT_WRITE);
 				continue;
 			}
 
@@ -1948,7 +2030,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	leafleftsib = opaque->btpo_prev;
 	leafrightsib = opaque->btpo_next;
 
-	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(rel, leafbuf);
 
 	/*
 	 * Check here, as calling loops will have locks held, preventing
@@ -1979,7 +2061,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		 * To avoid deadlocks, we'd better drop the target page lock before
 		 * going further.
 		 */
-		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(rel, buf);
 	}
 	else
 	{
@@ -2004,7 +2086,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	 * table.)
 	 */
 	if (target != leafblkno)
-		LockBuffer(leafbuf, BT_WRITE);
+		_bt_lockbuf(rel, leafbuf, BT_WRITE);
 	if (leftsib != P_NONE)
 	{
 		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
@@ -2054,7 +2136,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	 * a write lock not a superexclusive lock, since no scans would stop on an
 	 * empty page.
 	 */
-	LockBuffer(buf, BT_WRITE);
+	_bt_lockbuf(rel, buf, BT_WRITE);
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 36294789f3..5856a1cdc5 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1109,7 +1109,8 @@ restart:
 	 */
 	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 							 info->strategy);
-	LockBuffer(buf, BT_READ);
+	_bt_lockbuf(rel, buf, BT_READ);
+
 	page = BufferGetPage(buf);
 	if (!PageIsNew(page))
 	{
@@ -1175,8 +1176,7 @@ restart:
 		 * course of the vacuum scan, whether or not it actually contains any
 		 * deletable tuples --- see nbtree/README.
 		 */
-		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-		LockBufferForCleanup(buf);
+		_bt_upgradelockbufcleanup(rel, buf);
 
 		/*
 		 * Check whether we need to recurse back to earlier pages.  What we
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 8ff49ce6d6..80ce78b348 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -64,7 +64,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
-	LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
 		RelationNeedsWAL(scan->indexRelation) &&
@@ -190,8 +190,8 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 	if (access == BT_WRITE && page_access == BT_READ)
 	{
 		/* trade in our read lock for a write lock */
-		LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
-		LockBuffer(*bufP, BT_WRITE);
+		_bt_unlockbuf(rel, *bufP);
+		_bt_lockbuf(rel, *bufP, BT_WRITE);
 
 		/*
 		 * If the page was split between the time that we surrendered our read
@@ -293,8 +293,8 @@ _bt_moveright(Relation rel,
 			/* upgrade our lock if necessary */
 			if (access == BT_READ)
 			{
-				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-				LockBuffer(buf, BT_WRITE);
+				_bt_unlockbuf(rel, buf);
+				_bt_lockbuf(rel, buf, BT_WRITE);
 			}
 
 			if (P_INCOMPLETE_SPLIT(opaque))
@@ -1417,7 +1417,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
@@ -2065,7 +2065,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 		 * deleted.
 		 */
 		if (BTScanPosIsPinned(so->currPos))
-			LockBuffer(so->currPos.buf, BT_READ);
+			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
 		else
 			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
 
@@ -2443,7 +2443,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index ce48a51640..f30c0fbdf3 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1744,7 +1744,7 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		LockBuffer(so->currPos.buf, BT_READ);
+		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
@@ -1881,7 +1881,7 @@ _bt_killitems(IndexScanDesc scan)
 		MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
-	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 }
 
 
-- 
2.25.1

