From 2d89b1708b7f9d904fd11a10cd8a13042e406f69 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Thu, 23 Apr 2020 14:43:54 -0700
Subject: [PATCH v2 2/2] Add nbtree Valgrind buffer lock checks.

Teach nbtree to use Valgrind memcheck client requests to mark pages as
NOACCESS at the point the buffer content lock is released.  Callers
usually also drop their pin at the same time, though there are some
exceptions.  This instrumentation makes Valgrind detect any page access
that occurs without a buffer lock, which is fundamentally unsafe with
B-Tree index pages.  Buffer pins only prevent VACUUM from performing
concurrent recycling of heap TIDs from the pinned leaf page.  Our
assumption is that they don't make it safe to access the page in any way
whatsoever.

We provide nbtree specific wrappers for LockBuffer() and related
functions.  All nbtree code is required to use the wrappers.

These checks are a stricter version of the memcheck client requests in
bufmgr.c that detect buffer accesses that take place without a buffer
pin held.  The checks never make the buffer pin checking more lax.
---
 src/include/access/nbtree.h           |   4 +
 src/backend/access/nbtree/nbtinsert.c |   2 +-
 src/backend/access/nbtree/nbtpage.c   | 126 ++++++++++++++++++++++----
 src/backend/access/nbtree/nbtree.c    |   6 +-
 src/backend/access/nbtree/nbtsearch.c |  16 ++--
 src/backend/access/nbtree/nbtutils.c  |   4 +-
 6 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9a3acd26b7..284524a8f3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1072,6 +1072,10 @@ extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 							   BlockNumber blkno, int access);
 extern void _bt_relbuf(Relation rel, Buffer buf);
+extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
+extern void _bt_unlockbuf(Relation rel, Buffer buf);
+extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
+extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
 extern bool _bt_page_recyclable(Page page);
 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index a70b64d964..80c3a42bca 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -303,7 +303,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
 	{
 		/* Simulate a _bt_getbuf() call with conditional locking */
 		insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
-		if (ConditionalLockBuffer(insertstate->buf))
+		if (_bt_conditionallockbuf(rel, insertstate->buf))
 		{
 			Page		page;
 			BTPageOpaque lpageop;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index f5962f6163..22e2560d00 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -32,6 +32,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/memdebug.h"
 #include "utils/snapmgr.h"
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -191,8 +192,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 	}
 
 	/* trade in our read lock for a write lock */
-	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(metabuf, BT_WRITE);
+	_bt_unlockbuf(rel, metabuf);
+	_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 	START_CRIT_SECTION();
 
@@ -333,8 +334,8 @@ _bt_getroot(Relation rel, int access)
 		}
 
 		/* trade in our read lock for a write lock */
-		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-		LockBuffer(metabuf, BT_WRITE);
+		_bt_unlockbuf(rel, metabuf);
+		_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 		/*
 		 * Race condition:	if someone else initialized the metadata between
@@ -427,8 +428,8 @@ _bt_getroot(Relation rel, int access)
 		 * else accessing the new root page while it's unlocked, since no one
 		 * else knows where it is yet.
 		 */
-		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-		LockBuffer(rootbuf, BT_READ);
+		_bt_unlockbuf(rel, rootbuf);
+		_bt_lockbuf(rel, rootbuf, BT_READ);
 
 		/* okay, metadata is correct, release lock on it without caching */
 		_bt_relbuf(rel, metabuf);
@@ -779,7 +780,26 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
  *		When this routine returns, the appropriate lock is set on the
  *		requested buffer and its reference count has been incremented
  *		(ie, the buffer is "locked and pinned").  Also, we apply
- *		_bt_checkpage to sanity-check the page (except in P_NEW case).
+ *		_bt_checkpage to sanity-check the page (except in P_NEW case),
+ *		and perform Valgrind client requests that indicate whether or
+ *		not it's currently safe to access the buffer (i.e. whether or
+ *		not a lock is held on the buffer).
+ *
+ *		The general rule in nbtree is that it's never okay to access a
+ *		page without holding both a buffer pin and a buffer lock on
+ *		the page's buffer.  This is slightly stricter than necessary,
+ *		but it makes the Valgrind checks simpler.
+ *
+ *		The Valgrind checks are a strict superset of the generic
+ *		buffer pin checks in bufmgr.c.  Raw LockBuffer() calls are
+ *		disallowed in nbtree; all buffer lock requests need to go
+ *		through the wrapper functions.  The client requests here rely
+ *		on the generic bufmgr.c client requests to mark memory as
+ *		defined when a shared buffer is reused for another page in the
+ *		event of an error.  That's why we don't mark pages as NOACCESS
+ *		when they're in a local buffer: it can lead to false positives
+ *		from Valgrind.  (Besides, local buffers are protected by the
+ *		generic aset.c client requests.)
  */
 Buffer
 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
@@ -790,7 +810,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	{
 		/* Read an existing block of the relation */
 		buf = ReadBuffer(rel, blkno);
-		LockBuffer(buf, access);
+		_bt_lockbuf(rel, buf, access);
 		_bt_checkpage(rel, buf);
 	}
 	else
@@ -830,7 +850,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 			if (blkno == InvalidBlockNumber)
 				break;
 			buf = ReadBuffer(rel, blkno);
-			if (ConditionalLockBuffer(buf))
+			if (_bt_conditionallockbuf(rel, buf))
 			{
 				page = BufferGetPage(buf);
 				if (_bt_page_recyclable(page))
@@ -883,7 +903,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 		buf = ReadBuffer(rel, P_NEW);
 
 		/* Acquire buffer lock on new page */
-		LockBuffer(buf, BT_WRITE);
+		_bt_lockbuf(rel, buf, BT_WRITE);
 
 		/*
 		 * Release the file-extension lock; it's now OK for someone else to
@@ -924,9 +944,10 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
 	Assert(blkno != P_NEW);
 	if (BufferIsValid(obuf))
-		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(rel, obuf);
 	buf = ReleaseAndReadBuffer(obuf, rel, blkno);
-	LockBuffer(buf, access);
+	_bt_lockbuf(rel, buf, access);
+
 	_bt_checkpage(rel, buf);
 	return buf;
 }
@@ -939,7 +960,74 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 void
 _bt_relbuf(Relation rel, Buffer buf)
 {
-	UnlockReleaseBuffer(buf);
+	_bt_unlockbuf(rel, buf);
+	ReleaseBuffer(buf);
+}
+
+/*
+ *	_bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin.  This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+	LockBuffer(buf, access);
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ *	_bt_unlockbuf() -- unlock a pinned buffer.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+	/*
+	 * Buffer is pinned and locked, which means that it is expected to
+	 * be defined and addressable in general.  Make sure of it.
+	 */
+	VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ *	_bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
+ *	buffer.
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+	if (!ConditionalLockBuffer(buf))
+		return false;
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	return true;
+}
+
+/*
+ *	_bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ *	lock.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+	/*
+	 * Buffer is pinned and locked, which means that it is expected to
+	 * be defined and addressable in general.  Make sure of it.
+	 */
+	VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	LockBufferForCleanup(buf);
 }
 
 /*
@@ -1603,7 +1691,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 				 * To avoid deadlocks, we'd better drop the leaf page lock
 				 * before going further.
 				 */
-				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				_bt_unlockbuf(rel, buf);
 
 				/*
 				 * Fetch the left sibling, to check that it's not marked with
@@ -1649,7 +1737,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 				 * Re-lock the leaf page, and start over, to re-check that the
 				 * page can still be deleted.
 				 */
-				LockBuffer(buf, BT_WRITE);
+				_bt_lockbuf(rel, buf, BT_WRITE);
 				continue;
 			}
 
@@ -1955,7 +2043,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	leafleftsib = opaque->btpo_prev;
 	leafrightsib = opaque->btpo_next;
 
-	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(rel, leafbuf);
 
 	/*
 	 * Check here, as calling loops will have locks held, preventing
@@ -1994,7 +2082,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		 * To avoid deadlocks, we'd better drop the target page lock before
 		 * going further.
 		 */
-		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(rel, buf);
 	}
 
 	/*
@@ -2011,7 +2099,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	 * table.)
 	 */
 	if (target != leafblkno)
-		LockBuffer(leafbuf, BT_WRITE);
+		_bt_lockbuf(rel, leafbuf, BT_WRITE);
 	if (leftsib != P_NONE)
 	{
 		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
@@ -2061,7 +2149,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	 * a write lock not a superexclusive lock, since no scans would stop on an
 	 * empty page.
 	 */
-	LockBuffer(buf, BT_WRITE);
+	_bt_lockbuf(rel, buf, BT_WRITE);
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 36294789f3..5856a1cdc5 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1109,7 +1109,8 @@ restart:
 	 */
 	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 							 info->strategy);
-	LockBuffer(buf, BT_READ);
+	_bt_lockbuf(rel, buf, BT_READ);
+
 	page = BufferGetPage(buf);
 	if (!PageIsNew(page))
 	{
@@ -1175,8 +1176,7 @@ restart:
 		 * course of the vacuum scan, whether or not it actually contains any
 		 * deletable tuples --- see nbtree/README.
 		 */
-		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-		LockBufferForCleanup(buf);
+		_bt_upgradelockbufcleanup(rel, buf);
 
 		/*
 		 * Check whether we need to recurse back to earlier pages.  What we
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 8ff49ce6d6..80ce78b348 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -64,7 +64,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
-	LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
 		RelationNeedsWAL(scan->indexRelation) &&
@@ -190,8 +190,8 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 	if (access == BT_WRITE && page_access == BT_READ)
 	{
 		/* trade in our read lock for a write lock */
-		LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
-		LockBuffer(*bufP, BT_WRITE);
+		_bt_unlockbuf(rel, *bufP);
+		_bt_lockbuf(rel, *bufP, BT_WRITE);
 
 		/*
 		 * If the page was split between the time that we surrendered our read
@@ -293,8 +293,8 @@ _bt_moveright(Relation rel,
 			/* upgrade our lock if necessary */
 			if (access == BT_READ)
 			{
-				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-				LockBuffer(buf, BT_WRITE);
+				_bt_unlockbuf(rel, buf);
+				_bt_lockbuf(rel, buf, BT_WRITE);
 			}
 
 			if (P_INCOMPLETE_SPLIT(opaque))
@@ -1417,7 +1417,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
@@ -2065,7 +2065,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 		 * deleted.
 		 */
 		if (BTScanPosIsPinned(so->currPos))
-			LockBuffer(so->currPos.buf, BT_READ);
+			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
 		else
 			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
 
@@ -2443,7 +2443,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index ce48a51640..f30c0fbdf3 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1744,7 +1744,7 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		LockBuffer(so->currPos.buf, BT_READ);
+		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
@@ -1881,7 +1881,7 @@ _bt_killitems(IndexScanDesc scan)
 		MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
-	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 }
 
 
-- 
2.25.1

