From 2f2b6b3006be047f730530b0fdf4d219bcc77222 Mon Sep 17 00:00:00 2001
From: Julien Rouhaud <jrouhaud@vmware.com>
Date: Mon, 4 Nov 2019 08:40:23 +0100
Subject: [PATCH v17 1/2] Add backend infrastructure to check the validity of
 an on-disk block.

A new CheckBuffer function is introduced.  It takes care of the various locking
aspects to make sure that no false positive can be returned.

Author: Julien Rouhaud
Reviewed-by: Michael Paquier, Masahiko Sawada, Justin Pryzby
Discussion: https://postgr.es/m/CAOBaU_aVvMjQn%3Dge5qPiJOPMmOj5%3Dii3st5Q0Y%2BWuLML5sR17w%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 216 ++++++++++++++++++++++++++++
 src/include/storage/bufmgr.h        |   8 ++
 2 files changed, 224 insertions(+)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e549fa1d30..e81e899594 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -45,7 +45,9 @@
 #include "postmaster/bgwriter.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/checksum.h"
 #include "storage/ipc.h"
+#include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
@@ -480,6 +482,42 @@ static int	buffertag_comparator(const void *p1, const void *p2);
 static int	ckpt_buforder_comparator(const void *pa, const void *pb);
 static int	ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
 
+/* ----------------
+ * The rest of this module provides a set of functions that can be used to
+ * safely check all checksums on a running cluster.
+ *
+ * Please note that those only perform standard buffered reads, and don't try
+ * to bypass or discard the operating system cache.  If you want to check the
+ * actual storage, you have to discard the operating system cache before
+ * running those functions.
+ *
+ * To avoid torn pages and possible false positives when reading data, the
+ * following heuristics are used:
+ *
+ * - a shared LWLock is taken on the target buffer pool partition mapping, and
+ *   we detect if a block is in shared_buffers or not.  See check_get_buffer()
+ *   comments for more details about the locking strategy.
+ *
+ * - if a block is dirty in shared_buffers, it's ignored as it'll be flushed to
+ *   disk either before the end of the next checkpoint or during recovery in
+ *   case of unsafe shutdown
+ *
+ * - if a block is otherwise found in shared_buffers, an IO lock is taken on
+ *   the block and the block is then read from storage, ignoring the block in
+ *   shared_buffers
+ *
+ * - if a block is not found in shared_buffers, the block is read from disk
+ *   while holding the buffer pool partition mapping LWLock.
+ *
+ * The check can be performed using an SQL function, returning the list of
+ * problematic blocks.
+ * ----------------
+ */
+static bool check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+							   uint16 *chk_found);
+static bool check_get_buffer(Relation relation, ForkNumber forknum,
+							 BlockNumber blkno, char *buffer);
+
 
 /*
  * Implementation of PrefetchBuffer() for shared buffers.
@@ -4583,3 +4621,181 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
 				(errcode(ERRCODE_SNAPSHOT_TOO_OLD),
 				 errmsg("snapshot too old")));
 }
+
+/*
+ * Perform a checksum check on the passed page.  Return true iff the page is
+ * valid, and assign the expected and found checksums to chk_expected and
+ * chk_found, respectively (left untouched for a valid new page).  A page can
+ * look new yet be the result of corruption.  We still check for this case,
+ * but can't compute its checksum as pg_checksum_page() is explicitly checking
+ * for non-new pages, so NoComputedChecksum will be set in chk_expected.
+ */
+static bool
+check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+				   uint16 *chk_found)
+{
+	Page		page = (Page) buffer;
+	PageHeader	hdr = (PageHeader) page;
+
+	Assert(chk_expected && chk_found);
+
+	if (PageIsNew(page))
+	{
+		/*
+		 * Check if the page is really new or if there's corruption that
+		 * affected PageIsNew detection.  Note that PageIsVerified won't try to
+		 * detect checksum corruption in this case, so there's no risk of
+		 * duplicated corruption report.
+		 */
+		if (PageIsVerified(page, blkno))
+		{
+			/* No corruption. */
+			return true;
+		}
+
+		/*
+		 * There's corruption, but since this affects PageIsNew, we
+		 * can't compute a checksum, so set NoComputedChecksum for the
+		 * expected checksum.
+		 */
+		*chk_expected = NoComputedChecksum;
+		*chk_found = hdr->pd_checksum;
+		return false;
+	}
+
+	*chk_expected = pg_checksum_page(buffer, blkno);
+	*chk_found = hdr->pd_checksum;
+
+	return (*chk_expected == *chk_found);
+}
+
+/*
+ *-------------------------
+ * Safely read the wanted block from disk, dealing with possible concurrency
+ * issues.  Returns true if the block was read and must be checked.  If the
+ * buffer is found dirty (or not tag-valid) in shared_buffers, no read is
+ * performed and false is returned.  We can safely ignore dirty buffers as
+ * they'll be written before next checkpoint's completion.
+ *
+ * Note that the block is read into the caller-supplied private memory.
+ *
+ * The following locks can be used in this function:
+ *
+ *   - shared LWLock on the target buffer pool partition mapping.
+ *   - IOLock on the buffer
+ *
+ * The IOLock is taken when reading the buffer from disk if it exists in
+ * shared_buffers, to avoid torn pages.
+ *
+ * If the buffer isn't in shared_buffers, it'll be read while the buffer
+ * mapping partition LWLock is still being held.  Reading with this lock is to
+ * avoid the unlikely but possible case that a buffer wasn't present in shared
+ * buffers when we checked but it was then alloc'ed in shared_buffers, modified
+ * and flushed concurrently when we later try to read it, leading to false
+ * positives due to a torn page.
+ *
+ * Caller should hold an AccessShareLock on the Relation
+ *-------------------------
+ */
+static bool
+check_get_buffer(Relation relation, ForkNumber forknum,
+				 BlockNumber blkno, char *buffer)
+{
+	bool		checkit = true;
+	BufferTag	buf_tag;		/* identity of requested block */
+	uint32		buf_hash;		/* hash value for buf_tag */
+	LWLock	   *partLock;		/* buffer partition lock for the buffer */
+	BufferDesc *bufdesc;
+	int			buf_id;
+
+	/* create a tag so we can lookup the buffer */
+	INIT_BUFFERTAG(buf_tag, relation->rd_smgr->smgr_rnode.node, forknum, blkno);
+
+	/* determine its hash code and partition lock ID */
+	buf_hash = BufTableHashCode(&buf_tag);
+	partLock = BufMappingPartitionLock(buf_hash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(partLock, LW_SHARED);
+	buf_id = BufTableLookup(&buf_tag, buf_hash);
+	if (buf_id >= 0)
+	{
+		uint32		buf_state;
+
+		/*
+		 * Found it.  Now, retrieve its state to know what to do with it, and
+		 * release the buffer header lock immediately; no pin is taken.  We do
+		 * so to limit overhead as much as possible.  We'll keep the shared
+		 * lightweight lock on the target buffer mapping partition, so this
+		 * buffer can't be evicted, and we'll acquire an IOLock on the buffer
+		 * if we need to read the content on disk.
+		 */
+		bufdesc = GetBufferDescriptor(buf_id);
+
+		buf_state = LockBufHdr(bufdesc);
+		UnlockBufHdr(bufdesc, buf_state);
+
+		/*
+		 * Dirty pages are ignored as they'll be flushed soon. Invalid buffers
+		 * are also skipped.
+		 */
+		if ((buf_state & BM_DIRTY) || !(buf_state & BM_TAG_VALID))
+			checkit = false;
+
+		/*
+		 * Read the buffer from disk, taking an IO lock to prevent torn-page
+		 * reads, in the unlikely event that it was concurrently dirtied and
+		 * flushed.
+		 */
+		if (checkit)
+		{
+			LWLockAcquire(BufferDescriptorGetIOLock(bufdesc), LW_SHARED);
+			smgrread(relation->rd_smgr, forknum, blkno, buffer);
+			LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+		}
+	}
+	else
+	{
+		/*
+		 * Simply read the buffer.  There's no risk of modification on it as we
+		 * kept the buffer pool partition mapping lock.
+		 */
+		smgrread(relation->rd_smgr, forknum, blkno, buffer);
+
+		/* The buffer will have to be checked. */
+		Assert(checkit);
+	}
+
+	LWLockRelease(partLock);
+
+	return checkit;
+}
+
+/*
+ * Check the on-disk copy of a block of the given relation fork, read with
+ * smgrread even if cached in shared_buffers.  Returns false if the block
+ * appears corrupted, true otherwise (dirty/invalid buffers are skipped).
+ * Caller must hold at least an AccessShareLock on the relation.
+ * NOTE(review): rd_smgr is used as-is; presumably the caller opened it.
+ */
+bool
+CheckBuffer(Relation relation, ForkNumber forknum, BlockNumber blkno,
+				uint16 *chk_expected, uint16 *chk_found)
+{
+	char		buffer[BLCKSZ];
+
+	Assert(CheckRelationLockedByMe(relation, AccessShareLock, true));
+	Assert(blkno < RelationGetNumberOfBlocksInFork(relation, forknum));
+	Assert(smgrexists(relation->rd_smgr, forknum));
+
+	*chk_expected = *chk_found = NoComputedChecksum;
+
+	if (!check_get_buffer(relation, forknum, blkno, buffer))
+		return true;
+
+	if (check_buffer(buffer, blkno, chk_expected, chk_found))
+		return true;
+
+	/* A corruption is detected. */
+	return false;
+}
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8fa26..24aa102175 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -244,6 +244,14 @@ extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
 extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);
 
+/* A zero checksum can never be computed (see pg_checksum_page()), so it is
+ * used to mean "no checksum computed" in chk_expected and chk_found. */
+#define NoComputedChecksum	0
+
+extern bool CheckBuffer(Relation relation, ForkNumber forknum,
+						BlockNumber blkno, uint16 *chk_expected,
+						uint16 *chk_found);
+
 
 /* inline functions */
 
-- 
2.20.1

