From 1b92bf8618e8a310ac141f0f407178147f72e4dd Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@otacoo.com>
Date: Wed, 9 Dec 2015 16:11:08 +0900
Subject: [PATCH] Ensure consistent on-disk state of UNLOGGED indexes at
 recovery

Indexes on unlogged relations need a consistent initial on-disk state at
the time of replay, so that their replayed pages are found on disk if
recovery ends and those relations are subsequently reset. Hence, every
INIT_FORKNUM page is forcibly flushed to disk when it is replayed.

All types of unlogged indexes are affected by the bug this commit fixes,
to varying degrees; most of them raise errors on promoted standbys when
trying to INSERT new tuples into their parent relations. The worst problem
found was with GIN indexes, where inserting a new tuple into such an index
left the system stuck on a semaphore lock, making the system
unresponsive.
---
 src/backend/access/heap/heapam.c    | 10 ++++++++++
 src/backend/storage/buffer/bufmgr.c | 23 +++++++++++++++++++++++
 src/include/storage/bufmgr.h        |  1 +
 3 files changed, 34 insertions(+)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 0b397a8..f3481bc 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -7642,6 +7642,16 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
 	}
 
 	MarkBufferDirty(buffer);
+
+	/*
+	 * INIT_FORKNUM pages need to be forcibly flushed to the OS to ensure a
+	 * consistent on-disk state at the end of recovery, as unlogged relations
+	 * are reset at that point using this fork, so its pages cannot be left
+	 * only within shared buffers.
+	 */
+	if (xlrec->forknum == INIT_FORKNUM)
+		FlushSingleBuffer(buffer);
+
 	UnlockReleaseBuffer(buffer);
 }
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 18013d5..9fe3dd9 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2548,6 +2548,29 @@ FlushDatabaseBuffers(Oid dbid)
 	}
 }
 
+/* ---------------------------------------------------------------------
+ *		FlushSingleBuffer
+ *
+ *		Flush a single shared buffer to the OS.  The caller must already
+ *		hold a pin and the content lock (shared or exclusive) on it.
+ * --------------------------------------------------------------------
+ */
+void
+FlushSingleBuffer(Buffer buffer)
+{
+	BufferDesc *bufHdr;
+
+	/* currently not needed, but no fundamental reason not to support */
+	Assert(!BufferIsLocal(buffer));
+	Assert(BufferIsPinned(buffer));
+
+	bufHdr = &BufferDescriptors[buffer - 1];
+
+	/* enforce the locking precondition stated in the header comment */
+	Assert(LWLockHeldByMe(bufHdr->content_lock));
+	FlushBuffer(bufHdr, NULL);
+}
+
 /*
  * ReleaseBuffer -- release the pin on a buffer
  */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 921e4ed..8c8ab48 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -191,6 +191,7 @@ extern void CheckPointBuffers(int flags);
 extern BlockNumber BufferGetBlockNumber(Buffer buffer);
 extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
 								ForkNumber forkNum);
+extern void FlushSingleBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
-- 
2.6.3

