On Fri, Nov 27, 2015 at 3:42 PM, Michael Paquier wrote:
> I am still investigating for a correct fix, looking at reinit.c the
> code in charge of copying the init fork as the main fork for a
> relation at the end of recovery looks to be doing its job correctly...

Attached is a patch that fixes the issue for me in master and 9.5.
Actually in the last patch I forgot a call to smgrwrite to ensure that
the INIT_FORKNUM is correctly synced to disk when those pages are
replayed at recovery, letting the reset routines for unlogged
relations do their job correctly. I have noticed as well that we need
to do the same for gin and brin relations. In this case I think that
we could limit the flush to unlogged relations, my patch does it
unconditionally though to generalize the logic. Thoughts?
-- 
Michael
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 99337b0..d7964ac 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -682,7 +682,12 @@ brinbuildempty(PG_FUNCTION_ARGS)
 	brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
 					   BRIN_CURRENT_VERSION);
 	MarkBufferDirty(metabuf);
-	log_newpage_buffer(metabuf, false);
+	/*
+	 * When this full page image is replayed, there is no guarantee that
+	 * this page will be present to disk when replayed particularly for
+	 * unlogged relations, hence enforce it to be flushed to disk.
+	 */
+	log_newpage_buffer(metabuf, false, true);
 	END_CRIT_SECTION();
 
 	UnlockReleaseBuffer(metabuf);
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index f876f62..572fe20 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -865,7 +865,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
 	page = BufferGetPage(buffer);
 	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
 	MarkBufferDirty(buffer);
-	log_newpage_buffer(buffer, true);
+	log_newpage_buffer(buffer, true, false);
 	END_CRIT_SECTION();
 
 	/*
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 49e9185..755c983 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -450,14 +450,21 @@ ginbuildempty(PG_FUNCTION_ARGS)
 		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
 	LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE);
 
-	/* Initialize and xlog metabuffer and root buffer. */
+	/*
+	 * Initialize and xlog metabuffer and root buffer. When those full
+	 * pages are replayed, it is not guaranteed that those relation
+	 * init forks will be flushed to disk after replaying them, hence
+	 * enforce those pages to be flushed to disk at replay, only the
+	 * last record will enforce a flush for performance reasons and
+	 * because it is actually unnecessary to do it multiple times.
+	 */
 	START_CRIT_SECTION();
 	GinInitMetabuffer(MetaBuffer);
 	MarkBufferDirty(MetaBuffer);
-	log_newpage_buffer(MetaBuffer, false);
+	log_newpage_buffer(MetaBuffer, false, false);
 	GinInitBuffer(RootBuffer, GIN_LEAF);
 	MarkBufferDirty(RootBuffer);
-	log_newpage_buffer(RootBuffer, false);
+	log_newpage_buffer(RootBuffer, false, true);
 	END_CRIT_SECTION();
 
 	/* Unlock and release the buffers. */
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 53bccf6..fd53268 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -84,7 +84,7 @@ gistbuildempty(PG_FUNCTION_ARGS)
 	START_CRIT_SECTION();
 	GISTInitBuffer(buffer, F_LEAF);
 	MarkBufferDirty(buffer);
-	log_newpage_buffer(buffer, true);
+	log_newpage_buffer(buffer, true, true);
 	END_CRIT_SECTION();
 
 	/* Unlock and release the buffer */
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 6a6fc3b..e9a9a8f 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -336,7 +336,8 @@ end_heap_rewrite(RewriteState state)
 						MAIN_FORKNUM,
 						state->rs_blockno,
 						state->rs_buffer,
-						true);
+						true,
+						false);
 		RelationOpenSmgr(state->rs_new_rel);
 
 		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
@@ -685,7 +686,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 							MAIN_FORKNUM,
 							state->rs_blockno,
 							page,
-							true);
+							true,
+							false);
 
 			/*
 			 * Now write the page. We say isTemp = true even if it's not a
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index cf4a6dc..d211a98 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -206,7 +206,7 @@ btbuildempty(PG_FUNCTION_ARGS)
 			  (char *) metapage, true);
 	if (XLogIsNeeded())
 		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-					BTREE_METAPAGE, metapage, false);
+					BTREE_METAPAGE, metapage, false, true);
 
 	/*
 	 * An immediate sync is required even if we xlog'd the page, because the
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index f95f67a..faf611c 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -277,7 +277,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 	if (wstate->btws_use_wal)
 	{
 		/* We use the heap NEWPAGE record type for this */
-		log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
+		log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page,
+					true, false);
 	}
 
 	/*
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 83cc9e8..5f6fed5 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -77,7 +77,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 
 		appendStringInfoString(buf, xlrec->rp_name);
 	}
-	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
+	else if (info == XLOG_FPI ||
+			 info == XLOG_FPI_FOR_HINT ||
+			 info == XLOG_FPI_FOR_SYNC)
 	{
 		/* no further information to print */
 	}
@@ -181,6 +183,9 @@ xlog_identify(uint8 info)
 		case XLOG_FPI_FOR_HINT:
 			id = "FPI_FOR_HINT";
 			break;
+		case XLOG_FPI_FOR_SYNC:
+			id = "FPI_FOR_SYNC";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index bceee8d..15d148c 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -173,7 +173,7 @@ spgbuildempty(PG_FUNCTION_ARGS)
 			  (char *) page, true);
 	if (XLogIsNeeded())
 		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-					SPGIST_METAPAGE_BLKNO, page, false);
+					SPGIST_METAPAGE_BLKNO, page, false, false);
 
 	/* Likewise for the root page. */
 	SpGistInitPage(page, SPGIST_LEAF);
@@ -183,7 +183,7 @@ spgbuildempty(PG_FUNCTION_ARGS)
 			  (char *) page, true);
 	if (XLogIsNeeded())
 		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-					SPGIST_ROOT_BLKNO, page, true);
+					SPGIST_ROOT_BLKNO, page, true, false);
 
 	/* Likewise for the null-tuples root page. */
 	SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
@@ -193,12 +193,15 @@ spgbuildempty(PG_FUNCTION_ARGS)
 			  (char *) page, true);
 	if (XLogIsNeeded())
 		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-					SPGIST_NULL_BLKNO, page, true);
+					SPGIST_NULL_BLKNO, page, true, true);
 
 	/*
 	 * An immediate sync is required even if we xlog'd the pages, because the
 	 * writes did not go through shared buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
+	 * When the pages are WAL-logged, record requesting the relation to be
+	 * sync'ed is requested only once for the last record emitted in this
+	 * sequence.
 	 */
 	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f17f834..1cafe48 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9184,7 +9184,9 @@ xlog_redo(XLogReaderState *record)
 	XLogRecPtr	lsn = record->EndRecPtr;
 
 	/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
-	Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
+	Assert(info == XLOG_FPI ||
+		   info == XLOG_FPI_FOR_HINT ||
+		   info == XLOG_FPI_FOR_SYNC ||
 		   !XLogRecHasAnyBlockRefs(record));
 
 	if (info == XLOG_NEXTOID)
@@ -9374,7 +9376,9 @@ xlog_redo(XLogReaderState *record)
 	{
 		/* nothing to do here */
 	}
-	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
+	else if (info == XLOG_FPI ||
+			 info == XLOG_FPI_FOR_HINT ||
+			 info == XLOG_FPI_FOR_SYNC)
 	{
 		Buffer		buffer;
 
@@ -9392,9 +9396,28 @@ xlog_redo(XLogReaderState *record)
 		 * when checksums are enabled. There is no difference in handling
 		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
 		 * code just to distinguish them for statistics purposes.
+		 *
+		 * XLOG_FPI_FOR_SYNC records are generated when a relation needs to
+		 * be sync'ed just after replaying a full page. This is important
+		 * to match the master behavior in the case where a page is written
+		 * directly to disk without going through shared buffers, particularly
+		 * when writing init forks for index relations.
 		 */
 		if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
 			elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
+
+		if (info == XLOG_FPI_FOR_SYNC)
+		{
+			RelFileNode	rnode;
+			ForkNumber	forknum;
+			BlockNumber	blkno;
+			SMgrRelation srel;
+
+			(void) XLogRecGetBlockTag(record, 0, &rnode, &forknum, &blkno);
+			srel = smgropen(rnode, InvalidBackendId);
+			smgrwrite(srel, forknum, blkno, BufferGetPage(buffer), false);
+		}
+
 		UnlockReleaseBuffer(buffer);
 	}
 	else if (info == XLOG_BACKUP_END)
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 925255f..ef0cc17 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -932,10 +932,13 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
  * If the page follows the standard page layout, with a PageHeader and unused
  * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows
  * the unused space to be left out from the WAL record, making it smaller.
+ *
+ * If 'is_sync' is set to TRUE, relation will be requested to sync immediately
+ * its data at replay after replaying this full page image.
  */
 XLogRecPtr
 log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
-			Page page, bool page_std)
+			Page page, bool page_std, bool is_sync)
 {
 	int			flags;
 	XLogRecPtr	recptr;
@@ -946,7 +949,7 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
 
 	XLogBeginInsert();
 	XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags);
-	recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI);
+	recptr = XLogInsert(RM_XLOG_ID, is_sync ? XLOG_FPI_FOR_SYNC : XLOG_FPI);
 
 	/*
 	 * The page may be uninitialized. If so, we can't set the LSN because that
@@ -969,9 +972,12 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
  * If the page follows the standard page layout, with a PageHeader and unused
  * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows
  * the unused space to be left out from the WAL record, making it smaller.
+ *
+ * If 'is_sync' is set to TRUE, relation will be requested to sync immediately
+ * its data at replay after replaying this full page image.
  */
 XLogRecPtr
-log_newpage_buffer(Buffer buffer, bool page_std)
+log_newpage_buffer(Buffer buffer, bool page_std, bool is_sync)
 {
 	Page		page = BufferGetPage(buffer);
 	RelFileNode rnode;
@@ -983,7 +989,7 @@ log_newpage_buffer(Buffer buffer, bool page_std)
 
 	BufferGetTag(buffer, &rnode, &forkNum, &blkno);
 
-	return log_newpage(&rnode, forkNum, blkno, page, page_std);
+	return log_newpage(&rnode, forkNum, blkno, page, page_std, is_sync);
 }
 
 /*
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 0ddde72..eb22a51 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -9920,7 +9920,8 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 		 * space.
 		 */
 		if (use_wal)
-			log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
+			log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page,
+						false, false);
 
 		PageSetChecksumInplace(page, blkno);
 
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 2429889..b0e3901 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -736,7 +736,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 				 */
 				if (RelationNeedsWAL(onerel) &&
 					PageGetLSN(page) == InvalidXLogRecPtr)
-					log_newpage_buffer(buf, true);
+					log_newpage_buffer(buf, true, false);
 
 				PageSetAllVisible(page);
 				visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 9f60687..fff0c7a 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -174,6 +174,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPW_CHANGE:
 		case XLOG_FPI_FOR_HINT:
 		case XLOG_FPI:
+		case XLOG_FPI_FOR_SYNC:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index 31b45ba..bc91802 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -53,8 +53,10 @@ extern void XLogResetInsertion(void);
 extern bool XLogCheckBufferNeedsBackup(Buffer buffer);
 
 extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
-			BlockNumber blk, char *page, bool page_std);
-extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std);
+				  BlockNumber blk, char *page, bool page_std,
+				  bool is_sync);
+extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std,
+				  bool is_sync);
 extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
 
 extern void InitXLogInsert(void);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index ad1eb4b..91445f1 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -73,6 +73,7 @@ typedef struct CheckPoint
 #define XLOG_END_OF_RECOVERY			0x90
 #define XLOG_FPI_FOR_HINT				0xA0
 #define XLOG_FPI						0xB0
+#define XLOG_FPI_FOR_SYNC				0xC0
 
 
 /*
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to