From f51f2aba432afa49483bf8e7fd9f5aeb0dae83cc Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Tue, 9 Jul 2024 13:22:06 -0400
Subject: [PATCH v2] Do not summarize WAL if generated with wal_level=minimal.

To do this, we must include the wal_level in the first WAL record
covered by each summary file; so add wal_level to struct Checkpoint
and the payload of XLOG_CHECKPOINT_REDO and XLOG_END_OF_RECOVERY.

This, in turn, requires bumping XLOG_PAGE_MAGIC and, since the
Checkpoint is also stored in the control file, also
PG_CONTROL_VERSION. It's not great to do that so late in the release
cycle, but the alternative seems to ship v17 without robust
protections against this scenario, which could result in corrupted
incremental backups.

Report by Fujii Masao. Tested by Jakub Wartak.
---
 src/backend/access/rmgrdesc/xlogdesc.c |  47 +++---
 src/backend/access/transam/xlog.c      |   8 +-
 src/backend/postmaster/walsummarizer.c | 192 ++++++++++++++++++-------
 src/include/access/xlog_internal.h     |   3 +-
 src/include/catalog/pg_control.h       |   3 +-
 5 files changed, 177 insertions(+), 76 deletions(-)

diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index e455400716..363294d623 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -33,6 +33,27 @@ const struct config_enum_entry wal_level_options[] = {
 	{NULL, 0, false}
 };
 
+/*
+ * Find a string representation for wal_level
+ */
+static const char *
+get_wal_level_string(int wal_level)
+{
+	const struct config_enum_entry *entry;
+	const char *wal_level_str = "?";
+
+	for (entry = wal_level_options; entry->name; entry++)
+	{
+		if (entry->val == wal_level)
+		{
+			wal_level_str = entry->name;
+			break;
+		}
+	}
+
+	return wal_level_str;
+}
+
 void
 xlog_desc(StringInfo buf, XLogReaderState *record)
 {
@@ -45,7 +66,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		CheckPoint *checkpoint = (CheckPoint *) rec;
 
 		appendStringInfo(buf, "redo %X/%X; "
-						 "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; "
+						 "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; "
 						 "oldest xid %u in DB %u; oldest multi %u in DB %u; "
 						 "oldest/newest commit timestamp xid: %u/%u; "
 						 "oldest running xid %u; %s",
@@ -53,6 +74,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 checkpoint->ThisTimeLineID,
 						 checkpoint->PrevTimeLineID,
 						 checkpoint->fullPageWrites ? "true" : "false",
+						 get_wal_level_string(checkpoint->wal_level),
 						 EpochFromFullTransactionId(checkpoint->nextXid),
 						 XidFromFullTransactionId(checkpoint->nextXid),
 						 checkpoint->nextOid,
@@ -95,20 +117,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 	{
 		xl_parameter_change xlrec;
 		const char *wal_level_str;
-		const struct config_enum_entry *entry;
 
 		memcpy(&xlrec, rec, sizeof(xl_parameter_change));
-
-		/* Find a string representation for wal_level */
-		wal_level_str = "?";
-		for (entry = wal_level_options; entry->name; entry++)
-		{
-			if (entry->val == xlrec.wal_level)
-			{
-				wal_level_str = entry->name;
-				break;
-			}
-		}
+		wal_level_str = get_wal_level_string(xlrec.wal_level);
 
 		appendStringInfo(buf, "max_connections=%d max_worker_processes=%d "
 						 "max_wal_senders=%d max_prepared_xacts=%d "
@@ -135,9 +146,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		xl_end_of_recovery xlrec;
 
 		memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
-		appendStringInfo(buf, "tli %u; prev tli %u; time %s",
+		appendStringInfo(buf, "tli %u; prev tli %u; time %s; wal_level %s",
 						 xlrec.ThisTimeLineID, xlrec.PrevTimeLineID,
-						 timestamptz_to_str(xlrec.end_time));
+						 timestamptz_to_str(xlrec.end_time),
+						 get_wal_level_string(xlrec.wal_level));
 	}
 	else if (info == XLOG_OVERWRITE_CONTRECORD)
 	{
@@ -150,7 +162,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 	}
 	else if (info == XLOG_CHECKPOINT_REDO)
 	{
-		/* No details to write out */
+		int			wal_level;
+
+		memcpy(&wal_level, rec, sizeof(int));
+		appendStringInfo(buf, "wal_level %s", get_wal_level_string(wal_level));
 	}
 }
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 33e27a6e72..636be5ca4d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6934,6 +6934,7 @@ CreateCheckPoint(int flags)
 	WALInsertLockAcquireExclusive();
 
 	checkPoint.fullPageWrites = Insert->fullPageWrites;
+	checkPoint.wal_level = wal_level;
 
 	if (shutdown)
 	{
@@ -6987,11 +6988,9 @@ CreateCheckPoint(int flags)
 	 */
 	if (!shutdown)
 	{
-		int			dummy = 0;
-
-		/* Record must have payload to avoid assertion failure. */
+		/* Include WAL level in record for WAL summarizer's benefit. */
 		XLogBeginInsert();
-		XLogRegisterData((char *) &dummy, sizeof(dummy));
+		XLogRegisterData((char *) &wal_level, sizeof(wal_level));
 		(void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
 
 		/*
@@ -7314,6 +7313,7 @@ CreateEndOfRecoveryRecord(void)
 		elog(ERROR, "can only be used to end recovery");
 
 	xlrec.end_time = GetCurrentTimestamp();
+	xlrec.wal_level = wal_level;
 
 	WALInsertLockAcquireExclusive();
 	xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index 55dc2ea8f5..207b126983 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -71,6 +71,12 @@ typedef struct
 	 * and so the LSN might point to the start of the next file even though
 	 * that might happen to be in the middle of a WAL record.
 	 *
+	 * fast_forward is normally false, but is true when we have encountered
+	 * WAL generated with wal_level=minimal and are skipping over it without
+	 * emitting summary files. In this case, summarized_tli and summarized_lsn
+	 * will advance even though nothing is being written to disk, until we
+	 * again reach a point where wal_level>minimal.
+	 *
 	 * summarizer_pgprocno is the proc number of the summarizer process, if
 	 * one is running, or else INVALID_PROC_NUMBER.
 	 *
@@ -83,6 +89,7 @@ typedef struct
 	TimeLineID	summarized_tli;
 	XLogRecPtr	summarized_lsn;
 	bool		lsn_is_exact;
+	bool		fast_forward;
 	ProcNumber	summarizer_pgprocno;
 	XLogRecPtr	pending_lsn;
 
@@ -154,7 +161,8 @@ static void SummarizeSmgrRecord(XLogReaderState *xlogreader,
 								BlockRefTable *brtab);
 static void SummarizeXactRecord(XLogReaderState *xlogreader,
 								BlockRefTable *brtab);
-static bool SummarizeXlogRecord(XLogReaderState *xlogreader);
+static bool SummarizeXlogRecord(XLogReaderState *xlogreader,
+								bool *new_fast_forward);
 static int	summarizer_read_local_xlog_page(XLogReaderState *state,
 											XLogRecPtr targetPagePtr,
 											int reqLen,
@@ -802,6 +810,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	char		final_path[MAXPGPATH];
 	WalSummaryIO io;
 	BlockRefTable *brtab = CreateEmptyBlockRefTable();
+	bool		fast_forward = true;
 
 	/* Initialize private data for xlogreader. */
 	private_data = (SummarizerReadLocalXLogPrivate *)
@@ -900,7 +909,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 		int			block_id;
 		char	   *errormsg;
 		XLogRecord *record;
-		bool		stop_requested = false;
+		uint8		rmid;
 
 		HandleWalSummarizerInterrupts();
 
@@ -969,56 +978,86 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 			break;
 		}
 
-		/* Special handling for particular types of WAL records. */
-		switch (XLogRecGetRmid(xlogreader))
-		{
-			case RM_DBASE_ID:
-				SummarizeDbaseRecord(xlogreader, brtab);
-				break;
-			case RM_SMGR_ID:
-				SummarizeSmgrRecord(xlogreader, brtab);
-				break;
-			case RM_XACT_ID:
-				SummarizeXactRecord(xlogreader, brtab);
-				break;
-			case RM_XLOG_ID:
-				stop_requested = SummarizeXlogRecord(xlogreader);
-				break;
-			default:
-				break;
-		}
-
 		/*
-		 * If we've been told that it's time to end this WAL summary file, do
-		 * so. As an exception, if there's nothing included in this WAL
-		 * summary file yet, then stopping doesn't make any sense, and we
-		 * should wait until the next stop point instead.
+		 * Certain types of records require special handling. Redo points and
+		 * shutdown checkpoints trigger creation of new summary files and can
+		 * also cause us to enter or exit "fast forward" mode. Other types of
+		 * records can require special updates to the block reference table.
 		 */
-		if (stop_requested && xlogreader->ReadRecPtr > summary_start_lsn)
+		rmid = XLogRecGetRmid(xlogreader);
+		if (rmid == RM_XLOG_ID)
 		{
-			summary_end_lsn = xlogreader->ReadRecPtr;
-			break;
+			bool		new_fast_forward;
+
+			/*
+			 * If we've already processed some WAL records when we hit a redo
+			 * point or shutdown checkpoint, then we stop summarization before
+			 * including this record in the current file, so that it will be
+			 * the first record in the next file.
+			 *
+			 * When we hit one of those record types as the first record in a
+			 * file, we adjust our notion of whether we're fast-forwarding.
+			 * Any WAL generated with wal_level=minimal must be skipped
+			 * without actually generating any summary file, because an
+			 * incremental backup that crosses such WAL would be unsafe.
+			 */
+			if (SummarizeXlogRecord(xlogreader, &new_fast_forward))
+			{
+				if (xlogreader->ReadRecPtr > summary_start_lsn)
+				{
+					summary_end_lsn = xlogreader->ReadRecPtr;
+					break;
+				}
+				else
+					fast_forward = new_fast_forward;
+			}
+		}
+		else if (!fast_forward)
+		{
+			/*
+			 * This switch handles record types that require extra updates to
+			 * the contents of the block reference table.
+			 */
+			switch (rmid)
+			{
+				case RM_DBASE_ID:
+					SummarizeDbaseRecord(xlogreader, brtab);
+					break;
+				case RM_SMGR_ID:
+					SummarizeSmgrRecord(xlogreader, brtab);
+					break;
+				case RM_XACT_ID:
+					SummarizeXactRecord(xlogreader, brtab);
+					break;
+			}
 		}
 
-		/* Feed block references from xlog record to block reference table. */
-		for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader);
-			 block_id++)
+		/*
+		 * If we're in fast-forward mode, we don't really need to do anything.
+		 * Otherwise, feed block references from xlog record to block
+		 * reference table.
+		 */
+		if (!fast_forward)
 		{
-			RelFileLocator rlocator;
-			ForkNumber	forknum;
-			BlockNumber blocknum;
+			for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader);
+				 block_id++)
+			{
+				RelFileLocator rlocator;
+				ForkNumber	forknum;
+				BlockNumber blocknum;
 
-			if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator,
-											&forknum, &blocknum, NULL))
-				continue;
+				if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator,
+												&forknum, &blocknum, NULL))
+					continue;
 
-			/*
-			 * As we do elsewhere, ignore the FSM fork, because it's not fully
-			 * WAL-logged.
-			 */
-			if (forknum != FSM_FORKNUM)
-				BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
-											   blocknum);
+				/*
+				 * As we do elsewhere, ignore the FSM fork, because it's not
+				 * fully WAL-logged.
+				 */
+				if (forknum != FSM_FORKNUM)
+					BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
+												   blocknum);
+			}
 		}
 
 		/* Update our notion of where this summary file ends. */
@@ -1047,9 +1086,10 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	/*
 	 * If a timeline switch occurs, we may fail to make any progress at all
 	 * before exiting the loop above. If that happens, we don't write a WAL
-	 * summary file at all.
+	 * summary file at all. We can also skip writing a file if we're in
+	 * fast-forward mode.
 	 */
-	if (summary_end_lsn > summary_start_lsn)
+	if (summary_end_lsn > summary_start_lsn && !fast_forward)
 	{
 		/* Generate temporary and final path name. */
 		snprintf(temp_path, MAXPGPATH,
@@ -1085,6 +1125,14 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 		durable_rename(temp_path, final_path, ERROR);
 	}
 
+	/* If we skipped a non-zero amount of WAL, log a debug message. */
+	if (summary_end_lsn > summary_start_lsn && fast_forward)
+		ereport(DEBUG1,
+				errmsg("skipped summarizing WAL on TLI %u from %X/%X to %X/%X",
+					   tli,
+					   LSN_FORMAT_ARGS(summary_start_lsn),
+					   LSN_FORMAT_ARGS(summary_end_lsn)));
+
 	return summary_end_lsn;
 }
 
@@ -1263,22 +1311,58 @@ SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
 
 /*
  * Special handling for WAL records with RM_XLOG_ID.
+ *
+ * The return value is true if WAL summarization should stop before this
+ * record and false otherwise. When the return value is true,
+ * *new_fast_forward indicates whether future processing should be done
+ * in fast forward mode (i.e. read WAL without emitting summaries) or not.
  */
 static bool
-SummarizeXlogRecord(XLogReaderState *xlogreader)
+SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward)
 {
 	uint8		info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
+	int			record_wal_level;
 
-	if (info == XLOG_CHECKPOINT_REDO || info == XLOG_CHECKPOINT_SHUTDOWN)
+	if (info == XLOG_CHECKPOINT_REDO)
 	{
-		/*
-		 * This is an LSN at which redo might begin, so we'd like
-		 * summarization to stop just before this WAL record.
-		 */
-		return true;
+		/* Payload is wal_level at the time record was written. */
+		memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int));
+	}
+	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
+	{
+		CheckPoint	rec_ckpt;
+
+		/* Extract wal_level at time record was written from payload. */
+		memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+		record_wal_level = rec_ckpt.wal_level;
+	}
+	else if (info == XLOG_END_OF_RECOVERY)
+	{
+		xl_end_of_recovery	xlrec;
+
+		/* Extract wal_level at time record was written from payload. */
+		memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
+		record_wal_level = xlrec.wal_level;
+	}
+	else
+	{
+		/* No special handling required. Return false. */
+		return false;
 	}
 
-	return false;
+	/*
+	 * Redo can only begin at an XLOG_CHECKPOINT_REDO or
+	 * XLOG_CHECKPOINT_SHUTDOWN record, so we want WAL summarization to begin
+	 * at those points. Hence, when those records are encountered, return true,
+	 * so that we stop just before summarizing either of those records.
+	 *
+	 * We also reach here if we just saw XLOG_END_OF_RECOVERY. That's not
+	 * a place where recovery can start, but it is a place where we might need
+	 * WAL summarization can start, because it's the first record written
+	 * after a timeline switch.
+	 */
+	*new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL);
+	return true;
 }
 
 /*
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 5161b72f28..c6a91fb456 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
 /*
  * Each page of XLOG file has a header like this:
  */
-#define XLOG_PAGE_MAGIC 0xD115	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD116	/* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {
@@ -302,6 +302,7 @@ typedef struct xl_end_of_recovery
 	TimestampTz end_time;
 	TimeLineID	ThisTimeLineID; /* new TLI */
 	TimeLineID	PrevTimeLineID; /* previous TLI we forked off from */
+	int			wal_level;
 } xl_end_of_recovery;
 
 /*
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index a00606ffcd..e80ff8e414 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -22,7 +22,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	1300
+#define PG_CONTROL_VERSION	1700
 
 /* Nonce key length, see below */
 #define MOCK_AUTH_NONCE_LEN		32
@@ -40,6 +40,7 @@ typedef struct CheckPoint
 	TimeLineID	PrevTimeLineID; /* previous TLI, if this record begins a new
 								 * timeline (equals ThisTimeLineID otherwise) */
 	bool		fullPageWrites; /* current full_page_writes */
+	int			wal_level;		/* current wal_level */
 	FullTransactionId nextXid;	/* next free transaction ID */
 	Oid			nextOid;		/* next free OID */
 	MultiXactId nextMulti;		/* next free MultiXactId */
-- 
2.39.3 (Apple Git-145)

