From 407958a9230152420ce72c92dbe08ee38bbafaf2 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Fri, 23 Jul 2021 13:07:56 -0400
Subject: [PATCH 1/3] Refactor some end-of-recovery code out of StartupXLOG().

Split the code that decides whether to write a checkpoint or an
end-of-recovery record, and then writes it, into two functions:
DetermineRecoveryXlogAction(), which decides what to do, and
PerformRecoveryXLogAction(), which does it. Right now these are
always called one after the other, but further refactoring is
planned which will separate them.

Also create a new function CleanupAfterArchiveRecovery() to
perform a few tasks that we want to do after we've actually exited
archive recovery but before we start accepting new WAL writes.
This is straightforward code movement to make StartupXLOG() a
little bit shorter and a little bit easier to understand.
---
 src/backend/access/transam/xlog.c | 351 ++++++++++++++++++------------
 1 file changed, 216 insertions(+), 135 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 3479402272..203a9babc9 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -525,6 +525,31 @@ typedef enum ExclusiveBackupState
 	EXCLUSIVE_BACKUP_STOPPING
 } ExclusiveBackupState;
 
+/*
+ * What should we do when we reach the end of REDO to ensure that we'll
+ * be able to recover properly if we crash again?
+ *
+ * RECOVERY_XLOG_NOTHING means we didn't actually REDO anything and therefore
+ * no action is required.
+ *
+ * RECOVERY_XLOG_WRITE_END_OF_RECOVERY means we need to write an
+ * end-of-recovery record but don't need to checkpoint.
+ *
+ * RECOVERY_XLOG_WRITE_CHECKPOINT means we need to write a checkpoint.
+ * This is only valid when the checkpointer is not running.
+ *
+ * RECOVERY_XLOG_REQUEST_CHECKPOINT means we need to request that the
+ * checkpointer perform a checkpoint. This is only valid when the
+ * checkpointer is running.
+ */
+typedef enum
+{
+	RECOVERY_XLOG_NOTHING,
+	RECOVERY_XLOG_WRITE_END_OF_RECOVERY,
+	RECOVERY_XLOG_WRITE_CHECKPOINT,
+	RECOVERY_XLOG_REQUEST_CHECKPOINT
+} RecoveryXlogAction;
+
 /*
  * Session status of running backup, used for sanity checks in SQL-callable
  * functions to start and stop backups.
@@ -902,6 +927,8 @@ static MemoryContext walDebugCxt = NULL;
 static void readRecoverySignalFile(void);
 static void validateRecoveryParameters(void);
 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
+static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
+										XLogRecPtr EndOfLog);
 static bool recoveryStopsBefore(XLogReaderState *record);
 static bool recoveryStopsAfter(XLogReaderState *record);
 static void ConfirmRecoveryPaused(void);
@@ -946,6 +973,8 @@ static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
 							  int emode, bool fetching_ckpt);
 static void CheckRecoveryConsistency(void);
+static RecoveryXlogAction DetermineRecoveryXlogAction(XLogReaderState *xlogreader);
+static void PerformRecoveryXLogAction(RecoveryXlogAction action);
 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
 										XLogRecPtr RecPtr, int whichChkpt, bool report);
 static bool rescanLatestTimeLine(void);
@@ -5717,6 +5746,88 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
 			(errmsg("archive recovery complete")));
 }
 
+/*
+ * Perform cleanup actions at the conclusion of archive recovery.
+ */
+static void
+CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog)
+{
+	/*
+	 * Execute the recovery_end_command, if any.
+	 */
+	if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
+		ExecuteRecoveryCommand(recoveryEndCommand,
+							   "recovery_end_command",
+							   true);
+
+	/*
+	 * We switched to a new timeline. Clean up segments on the old timeline.
+	 *
+	 * If there are any higher-numbered segments on the old timeline, remove
+	 * them. They might contain valid WAL, but they might also be
+	 * pre-allocated files containing garbage. In any case, they are not part
+	 * of the new timeline's history so we don't need them.
+	 */
+	RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
+
+	/*
+	 * If the switch happened in the middle of a segment, what to do with the
+	 * last, partial segment on the old timeline? If we don't archive it, and
+	 * the server that created the WAL never archives it either (e.g. because
+	 * it was hit by a meteor), it will never make it to the archive. That's
+	 * OK from our point of view, because the new segment that we created with
+	 * the new TLI contains all the WAL from the old timeline up to the switch
+	 * point. But if you later try to do PITR to the "missing" WAL on the old
+	 * timeline, recovery won't find it in the archive. It's physically
+	 * present in the new file with new TLI, but recovery won't look there
+	 * when it's recovering to the older timeline. On the other hand, if we
+	 * archive the partial segment, and the original server on that timeline
+	 * is still running and archives the completed version of the same segment
+	 * later, it will fail. (We used to do that in 9.4 and below, and it
+	 * caused such problems).
+	 *
+	 * As a compromise, we rename the last segment with the .partial suffix,
+	 * and archive it. Archive recovery will never try to read .partial
+	 * segments, so they will normally go unused. But in the odd PITR case,
+	 * the administrator can copy them manually to the pg_wal directory
+	 * (removing the suffix). They can be useful in debugging, too.
+	 *
+	 * If a .done or .ready file already exists for the old timeline, however,
+	 * we had already determined that the segment is complete, so we can let
+	 * it be archived normally. (In particular, if it was restored from the
+	 * archive to begin with, it's expected to have a .done file).
+	 */
+	if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
+		XLogArchivingActive())
+	{
+		char		origfname[MAXFNAMELEN];
+		XLogSegNo	endLogSegNo;
+
+		XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
+		XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
+
+		if (!XLogArchiveIsReadyOrDone(origfname))
+		{
+			char		origpath[MAXPGPATH];
+			char		partialfname[MAXFNAMELEN];
+			char		partialpath[MAXPGPATH];
+
+			XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
+			snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
+			snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
+
+			/*
+			 * Make sure there's no .done or .ready file for the .partial
+			 * file.
+			 */
+			XLogArchiveCleanup(partialfname);
+
+			durable_rename(origpath, partialpath, ERROR);
+			XLogArchiveNotify(partialfname);
+		}
+	}
+}
+
 /*
  * Extract timestamp from WAL record.
  *
@@ -6490,7 +6601,7 @@ StartupXLOG(void)
 	DBState		dbstate_at_startup;
 	XLogReaderState *xlogreader;
 	XLogPageReadPrivate private;
-	bool		promoted = false;
+	RecoveryXlogAction xlogaction;
 	struct stat st;
 
 	/*
@@ -7897,141 +8008,13 @@ StartupXLOG(void)
 	UpdateFullPageWrites();
 	LocalXLogInsertAllowed = -1;
 
-	if (InRecovery)
-	{
-		/*
-		 * Perform a checkpoint to update all our recovery activity to disk.
-		 *
-		 * Note that we write a shutdown checkpoint rather than an on-line
-		 * one. This is not particularly critical, but since we may be
-		 * assigning a new TLI, using a shutdown checkpoint allows us to have
-		 * the rule that TLI only changes in shutdown checkpoints, which
-		 * allows some extra error checking in xlog_redo.
-		 *
-		 * In promotion, only create a lightweight end-of-recovery record
-		 * instead of a full checkpoint. A checkpoint is requested later,
-		 * after we're fully out of recovery mode and already accepting
-		 * queries.
-		 */
-		if (bgwriterLaunched)
-		{
-			if (LocalPromoteIsTriggered)
-			{
-				checkPointLoc = ControlFile->checkPoint;
-
-				/*
-				 * Confirm the last checkpoint is available for us to recover
-				 * from if we fail.
-				 */
-				record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
-				if (record != NULL)
-				{
-					promoted = true;
-
-					/*
-					 * Insert a special WAL record to mark the end of
-					 * recovery, since we aren't doing a checkpoint. That
-					 * means that the checkpointer process may likely be in
-					 * the middle of a time-smoothed restartpoint and could
-					 * continue to be for minutes after this. That sounds
-					 * strange, but the effect is roughly the same and it
-					 * would be stranger to try to come out of the
-					 * restartpoint and then checkpoint. We request a
-					 * checkpoint later anyway, just for safety.
-					 */
-					CreateEndOfRecoveryRecord();
-				}
-			}
-
-			if (!promoted)
-				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
-								  CHECKPOINT_IMMEDIATE |
-								  CHECKPOINT_WAIT);
-		}
-		else
-			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
-	}
+	/* Emit checkpoint or end-of-recovery record in XLOG, if required. */
+	xlogaction = DetermineRecoveryXlogAction(xlogreader);
+	PerformRecoveryXLogAction(xlogaction);
 
+	/* If this is archive recovery, perform post-recovery cleanup actions. */
 	if (ArchiveRecoveryRequested)
-	{
-		/*
-		 * And finally, execute the recovery_end_command, if any.
-		 */
-		if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
-			ExecuteRecoveryCommand(recoveryEndCommand,
-								   "recovery_end_command",
-								   true);
-
-		/*
-		 * We switched to a new timeline. Clean up segments on the old
-		 * timeline.
-		 *
-		 * If there are any higher-numbered segments on the old timeline,
-		 * remove them. They might contain valid WAL, but they might also be
-		 * pre-allocated files containing garbage. In any case, they are not
-		 * part of the new timeline's history so we don't need them.
-		 */
-		RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
-
-		/*
-		 * If the switch happened in the middle of a segment, what to do with
-		 * the last, partial segment on the old timeline? If we don't archive
-		 * it, and the server that created the WAL never archives it either
-		 * (e.g. because it was hit by a meteor), it will never make it to the
-		 * archive. That's OK from our point of view, because the new segment
-		 * that we created with the new TLI contains all the WAL from the old
-		 * timeline up to the switch point. But if you later try to do PITR to
-		 * the "missing" WAL on the old timeline, recovery won't find it in
-		 * the archive. It's physically present in the new file with new TLI,
-		 * but recovery won't look there when it's recovering to the older
-		 * timeline. On the other hand, if we archive the partial segment, and
-		 * the original server on that timeline is still running and archives
-		 * the completed version of the same segment later, it will fail. (We
-		 * used to do that in 9.4 and below, and it caused such problems).
-		 *
-		 * As a compromise, we rename the last segment with the .partial
-		 * suffix, and archive it. Archive recovery will never try to read
-		 * .partial segments, so they will normally go unused. But in the odd
-		 * PITR case, the administrator can copy them manually to the pg_wal
-		 * directory (removing the suffix). They can be useful in debugging,
-		 * too.
-		 *
-		 * If a .done or .ready file already exists for the old timeline,
-		 * however, we had already determined that the segment is complete, so
-		 * we can let it be archived normally. (In particular, if it was
-		 * restored from the archive to begin with, it's expected to have a
-		 * .done file).
-		 */
-		if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
-			XLogArchivingActive())
-		{
-			char		origfname[MAXFNAMELEN];
-			XLogSegNo	endLogSegNo;
-
-			XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
-			XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
-
-			if (!XLogArchiveIsReadyOrDone(origfname))
-			{
-				char		origpath[MAXPGPATH];
-				char		partialfname[MAXFNAMELEN];
-				char		partialpath[MAXPGPATH];
-
-				XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
-				snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
-				snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
-
-				/*
-				 * Make sure there's no .done or .ready file for the .partial
-				 * file.
-				 */
-				XLogArchiveCleanup(partialfname);
-
-				durable_rename(origpath, partialpath, ERROR);
-				XLogArchiveNotify(partialfname);
-			}
-		}
-	}
+		CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog);
 
 	/*
 	 * Preallocate additional log files, if wanted.
@@ -8135,7 +8118,7 @@ StartupXLOG(void)
 	 * and in case of a crash, recovering from it might take a longer than is
 	 * appropriate now that we're not in standby mode anymore.
 	 */
-	if (promoted)
+	if (xlogaction == RECOVERY_XLOG_WRITE_END_OF_RECOVERY)
 		RequestCheckpoint(CHECKPOINT_FORCE);
 }
 
@@ -8235,6 +8218,104 @@ CheckRecoveryConsistency(void)
 	}
 }
 
+/*
+ * Determine what needs to be done upon completing REDO.
+ */
+static RecoveryXlogAction
+DetermineRecoveryXlogAction(XLogReaderState *xlogreader)
+{
+	/* No REDO, hence no action required. */
+	if (!InRecovery)
+		return RECOVERY_XLOG_NOTHING;
+
+	/*
+	 * bgwriterLaunched actually indicates both whether the bgwriter process
+	 * has been launched and also whether the checkpointer process has been
+	 * launched. So, if it's false, we can't request a checkpoint and must do
+	 * it locally.
+	 *
+	 * NB: We don't launch the bgwriter and checkpointer during crash
+	 * recovery, which will therefore always write a checkpoint.
+	 */
+	if (!bgwriterLaunched)
+		return RECOVERY_XLOG_WRITE_CHECKPOINT;
+
+	/*
+	 * In promotion, only create a lightweight end-of-recovery record instead
+	 * of a full checkpoint. A checkpoint is requested later, after we're
+	 * fully out of recovery mode and already accepting WAL writes.
+	 */
+	if (LocalPromoteIsTriggered)
+	{
+		XLogRecPtr	checkPointLoc = ControlFile->checkPoint;
+		XLogRecord *record;
+
+		/*
+		 * Confirm the last checkpoint is available for us to recover from if
+		 * we fail.
+		 */
+		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
+		if (record != NULL)
+		{
+			/*
+			 * Have the caller insert a special WAL record to mark the end of
+			 * recovery, since we aren't doing a checkpoint. That means that
+			 * the checkpointer process may likely be in the middle of a
+			 * time-smoothed restartpoint and could continue to be for minutes
+			 * after this. That sounds strange, but the effect is roughly the
+			 * same and it would be stranger to try to come out of the
+			 * restartpoint and then checkpoint. We request a checkpoint later
+			 * anyway, just for safety.
+			 */
+			return RECOVERY_XLOG_WRITE_END_OF_RECOVERY;
+		}
+	}
+
+	/*
+	 * We decided against writing only an end-of-recovery record, and we know
+	 * that the postmaster was told to launch the checkpointer, so just
+	 * request a checkpoint.
+	 */
+	return RECOVERY_XLOG_REQUEST_CHECKPOINT;
+}
+
+/*
+ * Perform whatever XLOG actions are necessary at end of REDO.
+ *
+ * The goal here is to make sure that we'll be able to recover properly if
+ * we crash again. If we choose to write a checkpoint, we'll write a shutdown
+ * checkpoint rather than an on-line one. This is not particularly critical,
+ * but since we may be assigning a new TLI, using a shutdown checkpoint allows
+ * us to have the rule that TLI only changes in shutdown checkpoints, which
+ * allows some extra error checking in xlog_redo.
+ */
+static void
+PerformRecoveryXLogAction(RecoveryXlogAction action)
+{
+	switch (action)
+	{
+		case RECOVERY_XLOG_NOTHING:
+			/* No REDO performed, hence nothing to do. */
+			break;
+
+		case RECOVERY_XLOG_WRITE_END_OF_RECOVERY:
+			/* Lightweight end-of-recovery record in lieu of checkpoint. */
+			CreateEndOfRecoveryRecord();
+			break;
+
+		case RECOVERY_XLOG_WRITE_CHECKPOINT:
+			/* Full checkpoint, when checkpointer is not running. */
+			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
+			break;
+
+		case RECOVERY_XLOG_REQUEST_CHECKPOINT:
+			/* Full checkpoint, when checkpointer is running. */
+			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+							  CHECKPOINT_IMMEDIATE |
+							  CHECKPOINT_WAIT);
+	}
+}
+
 /*
  * Is the system still in recovery?
  *
-- 
2.24.3 (Apple Git-128)