From 9418668687a4ecf36531c2999efde29248488ae3 Mon Sep 17 00:00:00 2001
From: srinathv2 <srinath2133@gmail.com>
Date: Sat, 6 Sep 2025 21:15:57 +0530
Subject: [PATCH 1/1] pg_rewind: ignore shutdown-only WAL when determining
 end-of-WAL

Previously, pg_rewind determined the end-of-WAL on the target by using
the last shutdown checkpoint (or minRecoveryPoint for a standby). This
caused false positives in scenarios where the old primary was shut down
after a failover: the only WAL record generated was a shutdown checkpoint,
while the new primary and old primary still contained identical data.

In such cases, pg_rewind incorrectly concluded that
    if (target_wal_endrec > divergerec) rewind_needed = true;
and performed a rewind even though no real changes existed after the
divergence point.

With this patch, pg_rewind now scans backward from the last checkpoint
to locate the most recent valid WAL record that is not a shutdown
checkpoint or XLOG switch. As a result, a rewind is only required when
the target contains actual changes past the divergence point, avoiding
unnecessary rewind operations in clean failover scenarios.
---
 src/bin/pg_rewind/parsexlog.c | 36 ++++++++++++++++++++++++++++++-----
 src/bin/pg_rewind/pg_rewind.c | 33 ++++++++++++++++++++++----------
 src/bin/pg_rewind/pg_rewind.h |  2 +-
 3 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 8f4b282c6b1..442515249f4 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -117,11 +117,15 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
 }
 
 /*
- * Reads one WAL record. Returns the end position of the record, without
- * doing anything with the record itself.
+ * Find the last valid WAL record after the divergence point.
+ *
+ * Skips over records such as shutdown checkpoints and XLOG
+ * switch records, which otherwise could make pg_rewind think a
+ * rewind is required even when no real changes happened after failover.
+ * Returns the end position of the last meaningful record.
  */
 XLogRecPtr
-readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
+findLastValidRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
 			  const char *restoreCommand)
 {
 	XLogRecord *record;
@@ -129,6 +133,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
 	char	   *errormsg;
 	XLogPageReadPrivate private;
 	XLogRecPtr	endptr;
+	uint8		info;
 
 	private.tliIndex = tliIndex;
 	private.restoreCommand = restoreCommand;
@@ -138,16 +143,37 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
 	if (xlogreader == NULL)
 		pg_fatal("out of memory while allocating a WAL reading processor");
 
+	for (;;)
+	{
+		XLogBeginRead(xlogreader, ptr);
+		record = XLogReadRecord(xlogreader, &errormsg);
+		if (record == NULL)
+		{
+			if (errormsg)
+				pg_fatal("could not read WAL record at %X/%08X: %s",
+						LSN_FORMAT_ARGS(ptr), errormsg);
+			else
+				pg_fatal("could not read WAL record at %X/%08X",
+						LSN_FORMAT_ARGS(ptr));
+		}
+		ptr = record->xl_prev;
+		info = record->xl_info & ~XLR_INFO_MASK;
+		if((info != XLOG_CHECKPOINT_SHUTDOWN) && (info != XLOG_SWITCH))
+		{
+			break;
+		}
+	}
+	ptr = xlogreader->EndRecPtr;
 	XLogBeginRead(xlogreader, ptr);
 	record = XLogReadRecord(xlogreader, &errormsg);
 	if (record == NULL)
 	{
 		if (errormsg)
 			pg_fatal("could not read WAL record at %X/%08X: %s",
-					 LSN_FORMAT_ARGS(ptr), errormsg);
+					LSN_FORMAT_ARGS(ptr), errormsg);
 		else
 			pg_fatal("could not read WAL record at %X/%08X",
-					 LSN_FORMAT_ARGS(ptr));
+					LSN_FORMAT_ARGS(ptr));
 	}
 	endptr = xlogreader->EndRecPtr;
 
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 0c68dd4235e..c97789584d7 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -405,16 +405,29 @@ main(int argc, char **argv)
 
 
 		/*
-		 * Determine the end-of-WAL on the target.
-		 *
-		 * The WAL ends at the last shutdown checkpoint, or at
-		 * minRecoveryPoint if it was a standby. (If we supported rewinding a
-		 * server that was not shut down cleanly, we would need to replay
-		 * until we reach the first invalid record, like crash recovery does.)
-		 */
-
-		/* read the checkpoint record on the target to see where it ends. */
-		chkptendrec = readOneRecord(datadir_target,
+		* Determine the effective end-of-WAL on the target.
+		*
+		* Previously, this was taken directly from the last shutdown checkpoint,
+		* or from minRecoveryPoint if the server was a standby. However, this
+		* approach can falsely indicate divergence: when the old primary is shut
+		* down after promoting a standby, the only WAL record generated on the
+		* old primary is a shutdown checkpoint. In such cases, both clusters have
+		* identical data, yet the presence of that extra checkpoint record makes
+		* pg_rewind believe the target WAL extends past the divergence point:
+		*
+		*     if (target_wal_endrec > divergerec)
+		*         rewind_needed = true;
+		*
+		* That sets rewind_needed = true even though no user data changes exist.
+		*
+		* To avoid this, we no longer treat a plain shutdown checkpoint
+		* as a meaningful record when determining end-of-WAL. We instead
+		* scan backward to the last valid WAL record *after* divergence,
+		* skipping over shutdown-only artifacts. This ensures rewind is only
+		* triggered if there are actual changes on the target after divergence.
+		*/
+
+		chkptendrec = findLastValidRecord(datadir_target,
 									ControlFile_target.checkPoint,
 									targetNentries - 1,
 									restore_command);
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index 9cea144d2b2..304c9cd5ca9 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -40,7 +40,7 @@ extern void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr,
 							   XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
 							   XLogRecPtr *lastchkptredo,
 							   const char *restoreCommand);
-extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
+extern XLogRecPtr findLastValidRecord(const char *datadir, XLogRecPtr ptr,
 								int tliIndex, const char *restoreCommand);
 
 /* in pg_rewind.c */
-- 
2.43.0

