From a1cc55a3c6d2acaecf627153b2dd8858d29be1de Mon Sep 17 00:00:00 2001
From: John Hsu <johnyvr@gmail.com>
Date: Wed, 8 Oct 2025 21:20:59 +0000
Subject: [PATCH v8] Avoid copying WAL segments before divergence to speed up
 pg_rewind

Adds a check to avoid copying any WAL segment files from source
to target if they are common between both servers before the
point of WAL divergence during pg_rewind.
All WAL files that exist on both source and target and that fall
before the segment containing the first diverged LSN can safely be
skipped when copying to the target, as they have been replicated
from the original primary.

Author: John Hsu <johnhyvr@gmail.com>
Co-authored-by: Justin Kwan <justinpkwan@outlook.com>
---
 src/bin/pg_rewind/filemap.c                   | 246 +++++++++++-------
 src/bin/pg_rewind/filemap.h                   |  12 +-
 src/bin/pg_rewind/pg_rewind.c                 |   6 +-
 .../t/011_avoid_copying_common_wals.pl        |  87 +++++++
 4 files changed, 254 insertions(+), 97 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/011_avoid_copying_common_wals.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index c933871ca9f..a4162bce24c 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -55,7 +55,6 @@
 
 static filehash_hash *filehash;
 
-static bool isRelDataFile(const char *path);
 static char *datasegpath(RelFileLocator rlocator, ForkNumber forknum,
 						 BlockNumber segno);
 
@@ -199,6 +198,106 @@ filehash_init(void)
 	filehash = filehash_create(FILEHASH_INITIAL_SIZE, NULL);
 }
 
+/*
+ * Classify a file by its path: WAL segment, relation data file, or other.
+ *
+ * For our purposes, only files belonging to the main fork are considered
+ * relation files. Other forks are always copied in toto, because we cannot
+ * reliably track changes to them, because WAL only contains block references
+ * for the main fork.
+ */
+static file_content_type_t
+getFileContentType(const char *path)
+{
+	RelFileLocator rlocator;
+	unsigned int segNo;
+	int nmatch;
+	bool matched;
+
+	/* Anything under pg_wal/ is either a WAL segment or "other" */
+	if (strncmp("pg_wal/", path, 7) == 0)
+	{
+		const char *filename = path + 7; /* Skip "pg_wal/" */
+
+		if (IsXLogFileName(filename))
+			return FILE_CONTENT_TYPE_WAL;
+		else
+			return FILE_CONTENT_TYPE_OTHER;
+	}
+
+	/*----
+	 * Relation data files can be in one of the following directories:
+	 *
+	 * global/
+	 *		shared relations
+	 *
+	 * base/<db oid>/
+	 *		regular relations, default tablespace
+	 *
+	 * pg_tblspc/<tblspc oid>/<tblspc version>/
+	 *		within a non-default tablespace (the name of the directory
+	 *		depends on version)
+	 *
+	 * And the relation data files themselves have a filename like:
+	 *
+	 * <oid>.<segment number>
+	 *
+	 *----
+	 */
+	rlocator.spcOid = InvalidOid;
+	rlocator.dbOid = InvalidOid;
+	rlocator.relNumber = InvalidRelFileNumber;
+	segNo = 0;
+	matched = false;
+
+	nmatch = sscanf(path, "global/%u.%u", &rlocator.relNumber, &segNo);
+	if (nmatch == 1 || nmatch == 2)
+	{
+		rlocator.spcOid = GLOBALTABLESPACE_OID;
+		rlocator.dbOid = 0;
+		matched = true;
+	}
+	else
+	{
+		nmatch = sscanf(path, "base/%u/%u.%u",
+						&rlocator.dbOid, &rlocator.relNumber, &segNo);
+		if (nmatch == 2 || nmatch == 3)
+		{
+			rlocator.spcOid = DEFAULTTABLESPACE_OID;
+			matched = true;
+		}
+		else
+		{
+			nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u",
+							&rlocator.spcOid, &rlocator.dbOid, &rlocator.relNumber,
+							&segNo);
+			if (nmatch == 3 || nmatch == 4)
+				matched = true;
+		}
+	}
+
+	/*
+	 * The sscanf tests above can match files that have extra characters at
+	 * the end. To eliminate such cases, cross-check that datasegpath()
+	 * creates the exact same filename, when passed the RelFileLocator
+	 * information we extracted from the filename.
+	 */
+	if (matched)
+	{
+		char *check_path = datasegpath(rlocator, MAIN_FORKNUM, segNo);
+
+		if (strcmp(check_path, path) != 0)
+			matched = false;
+
+		pfree(check_path);
+	}
+
+	if (matched)
+		return FILE_CONTENT_TYPE_RELATION;
+
+	return FILE_CONTENT_TYPE_OTHER;
+}
+
 /* Look up entry for 'path', creating a new one if it doesn't exist */
 static file_entry_t *
 insert_filehash_entry(const char *path)
@@ -210,7 +309,7 @@ insert_filehash_entry(const char *path)
 	if (!found)
 	{
 		entry->path = pg_strdup(path);
-		entry->isrelfile = isRelDataFile(path);
+		entry->content_type = getFileContentType(path);
 
 		entry->target_exists = false;
 		entry->target_type = FILE_TYPE_UNDEFINED;
@@ -294,7 +393,7 @@ process_source_file(const char *path, file_type_t type, size_t size,
 	 * sanity check: a filename that looks like a data file better be a
 	 * regular file
 	 */
-	if (type != FILE_TYPE_REGULAR && isRelDataFile(path))
+	if (type != FILE_TYPE_REGULAR && getFileContentType(path) == FILE_CONTENT_TYPE_RELATION)
 		pg_fatal("data file \"%s\" in source is not a regular file", path);
 
 	/* Remember this source file */
@@ -383,7 +482,7 @@ process_target_wal_block_change(ForkNumber forknum, RelFileLocator rlocator,
 	 */
 	if (entry)
 	{
-		Assert(entry->isrelfile);
+		Assert(entry->content_type == FILE_CONTENT_TYPE_RELATION);
 
 		if (entry->target_exists)
 		{
@@ -559,92 +658,6 @@ print_filemap(filemap_t *filemap)
 	fflush(stdout);
 }
 
-/*
- * Does it look like a relation data file?
- *
- * For our purposes, only files belonging to the main fork are considered
- * relation files. Other forks are always copied in toto, because we cannot
- * reliably track changes to them, because WAL only contains block references
- * for the main fork.
- */
-static bool
-isRelDataFile(const char *path)
-{
-	RelFileLocator rlocator;
-	unsigned int segNo;
-	int			nmatch;
-	bool		matched;
-
-	/*----
-	 * Relation data files can be in one of the following directories:
-	 *
-	 * global/
-	 *		shared relations
-	 *
-	 * base/<db oid>/
-	 *		regular relations, default tablespace
-	 *
-	 * pg_tblspc/<tblspc oid>/<tblspc version>/
-	 *		within a non-default tablespace (the name of the directory
-	 *		depends on version)
-	 *
-	 * And the relation data files themselves have a filename like:
-	 *
-	 * <oid>.<segment number>
-	 *
-	 *----
-	 */
-	rlocator.spcOid = InvalidOid;
-	rlocator.dbOid = InvalidOid;
-	rlocator.relNumber = InvalidRelFileNumber;
-	segNo = 0;
-	matched = false;
-
-	nmatch = sscanf(path, "global/%u.%u", &rlocator.relNumber, &segNo);
-	if (nmatch == 1 || nmatch == 2)
-	{
-		rlocator.spcOid = GLOBALTABLESPACE_OID;
-		rlocator.dbOid = 0;
-		matched = true;
-	}
-	else
-	{
-		nmatch = sscanf(path, "base/%u/%u.%u",
-						&rlocator.dbOid, &rlocator.relNumber, &segNo);
-		if (nmatch == 2 || nmatch == 3)
-		{
-			rlocator.spcOid = DEFAULTTABLESPACE_OID;
-			matched = true;
-		}
-		else
-		{
-			nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u",
-							&rlocator.spcOid, &rlocator.dbOid, &rlocator.relNumber,
-							&segNo);
-			if (nmatch == 3 || nmatch == 4)
-				matched = true;
-		}
-	}
-
-	/*
-	 * The sscanf tests above can match files that have extra characters at
-	 * the end. To eliminate such cases, cross-check that GetRelationPath
-	 * creates the exact same filename, when passed the RelFileLocator
-	 * information we extracted from the filename.
-	 */
-	if (matched)
-	{
-		char	   *check_path = datasegpath(rlocator, MAIN_FORKNUM, segNo);
-
-		if (strcmp(check_path, path) != 0)
-			matched = false;
-
-		pfree(check_path);
-	}
-
-	return matched;
-}
-
 /*
  * A helper function to create the path of a relation file and segment.
  *
@@ -693,11 +706,45 @@ final_filemap_cmp(const void *a, const void *b)
 		return strcmp(fa->path, fb->path);
 }
 
+/*
+ * Decide what to do with a WAL segment file based on its position
+ * relative to the point of divergence.  'fname' is the segment's bare
+ * file name, which is parsed for its timeline and segment number.  The
+ * caller is responsible for ensuring the file exists on both sides.
+ */
+static file_action_t
+decide_wal_file_action(const char *fname, XLogSegNo last_common_segno,
+					   size_t source_size, size_t target_size)
+{
+	TimeLineID file_tli;
+	XLogSegNo file_segno;
+
+	/* Parse the segment number out of the file name; the TLI is unused */
+	XLogFromFileName(fname, &file_tli, &file_segno, WalSegSz);
+
+	/*
+	 * Skip segments that lie strictly before 'last_common_segno'.
+	 *
+	 * Such a segment exists on both source and target (see the header
+	 * comment), so only segments at or after 'last_common_segno' are
+	 * copied from the source.
+	 *
+	 * If the sizes differ, copy the file anyway as a sanity measure.
+	 */
+	if (file_segno < last_common_segno && source_size == target_size)
+	{
+		pg_log_debug("WAL segment \"%s\" not copied to target", fname);
+		return FILE_ACTION_NONE;
+	}
+
+	return FILE_ACTION_COPY;
+}
+
 /*
  * Decide what action to perform to a file.
  */
 static file_action_t
-decide_file_action(file_entry_t *entry)
+decide_file_action(file_entry_t *entry, XLogSegNo last_common_segno)
 {
 	const char *path = entry->path;
 
@@ -799,7 +846,18 @@ decide_file_action(file_entry_t *entry)
 			return FILE_ACTION_NONE;
 
 		case FILE_TYPE_REGULAR:
-			if (!entry->isrelfile)
+			if (entry->content_type == FILE_CONTENT_TYPE_WAL)
+			{
+				/* Handle WAL segment file */
+				const char *filename = last_dir_separator(entry->path);
+				if (filename == NULL)
+					filename = entry->path;
+				else
+					filename++; /* Skip the separator */
+
+				return decide_wal_file_action(filename, last_common_segno, entry->source_size, entry->target_size);
+			}
+			else if (entry->content_type != FILE_CONTENT_TYPE_RELATION)
 			{
 				/*
 				 * It's a non-data file that we have no special processing
@@ -858,7 +916,7 @@ decide_file_action(file_entry_t *entry)
  * should be executed.
  */
 filemap_t *
-decide_file_actions(void)
+decide_file_actions(XLogSegNo last_common_segno)
 {
 	int			i;
 	filehash_iterator it;
@@ -868,7 +926,7 @@ decide_file_actions(void)
 	filehash_start_iterate(filehash, &it);
 	while ((entry = filehash_iterate(filehash, &it)) != NULL)
 	{
-		entry->action = decide_file_action(entry);
+		entry->action = decide_file_action(entry, last_common_segno);
 	}
 
 	/*
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index df78a02e3da..5145f0b4c46 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -11,6 +11,7 @@
 #include "datapagemap.h"
 #include "storage/block.h"
 #include "storage/relfilelocator.h"
+#include "access/xlogdefs.h"
 
 /* these enum values are sorted in the order we want actions to be processed */
 typedef enum
@@ -36,6 +37,13 @@ typedef enum
 	FILE_TYPE_SYMLINK,
 } file_type_t;
 
+typedef enum
+{
+	FILE_CONTENT_TYPE_OTHER = 0,
+	FILE_CONTENT_TYPE_RELATION,
+	FILE_CONTENT_TYPE_WAL
+} file_content_type_t;
+
 /*
  * For every file found in the local or remote system, we have a file entry
  * that contains information about the file on both systems.  For relation
@@ -51,7 +59,7 @@ typedef struct file_entry_t
 	uint32		status;			/* hash status */
 
 	const char *path;
-	bool		isrelfile;		/* is it a relation data file? */
+	file_content_type_t content_type;
 
 	/*
 	 * Status of the file in the target.
@@ -106,7 +114,7 @@ extern void process_target_wal_block_change(ForkNumber forknum,
 											RelFileLocator rlocator,
 											BlockNumber blkno);
 
-extern filemap_t *decide_file_actions(void);
+extern filemap_t *decide_file_actions(XLogSegNo last_common_segno);
 extern void calculate_totals(filemap_t *filemap);
 extern void print_filemap(filemap_t *filemap);
 
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 0c68dd4235e..318d0a27184 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -147,6 +147,7 @@ main(int argc, char **argv)
 	TimeLineID	source_tli;
 	TimeLineID	target_tli;
 	XLogRecPtr	target_wal_endrec;
+	XLogSegNo	last_common_segno;
 	size_t		size;
 	char	   *buffer;
 	bool		no_ensure_shutdown = false;
@@ -397,6 +398,9 @@ main(int argc, char **argv)
 					LSN_FORMAT_ARGS(divergerec),
 					targetHistory[lastcommontliIndex].tli);
 
+		/* Convert divergence LSN to segment number */
+		XLByteToSeg(divergerec, last_common_segno, ControlFile_target.xlog_seg_size);
+
 		/*
 		 * Don't need the source history anymore. The target history is still
 		 * needed by the routines in parsexlog.c, when we read the target WAL.
@@ -492,7 +496,7 @@ main(int argc, char **argv)
 	 * We have collected all information we need from both systems. Decide
 	 * what to do with each file.
 	 */
-	filemap = decide_file_actions();
+	filemap = decide_file_actions(last_common_segno);
 	if (showprogress)
 		calculate_totals(filemap);
 
diff --git a/src/bin/pg_rewind/t/011_avoid_copying_common_wals.pl b/src/bin/pg_rewind/t/011_avoid_copying_common_wals.pl
new file mode 100644
index 00000000000..d5fa28b1f49
--- /dev/null
+++ b/src/bin/pg_rewind/t/011_avoid_copying_common_wals.pl
@@ -0,0 +1,87 @@
+# Copyright (c) 2021-2025, PostgreSQL Global Development Group
+#
+# Test the situation where the source and target data directories
+# contain the same WAL files
+#
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Utils;
+use Test::More;
+use File::Spec;
+use File::stat qw(stat);
+
+use FindBin;
+use lib $FindBin::RealBin;
+use RewindTest;
+use Time::HiRes qw(usleep);
+
+RewindTest::setup_cluster();
+RewindTest::start_primary();
+
+RewindTest::create_standby();
+
+# advance WAL on primary; this WAL segment will be common between both
+RewindTest::primary_psql("CREATE TABLE t(a int)");
+RewindTest::primary_psql("INSERT INTO t VALUES(0)");
+
+# Common segment that pg_rewind should skip when copying
+my $wal_seg_skipped = $node_primary->safe_psql('postgres', 'SELECT pg_walfile_name(pg_current_wal_lsn())');
+
+RewindTest::primary_psql("SELECT pg_switch_wal()");
+
+# last common checkpoint
+RewindTest::primary_psql("CHECKPOINT");
+
+RewindTest::promote_standby;
+
+my $wal_seg_copied = $node_standby->safe_psql('postgres', 'SELECT pg_walfile_name(pg_current_wal_lsn())');
+
+# Get stat info for the WAL file that should be skipped
+my $wal_skipped_path = File::Spec->catfile($node_primary->data_dir, 'pg_wal', $wal_seg_skipped);
+my $wal_skipped_stat = stat($wal_skipped_path);
+defined($wal_skipped_stat) or die("unable to stat $wal_skipped_path");
+
+# Store modification time for later comparison
+my $wal_seg_skipped_mod_time = $wal_skipped_stat->mtime;
+
+# Verify that the WAL segment on new timeline does not exist in target before rewind
+my $wal_copied_path = File::Spec->catfile($node_primary->data_dir, 'pg_wal', $wal_seg_copied);
+my $wal_copied_stat = stat($wal_copied_path);
+ok(!defined($wal_copied_stat), "WAL segment $wal_seg_copied should not exist in target before rewind");
+
+
+$node_standby->stop();
+$node_primary->stop();
+
+# Sleep a full second (in microseconds): stat() mtime is in whole seconds
+usleep(1_000_000);
+
+command_checks_all(
+	[
+		'pg_rewind', '--debug',
+		'--source-pgdata' => $node_standby->data_dir,
+		'--target-pgdata' => $node_primary->data_dir,
+		'--no-sync',
+	],
+	0,
+	[qr//
+	],
+	[qr/WAL segment \"$wal_seg_skipped\" not copied to target/],
+	'run pg_rewind'
+);
+
+# Verify that the copied WAL segment now exists in target
+$wal_copied_stat = stat($wal_copied_path);
+ok(defined($wal_copied_stat), "WAL segment $wal_seg_copied should exist in target after rewind");
+
+# Get current modification time of the skipped WAL segment
+my $wal_skipped_stat_after = stat($wal_skipped_path);
+defined($wal_skipped_stat_after) or die("unable to stat $wal_skipped_path after rewind");
+my $wal_seg_latest_skipped_mod_time = $wal_skipped_stat_after->mtime;
+
+# Validate that the modification time hasn't changed.
+is($wal_seg_latest_skipped_mod_time, $wal_seg_skipped_mod_time,
+   "WAL segment $wal_seg_skipped modification time should be unchanged (not overwritten)");
+
+done_testing();
-- 
2.50.1

