Hi, hackers

The issue here is pg_rewind looks into control file to determine the
soruce timeline, because the control file is not updated until the
first checkpoint ends after promotion finishes, even though file
blocks are already diverged.

Even in that case history file for the new timeline is already
created, so searching for the latest history file works.

I think this change is a good one because if I want
pg_rewind to run automatically after a promotion,
I don't have to wait for the checkpoint to complete.

The attached patch is Horiguchi-san's patch with
additional tests. The tests are based on James's tests,
"010_no_checkpoint_after_promotion.pl" tests that
pg_rewind is successfully executed without running
checkpoint after promote.

Best Regards,
Keisuke Kuroda
NTT COMWARE
diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 69d6924b3a..7f752b2ed0 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -334,15 +334,6 @@ GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text, bigint, bigint, b
 </programlisting>
   </para>
 
-  <para>
-   When executing <application>pg_rewind</application> using an online
-   cluster as source which has been recently promoted, it is necessary
-   to execute a <command>CHECKPOINT</command> after promotion such that its
-   control file reflects up-to-date timeline information, which is used by
-   <application>pg_rewind</application> to check if the target cluster
-   can be rewound using the designated source cluster.
-  </para>
-
   <refsect2>
    <title>How It Works</title>
 
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index db190bcba7..fe98f47519 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -309,9 +309,11 @@ sync_target_dir(void)
  * buffer is actually *filesize + 1. That's handy when reading a text file.
  * This function can be used to read binary files as well, you can just
  * ignore the zero-terminator in that case.
+ *
+ * If noerror is true, returns NULL when the file is not found.
  */
 char *
-slurpFile(const char *datadir, const char *path, size_t *filesize)
+slurpFile(const char *datadir, const char *path, size_t *filesize, bool noerror)
 {
 	int			fd;
 	char	   *buffer;
@@ -323,8 +325,13 @@ slurpFile(const char *datadir, const char *path, size_t *filesize)
 	snprintf(fullpath, sizeof(fullpath), "%s/%s", datadir, path);
 
 	if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) == -1)
+	{
+		if (noerror && errno == ENOENT)
+			return NULL;
+
 		pg_fatal("could not open file \"%s\" for reading: %m",
 				 fullpath);
+	}
 
 	if (fstat(fd, &statbuf) < 0)
 		pg_fatal("could not open file \"%s\" for reading: %m",
diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h
index e6277c4631..559787a072 100644
--- a/src/bin/pg_rewind/file_ops.h
+++ b/src/bin/pg_rewind/file_ops.h
@@ -21,7 +21,8 @@ extern void create_target(file_entry_t *entry);
 extern void remove_target(file_entry_t *entry);
 extern void sync_target_dir(void);
 
-extern char *slurpFile(const char *datadir, const char *path, size_t *filesize);
+extern char *slurpFile(const char *datadir, const char *path, size_t *filesize,
+       bool noerror);
 
 typedef void (*process_file_callback_t) (const char *path, file_type_t type, size_t size, const char *link_target);
 extern void traverse_datadir(const char *datadir, process_file_callback_t callback);
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 011c9cce6e..751c96e6e4 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -68,7 +68,7 @@ static void libpq_queue_fetch_range(rewind_source *source, const char *path,
 									off_t off, size_t len);
 static void libpq_finish_fetch(rewind_source *source);
 static char *libpq_fetch_file(rewind_source *source, const char *path,
-							  size_t *filesize);
+							  size_t *filesize, bool noerror);
 static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source);
 static void libpq_destroy(rewind_source *source);
 
@@ -620,9 +620,12 @@ appendArrayEscapedString(StringInfo buf, const char *str)
 
 /*
  * Fetch a single file as a malloc'd buffer.
+ *
+ * If noerror is true, returns NULL if pg_read_binary_file() failed.
  */
 static char *
-libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+				 bool noerror)
 {
 	PGconn	   *conn = ((libpq_source *) source)->conn;
 	PGresult   *res;
@@ -631,6 +634,34 @@ libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
 	const char *paramValues[1];
 
 	paramValues[0] = path;
+
+	/*
+	 * check the existence of the file. We do this before executing
+	 * pg_read_binary_file so that server doesn't emit an error
+	 */
+	if (noerror)
+	{
+		res = PQexecParams(conn, "SELECT pg_stat_file($1, true)",
+						   1, NULL, paramValues, NULL, NULL, 1);
+
+		if (PQresultStatus(res) != PGRES_TUPLES_OK)
+		{
+			pg_fatal("could not stat remote file \"%s\": %s",
+					 path, PQresultErrorMessage(res));
+		}
+
+		/* sanity check the result set */
+		if (PQntuples(res) != 1)
+			pg_fatal("unexpected result set while stating remote file \"%s\"",
+					 path);
+
+		/* Return NULL if the file was not found */
+		if (PQgetisnull(res, 0, 0))
+			return NULL;
+
+		PQclear(res);
+	}
+
 	res = PQexecParams(conn, "SELECT pg_read_binary_file($1)",
 					   1, NULL, paramValues, NULL, NULL, 1);
 
diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c
index 2e50485c39..fc2e1e9f11 100644
--- a/src/bin/pg_rewind/local_source.c
+++ b/src/bin/pg_rewind/local_source.c
@@ -28,7 +28,7 @@ typedef struct
 static void local_traverse_files(rewind_source *source,
 								 process_file_callback_t callback);
 static char *local_fetch_file(rewind_source *source, const char *path,
-							  size_t *filesize);
+							  size_t *filesize, bool noerror);
 static void local_queue_fetch_file(rewind_source *source, const char *path,
 								   size_t len);
 static void local_queue_fetch_range(rewind_source *source, const char *path,
@@ -63,9 +63,11 @@ local_traverse_files(rewind_source *source, process_file_callback_t callback)
 }
 
 static char *
-local_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+local_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+	bool noerror)
 {
-	return slurpFile(((local_source *) source)->datadir, path, filesize);
+	return slurpFile(((local_source *) source)->datadir, path, filesize,
+					 noerror);
 }
 
 /*
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 3cd77c09b1..75a7df0b1f 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -43,6 +43,8 @@ static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
 
 static void digestControlFile(ControlFileData *ControlFile,
 							  const char *content, size_t size);
+static TimeLineHistoryEntry *getTimelineHistory(ControlFileData *controlFile,
+												int *nentries);
 static void getRestoreCommand(const char *argv0);
 static void sanityChecks(void);
 static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
@@ -70,9 +72,13 @@ bool		do_sync = true;
 bool		restore_wal = false;
 
 /* Target history */
-TimeLineHistoryEntry *targetHistory;
+TimeLineHistoryEntry *targetHistory = NULL;
 int			targetNentries;
 
+/* Source history */
+TimeLineHistoryEntry *sourceHistory = NULL;
+int			sourceNentries;
+
 /* Progress counters */
 uint64		fetch_size;
 uint64		fetch_done;
@@ -141,6 +147,7 @@ main(int argc, char **argv)
 	bool		rewind_needed;
 	bool		writerecoveryconf = false;
 	filemap_t  *filemap;
+	TimeLineID	source_tli;
 
 	pg_logging_init(argv[0]);
 	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
@@ -311,7 +318,7 @@ main(int argc, char **argv)
 	 * need to make sure by themselves that the target cluster is in a clean
 	 * state.
 	 */
-	buffer = slurpFile(datadir_target, "global/pg_control", &size);
+	buffer = slurpFile(datadir_target, "global/pg_control", &size, false);
 	digestControlFile(&ControlFile_target, buffer, size);
 	pg_free(buffer);
 
@@ -321,25 +328,43 @@ main(int argc, char **argv)
 	{
 		ensureCleanShutdown(argv[0]);
 
-		buffer = slurpFile(datadir_target, "global/pg_control", &size);
+		buffer = slurpFile(datadir_target, "global/pg_control", &size, false);
 		digestControlFile(&ControlFile_target, buffer, size);
 		pg_free(buffer);
 	}
 
-	buffer = source->fetch_file(source, "global/pg_control", &size);
+	buffer = source->fetch_file(source, "global/pg_control", &size, false);
 	digestControlFile(&ControlFile_source, buffer, size);
 	pg_free(buffer);
 
 	sanityChecks();
 
+	/* Retrieve timelines for both source and target */
+	sourceHistory =	getTimelineHistory(&ControlFile_source, &sourceNentries);
+	targetHistory =	getTimelineHistory(&ControlFile_target, &targetNentries);
+
+	/*
+	 * If the source just has been promoted but the end-of-recovery checkpoint
+	 * has not completed, the soruce control file has a bit older content for
+	 * identifying the source's timeline.  Instead, look into timeline history,
+	 * which is always refreshed up-to-date.
+	 */
+	source_tli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+
+	if (sourceHistory[sourceNentries - 1].tli > source_tli)
+	{
+		pg_log_info("source's actual timeline ID (%d) is newer than control file (%d)",
+					sourceHistory[sourceNentries - 1].tli, source_tli);
+		source_tli = sourceHistory[sourceNentries - 1].tli;
+	}
+
 	/*
 	 * Find the common ancestor timeline between the clusters.
 	 *
 	 * If both clusters are already on the same timeline, there's nothing to
 	 * do.
 	 */
-	if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
-		ControlFile_source.checkPointCopy.ThisTimeLineID)
+	if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli)
 	{
 		pg_log_info("source and target cluster are on the same timeline");
 		rewind_needed = false;
@@ -581,7 +606,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 	 * Fetch the control file from the source last. This ensures that the
 	 * minRecoveryPoint is up-to-date.
 	 */
-	buffer = source->fetch_file(source, "global/pg_control", &size);
+	buffer = source->fetch_file(source, "global/pg_control", &size, false);
 	digestControlFile(&ControlFile_source_after, buffer, size);
 	pg_free(buffer);
 
@@ -654,7 +679,22 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 				pg_fatal("source system was in unexpected state at end of rewind");
 
 			endrec = source->get_current_wal_insert_lsn(source);
-			endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+
+			/*
+			 * Find the timeline ID corresponding to endrec on the source.
+			 *
+			 * In most cases we can use the TLI in the source control file, but
+			 * that is not the case after promotion until end-of-recovery
+			 * checkpoint completes, where the control file is a bit old for
+			 * this purpose.  It should be the latest timeline in the source's
+			 * history file.
+			 */
+			if (!((sourceHistory[sourceNentries - 1].begin == 0 ||
+				   sourceHistory[sourceNentries - 1].begin <= endrec) &&
+				  sourceHistory[sourceNentries - 1].end == 0))
+				pg_fatal("source server's current insert LSN was not on the latest timeline in history file");
+
+			endtli = sourceHistory[sourceNentries - 1].tli;
 		}
 	}
 	else
@@ -796,45 +836,82 @@ MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
 }
 
 /*
- * Retrieve timeline history for given control file which should behold
- * either source or target.
+ * Retrieve the latest timeline history for given control file which should
+ * behold either source or target.
+ *
+ * This works on the assumption that the timeline IDs of existing history files
+ * are contiguous up to the latest history file. This is true for
+ * recently-promoted servers.  See findNewestTimeLine() for this assumption.
  */
 static TimeLineHistoryEntry *
 getTimelineHistory(ControlFileData *controlFile, int *nentries)
 {
-	TimeLineHistoryEntry *history;
+	TimeLineHistoryEntry *history = NULL;
 	TimeLineID	tli;
+	TimeLineID	probe_tli;
 
 	tli = controlFile->checkPointCopy.ThisTimeLineID;
 
-	/*
-	 * Timeline 1 does not have a history file, so there is no need to check
-	 * and fake an entry with infinite start and end positions.
-	 */
-	if (tli == 1)
-	{
-		history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
-		history->tli = tli;
-		history->begin = history->end = InvalidXLogRecPtr;
-		*nentries = 1;
-	}
-	else
+	Assert(tli > 0);
+
+	/* Probe history files */
+	for (probe_tli = tli ;; probe_tli++)
 	{
 		char		path[MAXPGPATH];
 		char	   *histfile;
+		TimeLineHistoryEntry *tmphistory;
+		int			nent;
+		int			i;
+
+		if (probe_tli < 2)
+			continue;
 
-		TLHistoryFilePath(path, tli);
+		TLHistoryFilePath(path, probe_tli);
 
 		/* Get history file from appropriate source */
 		if (controlFile == &ControlFile_source)
-			histfile = source->fetch_file(source, path, NULL);
+			histfile = source->fetch_file(source, path, NULL, true);
 		else if (controlFile == &ControlFile_target)
-			histfile = slurpFile(datadir_target, path, NULL);
+			histfile = slurpFile(datadir_target, path, NULL, true);
 		else
 			pg_fatal("invalid control file");
 
-		history = rewind_parseTimeLineHistory(histfile, tli, nentries);
+		/* no such history file, exit */
+		if (!histfile)
+			break;
+
+		/* preserve the history if we're part of it */
+		tmphistory = rewind_parseTimeLineHistory(histfile, probe_tli, &nent);
 		pg_free(histfile);
+
+		for (i = 0 ; i < nent ; i++)
+		{
+			if (tmphistory[i].tli == tli)
+			{
+				if (history)
+					pg_free(history);
+
+				history = tmphistory;
+				*nentries = nent;
+				break;
+			}
+		}
+		if (tmphistory != history)
+			pg_free(tmphistory);
+	}
+
+	/*
+	 * Timeline 1 does not have a history file, so there is no need to check
+	 * and fake an entry with infinite start and end positions.
+	 */
+	if (!history)
+	{
+		Assert (tli == 1);
+
+		history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
+		history->tli = tli;
+		history->begin = history->end = InvalidXLogRecPtr;
+		*nentries = 1;
 	}
 
 	if (debug)
@@ -879,15 +956,9 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
 static void
 findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
 {
-	TimeLineHistoryEntry *sourceHistory;
-	int			sourceNentries;
 	int			i,
 				n;
 
-	/* Retrieve timelines for both source and target */
-	sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
-	targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
-
 	/*
 	 * Trace the history forward, until we hit the timeline diverge. It may
 	 * still be possible that the source and target nodes used the same
@@ -910,7 +981,6 @@ findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
 		*recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
 		*tliIndex = i;
 
-		pg_free(sourceHistory);
 		return;
 	}
 	else
diff --git a/src/bin/pg_rewind/rewind_source.h b/src/bin/pg_rewind/rewind_source.h
index 1310e86e75..6975848668 100644
--- a/src/bin/pg_rewind/rewind_source.h
+++ b/src/bin/pg_rewind/rewind_source.h
@@ -35,7 +35,7 @@ typedef struct rewind_source
 	 * handy for text files.
 	 */
 	char	   *(*fetch_file) (struct rewind_source *, const char *path,
-							   size_t *filesize);
+							   size_t *filesize, bool noerror);
 
 	/*
 	 * Request to fetch (part of) a file in the source system, specified by an
diff --git a/src/bin/pg_rewind/t/010_no_checkpoint_after_promotion.pl b/src/bin/pg_rewind/t/010_no_checkpoint_after_promotion.pl
new file mode 100644
index 0000000000..a8512c1db5
--- /dev/null
+++ b/src/bin/pg_rewind/t/010_no_checkpoint_after_promotion.pl
@@ -0,0 +1,76 @@
+
+# Copyright (c) 2021-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use RewindTest;
+
+RewindTest::setup_cluster("remote");
+RewindTest::start_primary();
+
+# Create a test table and insert a row in primary.
+primary_psql("CREATE TABLE tbl1 (d text)");
+primary_psql("INSERT INTO tbl1 VALUES ('in primary')");
+
+# This test table will be used to test truncation, i.e. the table
+# is extended in the old primary after promotion
+primary_psql("CREATE TABLE trunc_tbl (d text)");
+primary_psql("INSERT INTO trunc_tbl VALUES ('in primary')");
+
+# This test table will be used to test the "copy-tail" case, i.e. the
+# table is truncated in the old primary after promotion
+primary_psql("CREATE TABLE tail_tbl (id integer, d text)");
+primary_psql("INSERT INTO tail_tbl VALUES (0, 'in primary')");
+
+# This test table is dropped in the old primary after promotion.
+primary_psql("CREATE TABLE drop_tbl (d text)");
+primary_psql("INSERT INTO drop_tbl VALUES ('in primary')");
+
+primary_psql("CHECKPOINT");
+
+RewindTest::create_standby("remote");
+
+# Insert additional data on primary that will be replicated to standby
+primary_psql("INSERT INTO tbl1 values ('in primary, before promotion')");
+primary_psql(
+  "INSERT INTO trunc_tbl values ('in primary, before promotion')");
+primary_psql(
+  "INSERT INTO tail_tbl SELECT g, 'in primary, before promotion: ' || g FROM generate_series(1, 10000) g"
+);
+
+primary_psql('CHECKPOINT');
+
+RewindTest::promote_standby('skip_checkpoint' => 1);
+
+# Insert a row in the old primary. This causes the primary and standby
+# to have "diverged", it's no longer possible to just apply the
+# standy's logs over primary directory - you need to rewind.
+primary_psql("INSERT INTO tbl1 VALUES ('in primary, after promotion')");
+
+# Also insert a new row in the standby, which won't be present in the
+# old primary.
+standby_psql("INSERT INTO tbl1 VALUES ('in standby, after promotion')");
+
+$node_primary->stop;
+
+my $primary_pgdata = $node_primary->data_dir;
+my $standby_connstr = $node_standby->connstr('postgres');
+command_checks_all(
+  [
+    'pg_rewind',       '--dry-run',
+    "--source-server", $standby_connstr,
+    '--target-pgdata', $primary_pgdata,
+    '--no-sync',       '--no-ensure-shutdown'
+  ],
+  0,
+  [],
+  [qr/pg_rewind: source's actual timeline ID \(2\) is newer than control file \(1\)/],
+  'pg_rewind no checkpoint after promotion');
+
+done_testing();
diff --git a/src/bin/pg_rewind/t/RewindTest.pm b/src/bin/pg_rewind/t/RewindTest.pm
index 98b66b01f8..89fddc1656 100644
--- a/src/bin/pg_rewind/t/RewindTest.pm
+++ b/src/bin/pg_rewind/t/RewindTest.pm
@@ -188,6 +188,11 @@ primary_conninfo='$connstr_primary'
 
 sub promote_standby
 {
+	my (%options) = (
+		'skip_checkpoint' => 0,
+		@_
+	);
+
 	#### Now run the test-specific parts to run after standby has been started
 	# up standby
 
@@ -198,13 +203,16 @@ sub promote_standby
 	# the primary out-of-sync with the standby.
 	$node_standby->promote;
 
-	# Force a checkpoint after the promotion. pg_rewind looks at the control
-	# file to determine what timeline the server is on, and that isn't updated
-	# immediately at promotion, but only at the next checkpoint. When running
-	# pg_rewind in remote mode, it's possible that we complete the test steps
-	# after promotion so quickly that when pg_rewind runs, the standby has not
-	# performed a checkpoint after promotion yet.
-	standby_psql("checkpoint");
+	unless ($options{'skip_checkpoint'})
+	{
+		# Force a checkpoint after the promotion. pg_rewind looks at the control
+		# file to determine what timeline the server is on, and that isn't updated
+		# immediately at promotion, but only at the next checkpoint. When running
+		# pg_rewind in remote mode, it's possible that we complete the test steps
+		# after promotion so quickly that when pg_rewind runs, the standby has not
+		# performed a checkpoint after promotion yet.
+		standby_psql("checkpoint");
+	}
 
 	return;
 }

Reply via email to