At Wed, 08 Jun 2022 18:15:09 +0900 (JST), Kyotaro Horiguchi 
<horikyota....@gmail.com> wrote in 
> At Tue, 07 Jun 2022 16:05:47 +0900 (JST), Kyotaro Horiguchi 
> <horikyota....@gmail.com> wrote in 
> > At Tue, 07 Jun 2022 12:39:38 +0900 (JST), Kyotaro Horiguchi 
> > <horikyota....@gmail.com> wrote in 
> > > One possible way to detect promotion reliably is to look into timeline
> > > history files. It is written immediately at promotion even on
> > > standbys.
> > 
> > The attached seems to work. It uses timeline history files to identify
> > the source timeline.  With this change pg_rewind no longer needs to
> > wait for the end-of-recovery checkpoint to finish.
> > 
> > (It lacks documentation and tests, but I'm not sure how we can test this
> > behavior.)
> 
> This is a revised version.
> 
> Revised getTimelineHistory()'s logic (refactored, and changed so that
> it doesn't pick up the wrong history files).
> 
> perform_rewind now always identifies endtli based on the source's
> timeline history.

There is no need to "search" the history files to identify it.  It must
be the latest timeline.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 69d6924b3a..7f752b2ed0 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -334,15 +334,6 @@ GRANT EXECUTE ON function 
pg_catalog.pg_read_binary_file(text, bigint, bigint, b
 </programlisting>
   </para>
 
-  <para>
-   When executing <application>pg_rewind</application> using an online
-   cluster as source which has been recently promoted, it is necessary
-   to execute a <command>CHECKPOINT</command> after promotion such that its
-   control file reflects up-to-date timeline information, which is used by
-   <application>pg_rewind</application> to check if the target cluster
-   can be rewound using the designated source cluster.
-  </para>
-
   <refsect2>
    <title>How It Works</title>
 
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 6cb288f099..2a407da1e4 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -309,9 +309,11 @@ sync_target_dir(void)
  * buffer is actually *filesize + 1. That's handy when reading a text file.
  * This function can be used to read binary files as well, you can just
  * ignore the zero-terminator in that case.
+ *
+ * If noerror is true, returns NULL when the file is not found.
  */
 char *
-slurpFile(const char *datadir, const char *path, size_t *filesize)
+slurpFile(const char *datadir, const char *path, size_t *filesize, bool 
noerror)
 {
        int                     fd;
        char       *buffer;
@@ -323,8 +325,13 @@ slurpFile(const char *datadir, const char *path, size_t 
*filesize)
        snprintf(fullpath, sizeof(fullpath), "%s/%s", datadir, path);
 
        if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) == -1)
+       {
+               if (noerror && errno == ENOENT)
+                       return NULL;
+
                pg_fatal("could not open file \"%s\" for reading: %m",
                                 fullpath);
+       }
 
        if (fstat(fd, &statbuf) < 0)
                pg_fatal("could not open file \"%s\" for reading: %m",
diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h
index 54a853bd42..92e19042cb 100644
--- a/src/bin/pg_rewind/file_ops.h
+++ b/src/bin/pg_rewind/file_ops.h
@@ -21,7 +21,8 @@ extern void create_target(file_entry_t *t);
 extern void remove_target(file_entry_t *t);
 extern void sync_target_dir(void);
 
-extern char *slurpFile(const char *datadir, const char *path, size_t 
*filesize);
+extern char *slurpFile(const char *datadir, const char *path, size_t *filesize,
+       bool noerror);
 
 typedef void (*process_file_callback_t) (const char *path, file_type_t type, 
size_t size, const char *link_target);
 extern void traverse_datadir(const char *datadir, process_file_callback_t 
callback);
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 011c9cce6e..751c96e6e4 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -68,7 +68,7 @@ static void libpq_queue_fetch_range(rewind_source *source, 
const char *path,
                                                                        off_t 
off, size_t len);
 static void libpq_finish_fetch(rewind_source *source);
 static char *libpq_fetch_file(rewind_source *source, const char *path,
-                                                         size_t *filesize);
+                                                         size_t *filesize, 
bool noerror);
 static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source);
 static void libpq_destroy(rewind_source *source);
 
@@ -620,9 +620,12 @@ appendArrayEscapedString(StringInfo buf, const char *str)
 
 /*
  * Fetch a single file as a malloc'd buffer.
+ *
+ * If noerror is true, returns NULL when the file is not found.
  */
 static char *
-libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+                                bool noerror)
 {
        PGconn     *conn = ((libpq_source *) source)->conn;
        PGresult   *res;
@@ -631,6 +634,34 @@ libpq_fetch_file(rewind_source *source, const char *path, 
size_t *filesize)
        const char *paramValues[1];
 
        paramValues[0] = path;
+
+       /*
+        * Check the existence of the file.  We do this before executing
+        * pg_read_binary_file() so that the server doesn't emit an error.
+        */
+       if (noerror)
+       {
+               res = PQexecParams(conn, "SELECT pg_stat_file($1, true)",
+                                                  1, NULL, paramValues, NULL, 
NULL, 1);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                       pg_fatal("could not stat remote file \"%s\": %s",
+                                        path, PQresultErrorMessage(res));
+               }
+
+               /* sanity check the result set */
+               if (PQntuples(res) != 1)
+                       pg_fatal("unexpected result set while stating remote 
file \"%s\"",
+                                        path);
+
+               /* Return NULL if the file was not found */
+               if (PQgetisnull(res, 0, 0))
+                       return NULL;
+
+               PQclear(res);
+       }
+
        res = PQexecParams(conn, "SELECT pg_read_binary_file($1)",
                                           1, NULL, paramValues, NULL, NULL, 1);
 
diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c
index 2e50485c39..fc2e1e9f11 100644
--- a/src/bin/pg_rewind/local_source.c
+++ b/src/bin/pg_rewind/local_source.c
@@ -28,7 +28,7 @@ typedef struct
 static void local_traverse_files(rewind_source *source,
                                                                 
process_file_callback_t callback);
 static char *local_fetch_file(rewind_source *source, const char *path,
-                                                         size_t *filesize);
+                                                         size_t *filesize, 
bool noerror);
 static void local_queue_fetch_file(rewind_source *source, const char *path,
                                                                   size_t len);
 static void local_queue_fetch_range(rewind_source *source, const char *path,
@@ -63,9 +63,11 @@ local_traverse_files(rewind_source *source, 
process_file_callback_t callback)
 }
 
 static char *
-local_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+local_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+       bool noerror)
 {
-       return slurpFile(((local_source *) source)->datadir, path, filesize);
+       return slurpFile(((local_source *) source)->datadir, path, filesize,
+                                        noerror);
 }
 
 /*
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 1ff8da1676..221e97c5d8 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -43,6 +43,8 @@ static void createBackupLabel(XLogRecPtr startpoint, 
TimeLineID starttli,
 
 static void digestControlFile(ControlFileData *ControlFile,
                                                          const char *content, 
size_t size);
+static TimeLineHistoryEntry *getTimelineHistory(ControlFileData *controlFile,
+                                                                               
                int *nentries);
 static void getRestoreCommand(const char *argv0);
 static void sanityChecks(void);
 static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
@@ -70,9 +72,13 @@ bool         do_sync = true;
 bool           restore_wal = false;
 
 /* Target history */
-TimeLineHistoryEntry *targetHistory;
+TimeLineHistoryEntry *targetHistory = NULL;
 int                    targetNentries;
 
+/* Source history */
+TimeLineHistoryEntry *sourceHistory = NULL;
+int                    sourceNentries;
+
 /* Progress counters */
 uint64         fetch_size;
 uint64         fetch_done;
@@ -141,6 +147,7 @@ main(int argc, char **argv)
        bool            rewind_needed;
        bool            writerecoveryconf = false;
        filemap_t  *filemap;
+       TimeLineID      source_tli;
 
        pg_logging_init(argv[0]);
        set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
@@ -311,7 +318,7 @@ main(int argc, char **argv)
         * need to make sure by themselves that the target cluster is in a clean
         * state.
         */
-       buffer = slurpFile(datadir_target, "global/pg_control", &size);
+       buffer = slurpFile(datadir_target, "global/pg_control", &size, false);
        digestControlFile(&ControlFile_target, buffer, size);
        pg_free(buffer);
 
@@ -321,25 +328,43 @@ main(int argc, char **argv)
        {
                ensureCleanShutdown(argv[0]);
 
-               buffer = slurpFile(datadir_target, "global/pg_control", &size);
+               buffer = slurpFile(datadir_target, "global/pg_control", &size, 
false);
                digestControlFile(&ControlFile_target, buffer, size);
                pg_free(buffer);
        }
 
-       buffer = source->fetch_file(source, "global/pg_control", &size);
+       buffer = source->fetch_file(source, "global/pg_control", &size, false);
        digestControlFile(&ControlFile_source, buffer, size);
        pg_free(buffer);
 
        sanityChecks();
 
+       /* Retrieve timelines for both source and target */
+       sourceHistory = getTimelineHistory(&ControlFile_source, 
&sourceNentries);
+       targetHistory = getTimelineHistory(&ControlFile_target, 
&targetNentries);
+
+       /*
+        * If the source has just been promoted but the end-of-recovery 
checkpoint
+        * has not completed, the source control file is slightly out of date 
for
+        * identifying the source's timeline.  Instead, look into timeline 
history,
+        * which is always up to date.
+        */
+       source_tli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+
+       if (sourceHistory[sourceNentries - 1].tli > source_tli)
+       {
+               pg_log_info("source's actual timeline ID (%d) is newer than 
control file (%d)",
+                                       sourceHistory[sourceNentries - 1].tli, 
source_tli);
+               source_tli = sourceHistory[sourceNentries - 1].tli;
+       }
+
        /*
         * Find the common ancestor timeline between the clusters.
         *
         * If both clusters are already on the same timeline, there's nothing to
         * do.
         */
-       if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
-               ControlFile_source.checkPointCopy.ThisTimeLineID)
+       if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli)
        {
                pg_log_info("source and target cluster are on the same 
timeline");
                rewind_needed = false;
@@ -581,7 +606,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
         * Fetch the control file from the source last. This ensures that the
         * minRecoveryPoint is up-to-date.
         */
-       buffer = source->fetch_file(source, "global/pg_control", &size);
+       buffer = source->fetch_file(source, "global/pg_control", &size, false);
        digestControlFile(&ControlFile_source_after, buffer, size);
        pg_free(buffer);
 
@@ -654,7 +679,22 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
                                pg_fatal("source system was in unexpected state 
at end of rewind");
 
                        endrec = source->get_current_wal_insert_lsn(source);
-                       endtli = 
ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+
+                       /*
+                        * Find the timeline ID corresponding to endrec on the 
source.
+                        *
+                        * In most cases we can use the TLI in the source 
control file, but
+                        * that is not the case after promotion until the 
end-of-recovery
+                        * checkpoint completes; until then the control file is 
too old for
+                        * this purpose.  Instead, use the latest timeline in 
the source's
+                        * history file.
+                        */
+                       if (!((sourceHistory[sourceNentries - 1].begin == 0 ||
+                                  sourceHistory[sourceNentries - 1].begin <= 
endrec) &&
+                                 sourceHistory[sourceNentries - 1].end == 0))
+                               pg_fatal("source server's current insert LSN 
was not on the latest timeline in history file");
+
+                       endtli = sourceHistory[sourceNentries - 1].tli;
                }
        }
        else
@@ -796,46 +836,83 @@ MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
 }
 
 /*
- * Retrieve timeline history for given control file which should behold
- * either source or target.
+ * Retrieve the latest timeline history for given control file which should
+ * behold either source or target.
+ *
+ * This works on the assumption that the timeline IDs of existing history files
+ * are contiguous up to the latest history file. This is true for
+ * recently-promoted servers.  See findNewestTimeLine() for this assumption.
  */
 static TimeLineHistoryEntry *
 getTimelineHistory(ControlFileData *controlFile, int *nentries)
 {
-       TimeLineHistoryEntry *history;
+       TimeLineHistoryEntry *history = NULL;
        TimeLineID      tli;
+       TimeLineID      probe_tli;
 
        tli = controlFile->checkPointCopy.ThisTimeLineID;
 
+       Assert(tli > 0);
+
+       /* Probe history files */
+       for (probe_tli = tli ;; probe_tli++)
+       {
+               char            path[MAXPGPATH];
+               char       *histfile;
+               TimeLineHistoryEntry *tmphistory;
+               int                     nent;
+               int                     i;
+
+               if (probe_tli < 2)
+                       continue;
+
+               TLHistoryFilePath(path, probe_tli);
+
+               /* Get history file from appropriate source */
+               if (controlFile == &ControlFile_source)
+                       histfile = source->fetch_file(source, path, NULL, true);
+               else if (controlFile == &ControlFile_target)
+                       histfile = slurpFile(datadir_target, path, NULL, true);
+               else
+                       pg_fatal("invalid control file");
+
+               /* no such history file, exit */
+               if (!histfile)
+                       break;
+
+               /* preserve the history if we're part of it */
+               tmphistory = rewind_parseTimeLineHistory(histfile, probe_tli, 
&nent);
+               pg_free(histfile);
+
+               for (i = 0 ; i < nent ; i++)
+               {
+                       if (tmphistory[i].tli == tli)
+                       {
+                               if (history)
+                                       pg_free(history);
+
+                               history = tmphistory;
+                               *nentries = nent;
+                               break;
+                       }
+               }
+               if (tmphistory != history)
+                       pg_free(tmphistory);
+       }
+
        /*
         * Timeline 1 does not have a history file, so there is no need to check
         * and fake an entry with infinite start and end positions.
         */
-       if (tli == 1)
+       if (!history)
        {
+               Assert (tli == 1);
+
                history = (TimeLineHistoryEntry *) 
pg_malloc(sizeof(TimeLineHistoryEntry));
                history->tli = tli;
                history->begin = history->end = InvalidXLogRecPtr;
                *nentries = 1;
        }
-       else
-       {
-               char            path[MAXPGPATH];
-               char       *histfile;
-
-               TLHistoryFilePath(path, tli);
-
-               /* Get history file from appropriate source */
-               if (controlFile == &ControlFile_source)
-                       histfile = source->fetch_file(source, path, NULL);
-               else if (controlFile == &ControlFile_target)
-                       histfile = slurpFile(datadir_target, path, NULL);
-               else
-                       pg_fatal("invalid control file");
-
-               history = rewind_parseTimeLineHistory(histfile, tli, nentries);
-               pg_free(histfile);
-       }
 
        if (debug)
        {
@@ -879,15 +956,9 @@ getTimelineHistory(ControlFileData *controlFile, int 
*nentries)
 static void
 findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
 {
-       TimeLineHistoryEntry *sourceHistory;
-       int                     sourceNentries;
        int                     i,
                                n;
 
-       /* Retrieve timelines for both source and target */
-       sourceHistory = getTimelineHistory(&ControlFile_source, 
&sourceNentries);
-       targetHistory = getTimelineHistory(&ControlFile_target, 
&targetNentries);
-
        /*
         * Trace the history forward, until we hit the timeline diverge. It may
         * still be possible that the source and target nodes used the same
@@ -910,7 +981,6 @@ findCommonAncestorTimeline(XLogRecPtr *recptr, int 
*tliIndex)
                *recptr = MinXLogRecPtr(sourceHistory[i].end, 
targetHistory[i].end);
                *tliIndex = i;
 
-               pg_free(sourceHistory);
                return;
        }
        else
diff --git a/src/bin/pg_rewind/rewind_source.h 
b/src/bin/pg_rewind/rewind_source.h
index 1310e86e75..6975848668 100644
--- a/src/bin/pg_rewind/rewind_source.h
+++ b/src/bin/pg_rewind/rewind_source.h
@@ -35,7 +35,7 @@ typedef struct rewind_source
         * handy for text files.
         */
        char       *(*fetch_file) (struct rewind_source *, const char *path,
-                                                          size_t *filesize);
+                                                          size_t *filesize, 
bool noerror);
 
        /*
         * Request to fetch (part of) a file in the source system, specified by 
an

Reply via email to