At Tue, 07 Jun 2022 12:39:38 +0900 (JST), Kyotaro Horiguchi 
<horikyota....@gmail.com> wrote in 
> One possible way to detect promotion reliably is to look into timeline
> history files. It is written immediately at promotion even on
> standbys.

The attached seems to work. It uses timeline history files to identify
the source timeline.  With this change pg_waldump no longer needs to
wait for the end-of-recovery checkpoint to finish.

(It lacks documentation and tests, and I'm not sure how we can test
this behavior.)

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 6cb288f099..2a407da1e4 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -309,9 +309,11 @@ sync_target_dir(void)
  * buffer is actually *filesize + 1. That's handy when reading a text file.
  * This function can be used to read binary files as well, you can just
  * ignore the zero-terminator in that case.
+ *
+ * If noerror is true, returns NULL when the file is not found.
  */
 char *
-slurpFile(const char *datadir, const char *path, size_t *filesize)
+slurpFile(const char *datadir, const char *path, size_t *filesize, bool 
noerror)
 {
        int                     fd;
        char       *buffer;
@@ -323,8 +325,13 @@ slurpFile(const char *datadir, const char *path, size_t 
*filesize)
        snprintf(fullpath, sizeof(fullpath), "%s/%s", datadir, path);
 
        if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) == -1)
+       {
+               if (noerror && errno == ENOENT)
+                       return NULL;
+
                pg_fatal("could not open file \"%s\" for reading: %m",
                                 fullpath);
+       }
 
        if (fstat(fd, &statbuf) < 0)
                pg_fatal("could not open file \"%s\" for reading: %m",
diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h
index 54a853bd42..92e19042cb 100644
--- a/src/bin/pg_rewind/file_ops.h
+++ b/src/bin/pg_rewind/file_ops.h
@@ -21,7 +21,8 @@ extern void create_target(file_entry_t *t);
 extern void remove_target(file_entry_t *t);
 extern void sync_target_dir(void);
 
-extern char *slurpFile(const char *datadir, const char *path, size_t 
*filesize);
+extern char *slurpFile(const char *datadir, const char *path, size_t *filesize,
+       bool noerror);
 
 typedef void (*process_file_callback_t) (const char *path, file_type_t type, 
size_t size, const char *link_target);
 extern void traverse_datadir(const char *datadir, process_file_callback_t 
callback);
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 011c9cce6e..92067d4f2c 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -68,7 +68,7 @@ static void libpq_queue_fetch_range(rewind_source *source, 
const char *path,
                                                                        off_t 
off, size_t len);
 static void libpq_finish_fetch(rewind_source *source);
 static char *libpq_fetch_file(rewind_source *source, const char *path,
-                                                         size_t *filesize);
+                                                         size_t *filesize, 
bool noerror);
 static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source);
 static void libpq_destroy(rewind_source *source);
 
@@ -620,9 +620,12 @@ appendArrayEscapedString(StringInfo buf, const char *str)
 
 /*
  * Fetch a single file as a malloc'd buffer.
+ *
+ * If noerror is true, returns NULL when the file does not exist on the source.
  */
 static char *
-libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+                                bool noerror)
 {
        PGconn     *conn = ((libpq_source *) source)->conn;
        PGresult   *res;
@@ -631,6 +634,34 @@ libpq_fetch_file(rewind_source *source, const char *path, 
size_t *filesize)
        const char *paramValues[1];
 
        paramValues[0] = path;
+
+       /*
+        * Check the existence of the file first.  We do this separately from
+        * pg_read_binary_file() so that the server doesn't emit an error.
+        */
+       if (noerror)
+       {
+               res = PQexecParams(conn, "SELECT pg_stat_file($1, true)",
+                                                  1, NULL, paramValues, NULL, 
NULL, 1);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                       pg_fatal("could not stat remote file \"%s\": %s",
+                                        path, PQresultErrorMessage(res));
+               }
+
+               /* sanity check the result set */
+               if (PQntuples(res) != 1)
+                       pg_fatal("unexpected result set while stating remote 
file \"%s\"",
+                                        path);
+
+               /* Return NULL if the file was not found */
+               if (PQgetisnull(res, 0, 0))
+                       return NULL;
+
+               PQclear(res);
+       }
+
        res = PQexecParams(conn, "SELECT pg_read_binary_file($1)",
                                           1, NULL, paramValues, NULL, NULL, 1);
 
diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c
index 2e50485c39..fc2e1e9f11 100644
--- a/src/bin/pg_rewind/local_source.c
+++ b/src/bin/pg_rewind/local_source.c
@@ -28,7 +28,7 @@ typedef struct
 static void local_traverse_files(rewind_source *source,
                                                                 
process_file_callback_t callback);
 static char *local_fetch_file(rewind_source *source, const char *path,
-                                                         size_t *filesize);
+                                                         size_t *filesize, 
bool noerror);
 static void local_queue_fetch_file(rewind_source *source, const char *path,
                                                                   size_t len);
 static void local_queue_fetch_range(rewind_source *source, const char *path,
@@ -63,9 +63,11 @@ local_traverse_files(rewind_source *source, 
process_file_callback_t callback)
 }
 
 static char *
-local_fetch_file(rewind_source *source, const char *path, size_t *filesize)
+local_fetch_file(rewind_source *source, const char *path, size_t *filesize,
+       bool noerror)
 {
-       return slurpFile(((local_source *) source)->datadir, path, filesize);
+       return slurpFile(((local_source *) source)->datadir, path, filesize,
+                                        noerror);
 }
 
 /*
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 1ff8da1676..f9c7853f08 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -43,6 +43,8 @@ static void createBackupLabel(XLogRecPtr startpoint, 
TimeLineID starttli,
 
 static void digestControlFile(ControlFileData *ControlFile,
                                                          const char *content, 
size_t size);
+static TimeLineHistoryEntry *getTimelineHistory(ControlFileData *controlFile,
+                                                                               
                int *nentries);
 static void getRestoreCommand(const char *argv0);
 static void sanityChecks(void);
 static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
@@ -141,6 +143,7 @@ main(int argc, char **argv)
        bool            rewind_needed;
        bool            writerecoveryconf = false;
        filemap_t  *filemap;
+       TimeLineID      source_tli;
 
        pg_logging_init(argv[0]);
        set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
@@ -311,7 +314,7 @@ main(int argc, char **argv)
         * need to make sure by themselves that the target cluster is in a clean
         * state.
         */
-       buffer = slurpFile(datadir_target, "global/pg_control", &size);
+       buffer = slurpFile(datadir_target, "global/pg_control", &size, false);
        digestControlFile(&ControlFile_target, buffer, size);
        pg_free(buffer);
 
@@ -321,25 +324,47 @@ main(int argc, char **argv)
        {
                ensureCleanShutdown(argv[0]);
 
-               buffer = slurpFile(datadir_target, "global/pg_control", &size);
+               buffer = slurpFile(datadir_target, "global/pg_control", &size, 
false);
                digestControlFile(&ControlFile_target, buffer, size);
                pg_free(buffer);
        }
 
-       buffer = source->fetch_file(source, "global/pg_control", &size);
+       buffer = source->fetch_file(source, "global/pg_control", &size, false);
        digestControlFile(&ControlFile_source, buffer, size);
        pg_free(buffer);
 
        sanityChecks();
 
+       /*
+        * There may be a case where the source has been promoted but the
+        * end-of-recovery checkpoint has not completed. In this case the source
+        * control file has slightly stale content for this purpose.  Look into
+        * the timeline history file, which is written immediately at promotion.
+        */
+       source_tli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+       if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli)
+       {
+               int nentries;
+               TimeLineHistoryEntry *hist;
+
+               hist = getTimelineHistory(&ControlFile_source, &nentries);
+
+               /* last line of history file is the newest timeline */
+               if (nentries > 0 && hist[nentries - 1].tli > source_tli)
+               {
+                       pg_log_info("source's actual timeline ID (%d) is newer 
than control file (%d)", hist[nentries - 1].tli, source_tli);
+                       source_tli = hist[nentries - 1].tli;
+               }
+               pg_free(hist);
+       }
+
        /*
         * Find the common ancestor timeline between the clusters.
         *
         * If both clusters are already on the same timeline, there's nothing to
         * do.
         */
-       if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
-               ControlFile_source.checkPointCopy.ThisTimeLineID)
+       if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli)
        {
                pg_log_info("source and target cluster are on the same 
timeline");
                rewind_needed = false;
@@ -581,7 +606,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
         * Fetch the control file from the source last. This ensures that the
         * minRecoveryPoint is up-to-date.
         */
-       buffer = source->fetch_file(source, "global/pg_control", &size);
+       buffer = source->fetch_file(source, "global/pg_control", &size, false);
        digestControlFile(&ControlFile_source_after, buffer, size);
        pg_free(buffer);
 
@@ -630,6 +655,10 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
         */
        if (connstr_source)
        {
+               int nentries;
+               TimeLineHistoryEntry *hist;
+               int i;
+
                /*
                 * The source is a live server. Like in an online backup, it's
                 * important that we recover all the WAL that was generated 
while we
@@ -655,6 +684,29 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 
                        endrec = source->get_current_wal_insert_lsn(source);
                        endtli = 
ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+
+                       /*
+                        * Find the timeline ID corresponding to endrec on the 
source.
+                        *
+                        * In most cases we can rely on control file, but that 
is not the
+                        * case after promotion until end-of-recovery 
checkpoint completes.
+                        * Identify the timeline ID the hard way since we don't 
have an
+                        * easier way to detect that case.  In case where we 
failed to do
+                        * that, fall back to the control file's value.
+                        */
+                       hist = getTimelineHistory(&ControlFile_source, 
&nentries);
+                       if (hist[nentries - 1].tli != endtli)
+                       {
+                               for (i = 0; i < nentries; i++)
+                               {
+                                       if ((hist[i].begin == 0 || 
hist[i].begin <= endrec) &&
+                                               (hist[i].end == 0   || endrec < 
hist[i].end))
+                                       {
+                                               endtli = hist[i].tli;
+                                               break;
+                                       }
+                               }
+                       }
                }
        }
        else
@@ -804,9 +856,32 @@ getTimelineHistory(ControlFileData *controlFile, int 
*nentries)
 {
        TimeLineHistoryEntry *history;
        TimeLineID      tli;
+       TimeLineID      probe_tli;
 
        tli = controlFile->checkPointCopy.ThisTimeLineID;
 
+       Assert(tli > 0);
+       for (probe_tli = tli + 1 ;; probe_tli++)
+       {
+               char            path[MAXPGPATH];
+               char       *histfile;
+
+               TLHistoryFilePath(path, probe_tli);
+
+               /* Get history file from appropriate source */
+               if (controlFile == &ControlFile_source)
+                       histfile = source->fetch_file(source, path, NULL, true);
+               else if (controlFile == &ControlFile_target)
+                       histfile = slurpFile(datadir_target, path, NULL, true);
+
+               if (!histfile)
+                       break;
+
+               pg_free(histfile);
+       }
+
+       tli = probe_tli - 1;
+
        /*
         * Timeline 1 does not have a history file, so there is no need to check
         * and fake an entry with infinite start and end positions.
@@ -827,9 +902,9 @@ getTimelineHistory(ControlFileData *controlFile, int 
*nentries)
 
                /* Get history file from appropriate source */
                if (controlFile == &ControlFile_source)
-                       histfile = source->fetch_file(source, path, NULL);
+                       histfile = source->fetch_file(source, path, NULL, 
false);
                else if (controlFile == &ControlFile_target)
-                       histfile = slurpFile(datadir_target, path, NULL);
+                       histfile = slurpFile(datadir_target, path, NULL, false);
                else
                        pg_fatal("invalid control file");
 
diff --git a/src/bin/pg_rewind/rewind_source.h 
b/src/bin/pg_rewind/rewind_source.h
index 1310e86e75..6975848668 100644
--- a/src/bin/pg_rewind/rewind_source.h
+++ b/src/bin/pg_rewind/rewind_source.h
@@ -35,7 +35,7 @@ typedef struct rewind_source
         * handy for text files.
         */
        char       *(*fetch_file) (struct rewind_source *, const char *path,
-                                                          size_t *filesize);
+                                                          size_t *filesize, 
bool noerror);
 
        /*
         * Request to fetch (part of) a file in the source system, specified by 
an

Reply via email to