The WALRestore process asynchronously executes restore_command while
recovery continues working.
It overlaps downloading of the next WAL file with replay of the current
one, to reduce time delays in file-based archive recovery.
It handles both the file-only and the streaming-plus-file cases correctly.
--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ce659ec..e8b0b69 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -40,6 +40,7 @@
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
@@ -187,7 +188,6 @@ static bool InArchiveRecovery = false;
static bool restoredFromArchive = false;
/* options taken from recovery.conf for archive recovery */
-static char *recoveryRestoreCommand = NULL;
static char *recoveryEndCommand = NULL;
static char *archiveCleanupCommand = NULL;
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
@@ -575,8 +575,8 @@ bool reachedConsistency = false;
static bool InRedo = false;
-/* Have we launched bgwriter during recovery? */
-static bool bgwriterLaunched = false;
+/* Have we launched background procs during archive recovery yet? */
+static bool ArchRecoveryBgProcsActive = false;
/*
* Information logged when we detect a change in one of the parameters
@@ -632,8 +632,6 @@ static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
bool randAccess);
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
-static bool RestoreArchivedFile(char *path, const char *xlogfname,
- const char *recovername, off_t expectedSize);
static void ExecuteRecoveryCommand(char *command, char *commandName,
bool failOnerror);
static void PreallocXlogFiles(XLogRecPtr endptr);
@@ -2706,19 +2704,47 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
XLogFileName(xlogfname, tli, log, seg);
+#define TMPRECOVERYXLOG "RECOVERYXLOG"
+
switch (source)
{
case XLOG_FROM_ARCHIVE:
+ /*
+ * Check to see if the WALRestore process has already put the
+ * next file in place while we were working. If so, use that.
+ * If not, get it ourselves. This makes it easier to handle
+ * initial state before the WALRestore is active, and also
+ * handles the stop/start logic correctly when we have both
+ * streaming and file based replication active.
+ *
+ * We queue up the next task for WALRestore after we've begun to
+ * use this file later in XLogFileRead().
+ *
+ * If the WALRestore process is still active, the lock wait makes
+ * us wait, which is just like we were executing the command
+ * ourselves and so doesn't alter the logic elsewhere.
+ */
+ if (XLogFileIsNowFullyRestored(tli, log, seg))
+ {
+ snprintf(path, MAXPGPATH, XLOGDIR "/%s", TMPRECOVERYXLOG);
+ restoredFromArchive = true;
+ break;
+ }
+
/* Report recovery progress in PS display */
snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
xlogfname);
set_ps_display(activitymsg, false);
restoredFromArchive = RestoreArchivedFile(path, xlogfname,
- "RECOVERYXLOG",
+ TMPRECOVERYXLOG,
XLogSegSize);
+
if (!restoredFromArchive)
+ {
+ LWLockRelease(WALRestoreCommandLock);
return -1;
+ }
break;
case XLOG_FROM_PG_XLOG:
@@ -2748,18 +2774,42 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
if (stat(xlogfpath, &statbuf) == 0)
{
if (unlink(xlogfpath) != 0)
+ {
+ LWLockRelease(WALRestoreCommandLock);
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not remove file \"%s\": %m",
xlogfpath)));
+ }
reload = true;
}
if (rename(path, xlogfpath) < 0)
+ {
+ LWLockRelease(WALRestoreCommandLock);
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
path, xlogfpath)));
+ }
+
+ /*
+ * Make sure we recover from the new filename, so we can reuse the
+ * temporary filename for asynchronous restore actions.
+ */
+ strcpy(path, xlogfpath);
+
+ /*
+ * Tell the WALRestore process to get the next file now.
+ * Hopefully it will be ready for use in time for the next call the
+ * Startup process makes to XLogFileRead().
+ *
+ * It might seem like we should do that earlier but then there is a
+ * race condition that might lead to replacing RECOVERYXLOG with
+ * another file before we've copied it.
+ */
+ SetNextWALRestoreLogSeg(tli, log, seg);
+ LWLockRelease(WALRestoreCommandLock);
/*
* If the existing segment was replaced, since walsenders might have
@@ -2911,8 +2961,11 @@ XLogFileClose(void)
* For fixed-size files, the caller may pass the expected size as an
* additional crosscheck on successful recovery. If the file size is not
* known, set expectedSize = 0.
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
*/
-static bool
+bool
RestoreArchivedFile(char *path, const char *xlogfname,
const char *recovername, off_t expectedSize)
{
@@ -2929,7 +2982,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
uint32 restartSeg;
/* In standby mode, restore_command might not be supplied */
- if (recoveryRestoreCommand == NULL)
+ if (GetRecoveryRestoreCommand() == NULL)
goto not_available;
/*
@@ -2963,18 +3016,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
if (stat(xlogpath, &stat_buf) != 0)
{
if (errno != ENOENT)
+ {
+ LWLockRelease(WALRestoreCommandLock);
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
xlogpath)));
+ }
}
else
{
if (unlink(xlogpath) != 0)
+ {
+ LWLockRelease(WALRestoreCommandLock);
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not remove file \"%s\": %m",
xlogpath)));
+ }
}
/*
@@ -3013,7 +3072,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
endp = xlogRestoreCmd + MAXPGPATH - 1;
*endp = '\0';
- for (sp = recoveryRestoreCommand; *sp; sp++)
+ for (sp = GetRecoveryRestoreCommand(); *sp; sp++)
{
if (*sp == '%')
{
@@ -3059,21 +3118,29 @@ RestoreArchivedFile(char *path, const char *xlogfname,
}
*dp = '\0';
- ereport(DEBUG3,
+ ereport(DEBUG2,
(errmsg_internal("executing restore command \"%s\"",
xlogRestoreCmd)));
/*
- * Check signals before restore command and reset afterwards.
+ * Set in_restore_command to tell the signal handler that we should exit
+ * right away on SIGTERM. We know that we're at a safe point to do that.
+ * Check if we had already received the signal, so that we don't miss a
+ * shutdown request received just before this.
*/
- PreRestoreCommand();
+ in_restore_command = true;
+ if (startup_shutdown_requested || walrestore_shutdown_requested)
+ {
+ LWLockRelease(WALRestoreCommandLock);
+ proc_exit(1);
+ }
/*
* Copy xlog from archival storage to XLOGDIR
*/
rc = system(xlogRestoreCmd);
- PostRestoreCommand();
+ in_restore_command = false;
if (rc == 0)
{
@@ -3102,7 +3169,10 @@ RestoreArchivedFile(char *path, const char *xlogfname,
if (StandbyMode && stat_buf.st_size < expectedSize)
elevel = DEBUG1;
else
+ {
+ LWLockRelease(WALRestoreCommandLock);
elevel = FATAL;
+ }
ereport(elevel,
(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
xlogfname,
@@ -3123,10 +3193,13 @@ RestoreArchivedFile(char *path, const char *xlogfname,
{
/* stat failed */
if (errno != ENOENT)
+ {
+ LWLockRelease(WALRestoreCommandLock);
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
xlogpath)));
+ }
}
}
@@ -3158,10 +3231,18 @@ RestoreArchivedFile(char *path, const char *xlogfname,
* too.
*/
if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
+ {
+ LWLockRelease(WALRestoreCommandLock);
proc_exit(1);
+ }
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
+ /*
+ * If signaled we will immediately issue a FATAL error so drop the lock
+ */
+ if (signaled)
+ LWLockRelease(WALRestoreCommandLock);
ereport(signaled ? FATAL : DEBUG2,
(errmsg("could not restore file \"%s\" from archive: return code %d",
xlogfname, rc)));
@@ -4203,7 +4284,9 @@ readTimeLineHistory(TimeLineID targetTLI)
if (InArchiveRecovery)
{
TLHistoryFileName(histfname, targetTLI);
+ LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+ LWLockRelease(WALRestoreCommandLock);
}
else
TLHistoryFilePath(path, targetTLI);
@@ -4292,7 +4375,9 @@ existsTimeLineHistory(TimeLineID probeTLI)
if (InArchiveRecovery)
{
TLHistoryFileName(histfname, probeTLI);
+ LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+ LWLockRelease(WALRestoreCommandLock);
}
else
TLHistoryFilePath(path, probeTLI);
@@ -4453,7 +4538,9 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
if (InArchiveRecovery)
{
TLHistoryFileName(histfname, parentTLI);
+ LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+ LWLockRelease(WALRestoreCommandLock);
}
else
TLHistoryFilePath(path, parentTLI);
@@ -5299,10 +5386,10 @@ readRecoveryCommandFile(void)
{
if (strcmp(item->name, "restore_command") == 0)
{
- recoveryRestoreCommand = pstrdup(item->value);
+ SetRecoveryRestoreCommand(pstrdup(item->value));
ereport(DEBUG2,
(errmsg_internal("restore_command = '%s'",
- recoveryRestoreCommand)));
+ GetRecoveryRestoreCommand())));
}
else if (strcmp(item->name, "recovery_end_command") == 0)
{
@@ -5455,7 +5542,7 @@ readRecoveryCommandFile(void)
*/
if (StandbyMode)
{
- if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
+ if (PrimaryConnInfo == NULL && GetRecoveryRestoreCommand() == NULL)
ereport(WARNING,
(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
RECOVERY_COMMAND_FILE),
@@ -5463,7 +5550,7 @@ readRecoveryCommandFile(void)
}
else
{
- if (recoveryRestoreCommand == NULL)
+ if (GetRecoveryRestoreCommand() == NULL)
ereport(FATAL,
(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
RECOVERY_COMMAND_FILE)));
@@ -6432,7 +6519,7 @@ StartupXLOG(void)
PublishStartupProcessInformation();
SetForwardFsyncRequests();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
- bgwriterLaunched = true;
+ ArchRecoveryBgProcsActive = true;
}
/*
@@ -6795,7 +6882,7 @@ StartupXLOG(void)
* the rule that TLI only changes in shutdown checkpoints, which
* allows some extra error checking in xlog_redo.
*/
- if (bgwriterLaunched)
+ if (ArchRecoveryBgProcsActive)
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_IMMEDIATE |
CHECKPOINT_WAIT);
@@ -9640,7 +9727,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
* Request a restartpoint if we've replayed too much
* xlog since the last one.
*/
- if (StandbyMode && bgwriterLaunched)
+ if (StandbyMode && ArchRecoveryBgProcsActive)
{
if (XLogCheckpointNeeded(readId, readSeg))
{
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index e3ae92d..81a8cb3 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -30,6 +30,7 @@
#include "nodes/makefuncs.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
#include "postmaster/walwriter.h"
#include "replication/walreceiver.h"
#include "storage/bufmgr.h"
@@ -319,6 +320,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
case CheckpointerProcess:
statmsg = "checkpointer process";
break;
+ case WalRestoreProcess:
+ statmsg = "wal restore process";
+ break;
case WalWriterProcess:
statmsg = "wal writer process";
break;
@@ -424,6 +428,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
CheckpointerMain();
proc_exit(1); /* should never return */
+ case WalRestoreProcess:
+ /* don't set signals, wal restore has its own agenda */
+ WalRestoreMain();
+ proc_exit(1); /* should never return */
+
case WalWriterProcess:
/* don't set signals, walwriter has its own agenda */
InitXLOGAccess();
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 3056b09..349e722 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
- startup.o syslogger.o walwriter.o checkpointer.o
+ startup.o syslogger.o walrestore.o walwriter.o checkpointer.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index ad0c17a..15684c0 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -210,6 +210,7 @@ static pid_t StartupPID = 0,
BgWriterPID = 0,
CheckpointerPID = 0,
WalWriterPID = 0,
+ WalRestorePID = 0,
WalReceiverPID = 0,
AutoVacPID = 0,
PgArchPID = 0,
@@ -470,6 +471,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
#define StartCheckpointer() StartChildProcess(CheckpointerProcess)
#define StartWalWriter() StartChildProcess(WalWriterProcess)
#define StartWalReceiver() StartChildProcess(WalReceiverProcess)
+#define StartWalRestore() StartChildProcess(WalRestoreProcess)
/* Macros to check exit status of a child process */
#define EXIT_STATUS_0(st) ((st) == 0)
@@ -2060,6 +2062,8 @@ SIGHUP_handler(SIGNAL_ARGS)
signal_child(WalWriterPID, SIGHUP);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGHUP);
+ if (WalRestorePID != 0)
+ signal_child(WalRestorePID, SIGHUP);
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGHUP);
if (PgArchPID != 0)
@@ -2170,6 +2174,8 @@ pmdie(SIGNAL_ARGS)
signal_child(StartupPID, SIGTERM);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGTERM);
+ if (WalRestorePID != 0)
+ signal_child(WalRestorePID, SIGTERM);
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGTERM);
if (pmState == PM_RECOVERY)
@@ -2225,6 +2231,8 @@ pmdie(SIGNAL_ARGS)
signal_child(WalWriterPID, SIGQUIT);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGQUIT);
+ if (WalRestorePID != 0)
+ signal_child(WalRestorePID, SIGQUIT);
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGQUIT);
if (PgArchPID != 0)
@@ -2331,6 +2339,12 @@ reaper(SIGNAL_ARGS)
pmState = PM_RUN;
/*
+ * Shutdown the WALRestore process
+ */
+ if (WalRestorePID != 0)
+ signal_child(WalRestorePID, SIGTERM);
+
+ /*
* Kill any walsenders to force the downstream standby(s) to
* reread the timeline history file, adjust their timelines and
* establish replication connections again. This is required
@@ -2477,6 +2491,30 @@ reaper(SIGNAL_ARGS)
}
/*
+ * Was it the wal restore? Once we have reached PM_RUN (or a later
+ * state) its exit is expected, because we send it SIGTERM when startup
+ * finishes, so we just forget its PID.
+ */
+ if (pid == WalRestorePID)
+ {
+ if (pmState >= PM_RUN)
+ {
+ WalRestorePID = 0;
+ continue;
+ }
+
+ /*
+ * Any unexpected exit (including FATAL exit) of the WALRestore
+ * process is treated as a crash, except that we don't want to
+ * reinitialize because availability is important.
+ */
+ RecoveryError = true;
+ HandleChildCrash(pid, exitstatus,
+ _("walrestore process"));
+ continue;
+ }
+
+ /*
* Was it the autovacuum launcher? Normal exit can be ignored; we'll
* start a new one at the next iteration of the postmaster's main
* loop, if necessary. Any other exit condition is treated as a
@@ -2756,6 +2794,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT));
}
+ /* Take care of the walrestore too */
+ if (pid == WalRestorePID)
+ WalRestorePID = 0;
+ else if (WalRestorePID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) WalRestorePID)));
+ signal_child(WalRestorePID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
/* Take care of the autovacuum launcher too */
if (pid == AutoVacPID)
AutoVacPID = 0;
@@ -2916,6 +2966,8 @@ PostmasterStateMachine(void)
signal_child(StartupPID, SIGTERM);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGTERM);
+ if (WalRestorePID != 0)
+ signal_child(WalRestorePID, SIGTERM);
pmState = PM_WAIT_BACKENDS;
}
}
@@ -2940,6 +2992,7 @@ PostmasterStateMachine(void)
if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
StartupPID == 0 &&
WalReceiverPID == 0 &&
+ WalRestorePID == 0 &&
BgWriterPID == 0 &&
(CheckpointerPID == 0 || !FatalError) &&
WalWriterPID == 0 &&
@@ -3005,11 +3058,11 @@ PostmasterStateMachine(void)
* left by now anyway; what we're really waiting for is walsenders and
* archiver.
*
- * Walreceiver should normally be dead by now, but not when a fast
- * shutdown is performed during recovery.
+ * Walreceiver and Walrestore should normally be dead by now, but not
+ * when a fast shutdown is performed during recovery.
*/
if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0 &&
- WalReceiverPID == 0)
+ WalReceiverPID == 0 && WalRestorePID == 0)
{
pmState = PM_WAIT_DEAD_END;
}
@@ -3036,6 +3089,7 @@ PostmasterStateMachine(void)
/* These other guys should be dead already */
Assert(StartupPID == 0);
Assert(WalReceiverPID == 0);
+ Assert(WalRestorePID == 0);
Assert(BgWriterPID == 0);
Assert(CheckpointerPID == 0);
Assert(WalWriterPID == 0);
@@ -4219,6 +4273,8 @@ sigusr1_handler(SIGNAL_ARGS)
BgWriterPID = StartBackgroundWriter();
Assert(CheckpointerPID == 0);
CheckpointerPID = StartCheckpointer();
+ Assert(WalRestorePID == 0);
+ WalRestorePID = StartWalRestore();
pmState = PM_RECOVERY;
}
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index ed75d09..1791feb 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -35,14 +35,14 @@
* Flags set by interrupt handlers for later service in the redo loop.
*/
static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t shutdown_requested = false;
static volatile sig_atomic_t promote_triggered = false;
+volatile sig_atomic_t startup_shutdown_requested = false;
/*
* Flag set when executing a restore command, to tell SIGTERM signal handler
* that it's safe to just proc_exit.
*/
-static volatile sig_atomic_t in_restore_command = false;
+volatile sig_atomic_t in_restore_command = false;
/* Signal handlers */
static void startupproc_quickdie(SIGNAL_ARGS);
@@ -131,9 +131,16 @@ StartupProcShutdownHandler(SIGNAL_ARGS)
int save_errno = errno;
if (in_restore_command)
+ {
+ /*
+ * See RestoreArchivedFile() for explanation of why this
+ * lock is always held when in_restore_command is true.
+ */
+ LWLockRelease(WALRestoreCommandLock);
proc_exit(1);
+ }
else
- shutdown_requested = true;
+ startup_shutdown_requested = true;
WakeupRecovery();
errno = save_errno;
@@ -155,7 +162,7 @@ HandleStartupProcInterrupts(void)
/*
* Check if we were requested to exit without finishing recovery.
*/
- if (shutdown_requested)
+ if (startup_shutdown_requested)
proc_exit(1);
/*
@@ -226,26 +233,6 @@ StartupProcessMain(void)
proc_exit(0);
}
-void
-PreRestoreCommand(void)
-{
- /*
- * Set in_restore_command to tell the signal handler that we should exit
- * right away on SIGTERM. We know that we're at a safe point to do that.
- * Check if we had already received the signal, so that we don't miss a
- * shutdown request received just before this.
- */
- in_restore_command = true;
- if (shutdown_requested)
- proc_exit(1);
-}
-
-void
-PostRestoreCommand(void)
-{
- in_restore_command = false;
-}
-
bool
IsPromoteTriggered(void)
{
diff --git a/src/backend/postmaster/walrestore.c b/src/backend/postmaster/walrestore.c
new file mode 100644
index 0000000..7634d36
--- /dev/null
+++ b/src/backend/postmaster/walrestore.c
@@ -0,0 +1,474 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.c
+ *
+ * The WAL restore process is new as of Postgres 9.2, though the work it performs
+ * has been handled by the startup process from Postgres 8.0 until 9.1.
+ *
+ * WALRestore process executes the restore_command for the next WAL file.
+ * The startup process now runs the restore_command itself only when the
+ * file it needs has not already been restored here, and for history files.
+ *
+ * The WAL restore process is started by the postmaster when we enter
+ * PM_RECOVERY state and is told to exit as soon as startup finishes.
+ * Until then it remains alive, waiting for the postmaster to terminate it.
+ * Normal termination is by SIGTERM, which instructs restore process to exit(0).
+ * Like any backend, restore process will simply abort and exit on SIGQUIT.
+ *
+ * Note that the WAL restore process only executes the restore_command.
+ * The archive_cleanup_command is executed by the checkpointer, while the
+ * recovery_end_command and requests for history files are executed by the
+ * startup process. That is not important to the way those commands execute.
+ * All processes that use the restore_command must hold WALRestoreCommandLock
+ * before they execute it, since we definitely wish to avoid trying to get the
+ * same file more than once concurrently, plus we can't assume that the
+ * user has specified command that would succeed if run concurrently.
+ *
+ * If the WAL restore exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/postmaster/walrestore.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "access/xlog_internal.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/guc.h"
+#include "utils/ps_status.h"
+#include "utils/timestamp.h"
+
+/* elog level for this file's trace messages. XXX Set to DEBUG4 prior to patch commit */
+#define WALRSTR_DEBUG_LEVEL LOG
+
+/*
+ * GUC parameters
+ */
+int WalRestoreDelay = 10000; /* WaitLatch timeout used by the main loop, in ms */
+
+WalRestoreData *WalRstr = NULL; /* shared-memory state, set by WalRestoreShmemInit() */
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+volatile sig_atomic_t walrestore_shutdown_requested = false; /* also tested in RestoreArchivedFile() */
+
+/* Prototypes for private functions */
+
+static bool WalRestoreNextFile(void);
+
+/* Signal handlers */
+
+static void walrestore_quickdie(SIGNAL_ARGS);
+static void WalRestoreProcSigUsr1Handler(SIGNAL_ARGS);
+static void WalRestoreSigHupHandler(SIGNAL_ARGS);
+static void WalRestoreShutdownHandler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for walrestore process
+ *
+ * This is invoked from AuxiliaryProcessMain, which has already created the basic
+ * execution environment, but not enabled signals yet.
+ */
+void
+WalRestoreMain(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile WalRestoreData *walrstr = WalRstr;
+
+ /*
+ * WalRstr should be set up already (if we are a backend, we inherit this
+ * by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ Assert(walrstr != NULL);
+
+ InitLatch(&walrstr->WALRestoreLatch); /* initialize latch used in main loop */
+
+ /*
+ * If possible, make this process a group leader, so that the postmaster
+ * can signal any child processes too.
+ */
+#ifdef HAVE_SETSID
+ if (setsid() < 0)
+ elog(FATAL, "setsid() failed: %m");
+#endif
+
+ /*
+ * Properly accept or ignore signals the postmaster might send us
+ *
+ * SIGUSR1 is passed on to latch_sigusr1_handler() by
+ * WalRestoreProcSigUsr1Handler; it is not used for anything else here.
+ */
+ pqsignal(SIGHUP, WalRestoreSigHupHandler); /* set flag to read config file */
+ pqsignal(SIGINT, SIG_IGN);
+ pqsignal(SIGTERM, WalRestoreShutdownHandler); /* shutdown */
+ pqsignal(SIGQUIT, walrestore_quickdie); /* hard crash time */
+ pqsignal(SIGALRM, SIG_IGN);
+ pqsignal(SIGPIPE, SIG_IGN);
+ pqsignal(SIGUSR1, WalRestoreProcSigUsr1Handler); /* latch signalling */
+ pqsignal(SIGUSR2, SIG_IGN);
+
+ /*
+ * Reset some signals that are accepted by postmaster but not here
+ */
+ pqsignal(SIGCHLD, SIG_DFL);
+ pqsignal(SIGTTIN, SIG_DFL);
+ pqsignal(SIGTTOU, SIG_DFL);
+ pqsignal(SIGCONT, SIG_DFL);
+ pqsignal(SIGWINCH, SIG_DFL);
+
+ /* We allow SIGQUIT (quickdie) at all times */
+ sigdelset(&BlockSig, SIGQUIT);
+
+ /*
+ * Unblock signals (they were blocked when the postmaster forked us)
+ */
+ PG_SETMASK(&UnBlockSig);
+
+ /*
+ * Loop forever
+ */
+ for (;;)
+ {
+ ResetLatch(&walrstr->WALRestoreLatch);
+
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (!PostmasterIsAlive())
+ exit(1);
+
+ if (got_SIGHUP)
+ {
+ got_SIGHUP = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ if (walrestore_shutdown_requested)
+ {
+ /*
+ * From here on, elog(ERROR) should end with exit(1). (This
+ * function sets up no sigsetjmp block of its own.)
+ */
+ ExitOnAnyError = true;
+ /* Normal exit from the walrestore process is here */
+ proc_exit(0); /* done */
+ }
+
+ /*
+ * Try to restore the requested file. If nothing was restored, sleep until
+ * the latch is set, the postmaster dies, or WalRestoreDelay ms elapse.
+ */
+ if (!WalRestoreNextFile())
+ {
+ (void) WaitLatch(&walrstr->WALRestoreLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ WalRestoreDelay /* ms */);
+ }
+ }
+}
+
+/*
+ * SetNextWALRestoreLogSeg - set the target for next WALRestore cycle
+ *
+ * Only called by Startup process. The request is for the segment that
+ * follows (log, seg) on timeline tli; the WALRestore latch is then set.
+ *
+ * Must be called with WALRestoreCommandLock held; it is still held on return.
+ */
+void
+SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg)
+{
+ char xlogfname[MAXFNAMELEN];
+ uint32 newlog = log;
+ uint32 newseg = seg;
+
+ NextLogSeg(newlog, newseg); /* advance the local copies to the following segment */
+
+ XLogFileName(xlogfname, tli, newlog, newseg); /* name is only used for the log message */
+ elog(WALRSTR_DEBUG_LEVEL, "requesting restore of %s", xlogfname);
+
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile WalRestoreData *walrstr = WalRstr;
+
+ walrstr->nextFileTli = tli;
+ walrstr->nextFileLog = newlog;
+ walrstr->nextFileSeg = newseg;
+ }
+
+ SetLatch(&WalRstr->WALRestoreLatch); /* WalRestoreMain waits on this latch */
+}
+
+/*
+ * Run in Startup process to see if next file has arrived. We protect
+ * WalRstr with a LWLock so that the Startup process will wait until
+ * the restore_command succeeds or is cancelled. Returns true only if the
+ * file last restored by WALRestore is exactly (tli, log, seg).
+ * NOTE(review): this function itself sets no interrupt flags.
+ *
+ * WALRestoreCommandLock is not held on entry, but is held at exit (both results).
+ */
+bool
+XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile WalRestoreData *walrstr = WalRstr;
+ char xlogfname[MAXFNAMELEN];
+
+ /*
+ * Issue debug message before we wait for the lock, to allow
+ * log entries to show interleaving of Startup and WALRestore actions
+ */
+ XLogFileName(xlogfname, tli, log, seg);
+ elog(WALRSTR_DEBUG_LEVEL,
+ "startup process requests %s from archive", xlogfname);
+
+ LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+ XLogFileName(xlogfname,
+ walrstr->lastFileTli,
+ walrstr->lastFileLog,
+ walrstr->lastFileSeg);
+ elog(WALRSTR_DEBUG_LEVEL,
+ "startup process sees last file was %s", xlogfname);
+
+ if (tli == walrstr->lastFileTli &&
+ log == walrstr->lastFileLog &&
+ seg == walrstr->lastFileSeg)
+ return true;
+ /* no match: the caller must fetch the file itself, still holding the lock */
+ return false;
+}
+
+/*
+ * WalRestoreNextFile - returns true if next file was restored
+ *
+ * Broadly follows the logic in XLogFileRead() when called with source of
+ * XLOG_FROM_ARCHIVE, except we have to read the next file's identity from
+ * shmem. Takes WALRestoreCommandLock itself and releases it before returning.
+ */
+static bool
+WalRestoreNextFile(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile WalRestoreData *walrstr = WalRstr;
+ char xlogfname[MAXFNAMELEN];
+ char activitymsg[MAXFNAMELEN + 16];
+ char path[MAXPGPATH];
+ bool restoredFromArchive;
+ uint32 nextFileLog;
+ uint32 nextFileSeg;
+ TimeLineID nextFileTli;
+
+ elog(WALRSTR_DEBUG_LEVEL, "walrestore checking for next file to restore");
+
+ LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+ {
+ /* NOTE(review): redeclares walrstr, shadowing the identical outer pointer */
+ volatile WalRestoreData *walrstr = WalRstr;
+
+ nextFileTli = walrstr->nextFileTli;
+ nextFileLog = walrstr->nextFileLog;
+ nextFileSeg = walrstr->nextFileSeg;
+ }
+
+ /*
+ * If the requested file is the one we restored last, there is no new
+ * request: release the lock and exit quickly.
+ */
+ if (nextFileTli == walrstr->lastFileTli &&
+ nextFileLog == walrstr->lastFileLog &&
+ nextFileSeg == walrstr->lastFileSeg)
+ {
+ LWLockRelease(WALRestoreCommandLock);
+ XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+ elog(WALRSTR_DEBUG_LEVEL,
+ "restore of %s is already complete, so sleep", xlogfname);
+ return false;
+ }
+
+ XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+
+ /* Report recovery progress in PS display */
+ snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
+ xlogfname);
+ set_ps_display(activitymsg, false);
+
+ elog(WALRSTR_DEBUG_LEVEL, "walrestore will restore %s", xlogfname);
+
+ restoredFromArchive = RestoreArchivedFile(path, xlogfname,
+ "RECOVERYXLOG",
+ XLogSegSize);
+
+ if (restoredFromArchive)
+ {
+ /* Record what was restored; XLogFileIsNowFullyRestored() compares against it */
+ volatile WalRestoreData *walrstr = WalRstr;
+
+ walrstr->lastFileTli = nextFileTli;
+ walrstr->lastFileLog = nextFileLog;
+ walrstr->lastFileSeg = nextFileSeg;
+ walrstr->lastFileRestoreTime = GetCurrentTimestamp();
+ }
+
+ LWLockRelease(WALRestoreCommandLock);
+
+ set_ps_display("", false);
+
+ /*
+ * Make sure Startup process is active so it can see new file, or
+ * react to it not being there.
+ */
+ WakeupRecovery();
+
+ return restoredFromArchive;
+}
+
+void
+SetRecoveryRestoreCommand(char *cmd)
+{
+ /* Copy restore_command into shared memory; a NULL cmd changes nothing. */
+ if (cmd == NULL)
+ return;
+ /* strcpy() also writes the NUL, so strlen must be < MAXPGPATH (assumes a MAXPGPATH-byte buffer; confirm in walrestore.h) */
+ if (strlen(cmd) >= MAXPGPATH)
+ elog(FATAL, "restore_command is too long");
+ strcpy(WalRstr->recoveryRestoreCommand, cmd);
+}
+/* Return the restore_command stored in shared memory, or NULL if none has been set. */
+char *
+GetRecoveryRestoreCommand(void)
+{
+ /* The buffer starts out zero-filled, so "" means unset; callers test for NULL. */
+ return (WalRstr->recoveryRestoreCommand[0] != '\0') ? WalRstr->recoveryRestoreCommand : NULL;
+}
+
+/* Report shared memory space needed by WalRestoreShmemInit: one WalRestoreData struct */
+Size
+WalRestoreShmemSize(void)
+{
+ Size size = 0;
+
+ size = add_size(size, sizeof(WalRestoreData));
+
+ return size;
+}
+
+/* Allocate and initialize walrestore-related shared memory; sets the global WalRstr */
+void
+WalRestoreShmemInit(void)
+{
+ bool found;
+
+ WalRstr = (WalRestoreData *)
+ ShmemInitStruct("Wal Restore Ctl", WalRestoreShmemSize(), &found);
+ /* found is set: not the first time through, so leave the contents alone */
+ if (found)
+ return;
+
+ /* First time through, so initialize: zero everything, then set up the latch */
+ MemSet(WalRstr, 0, WalRestoreShmemSize());
+ InitSharedLatch(&WalRstr->WALRestoreLatch);
+
+}
+
+/* --------------------------------
+ * signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * walrestore_quickdie() occurs when signalled SIGQUIT by the postmaster
+ * (it is installed as the SIGQUIT handler in WalRestoreMain).
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+walrestore_quickdie(SIGNAL_ARGS)
+{
+ PG_SETMASK(&BlockSig);
+
+ /*
+ * We DO NOT want to run proc_exit() callbacks -- we're here because
+ * shared memory may be corrupted, so we don't want to try to clean up
+ * anything. Just nail the windows shut and get out of town. Now that
+ * there's an atexit callback to prevent third-party code from breaking
+ * things by calling exit() directly, we have to reset the callbacks
+ * explicitly to make this work as intended.
+ */
+ on_exit_reset();
+
+ /*
+ * Note we do exit(2) not exit(0). This is to force the postmaster into a
+ * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+ * backend. This is necessary precisely because we don't clean up our
+ * shared memory state. (The "dead man switch" mechanism in pmsignal.c
+ * should ensure the postmaster sees this as a crash, too, but no harm in
+ * being doubly sure.)
+ */
+ exit(2);
+}
+
+/* SIGUSR1: let latch facility handle the signal; errno is saved and restored around it */
+static void
+WalRestoreProcSigUsr1Handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ latch_sigusr1_handler();
+
+ errno = save_errno;
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time, and set the latch */
+static void
+WalRestoreSigHupHandler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ got_SIGHUP = true; /* WalRestoreMain's loop checks this and calls ProcessConfigFile */
+ SetLatch(&WalRstr->WALRestoreLatch);
+
+ errno = save_errno;
+}
+
+/* SIGTERM: exit at once if inside restore_command, else flag a shutdown for the main loop */
+static void
+WalRestoreShutdownHandler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ if (in_restore_command)
+ {
+ /* NOTE(review): LWLockRelease/proc_exit run in signal-handler context here; confirm that is safe */
+ LWLockRelease(WALRestoreCommandLock);
+ proc_exit(1);
+ }
+ else
+ walrestore_shutdown_requested = true;
+ SetLatch(&WalRstr->WALRestoreLatch); /* let the main loop notice the flag */
+
+ errno = save_errno;
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index ef1dc91..8f4443a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -26,6 +26,7 @@
#include "postmaster/autovacuum.h"
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
+#include "postmaster/walrestore.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
@@ -123,6 +124,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
size = add_size(size, AutoVacuumShmemSize());
size = add_size(size, WalSndShmemSize());
size = add_size(size, WalRcvShmemSize());
+ size = add_size(size, WalRestoreShmemSize());
size = add_size(size, BTreeShmemSize());
size = add_size(size, SyncScanShmemSize());
size = add_size(size, AsyncShmemSize());
@@ -228,6 +230,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
AutoVacuumShmemInit();
WalSndShmemInit();
WalRcvShmemInit();
+ WalRestoreShmemInit();
/*
* Set up other modules that need some shared memory space
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1ddf4bf..e9e5325 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -270,7 +270,8 @@ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
extern int XLogFileInit(uint32 log, uint32 seg,
bool *use_existent, bool use_lock);
extern int XLogFileOpen(uint32 log, uint32 seg);
-
+extern bool RestoreArchivedFile(char *path, const char *xlogfname,
+ const char *recovername, off_t expectedSize);
extern void XLogGetLastRemoved(uint32 *log, uint32 *seg);
extern void XLogSetAsyncXactLSN(XLogRecPtr record);
@@ -316,6 +317,7 @@ extern TimeLineID GetRecoveryTargetTLI(void);
extern bool CheckPromoteSignal(void);
extern void WakeupRecovery(void);
extern Latch *WALWriterLatch(void);
+extern Latch *WALRestoreLatch(void);
/*
* Starting/stopping a base backup
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index e966a73..b90ce33 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -23,6 +23,7 @@ typedef enum
StartupProcess,
BgWriterProcess,
CheckpointerProcess,
+ WalRestoreProcess,
WalWriterProcess,
WalReceiverProcess,
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index 3ec6950..35d9665 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -12,10 +12,11 @@
#ifndef _STARTUP_H
#define _STARTUP_H
+extern volatile sig_atomic_t startup_shutdown_requested;
+extern volatile sig_atomic_t in_restore_command;
+
extern void HandleStartupProcInterrupts(void);
extern void StartupProcessMain(void);
-extern void PreRestoreCommand(void);
-extern void PostRestoreCommand(void);
extern bool IsPromoteTriggered(void);
extern void ResetPromoteTriggered(void);
diff --git a/src/include/postmaster/walrestore.h b/src/include/postmaster/walrestore.h
new file mode 100644
index 0000000..98d7830
--- /dev/null
+++ b/src/include/postmaster/walrestore.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.h
+ * Exports from postmaster/walrestore.c.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/walrestore.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _WALRESTORE_H
+#define _WALRESTORE_H
+
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "storage/spin.h"
+#include "pgtime.h"
+
+extern volatile sig_atomic_t walrestore_shutdown_requested;
+
+/* Exported functions */
+
+extern void WalRestoreMain(void);
+extern bool XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetRecoveryRestoreCommand(char *cmd);
+extern char *GetRecoveryRestoreCommand(void);
+extern Size WalRestoreShmemSize(void);
+extern void WalRestoreShmemInit(void);
+
+/* Shared memory area used to manage the walrestore process */
+typedef struct
+{
+ /*
+ * Timeline, log and segment of the last WAL file restored by walrestore.
+ */
+ TimeLineID lastFileTli;
+ uint32 lastFileLog;
+ uint32 lastFileSeg;
+
+ /*
+ * Time at which walrestore last restored a file.
+ */
+ TimestampTz lastFileRestoreTime;
+
+ /*
+ * The next WAL file that walrestore has been requested to restore.
+ */
+ TimeLineID nextFileTli;
+ uint32 nextFileLog;
+ uint32 nextFileSeg;
+
+ /*
+ * All fields above are read and set only while holding WALRestoreCommandLock.
+ */
+
+ /*
+ * Latch used to wake up the walrestore process so it restores WAL files.
+ */
+ Latch WALRestoreLatch;
+
+ /*
+ * recoveryRestoreCommand string for use by walrestore; can be removed if it
+ * becomes a GUC. Set once at startup and read-only after that.
+ */
+ char recoveryRestoreCommand[MAXPGPATH];
+} WalRestoreData;
+
+extern WalRestoreData *WalRstr;
+
+#endif /* _WALRESTORE_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index df3df29..c316dcc 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -79,6 +79,7 @@ typedef enum LWLockId
SerializablePredicateLockListLock,
OldSerXidLock,
SyncRepLock,
+ WALRestoreCommandLock,
/* Individual lock IDs end here */
FirstBufMappingLock,
FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers