On 2021/03/23 15:50, Fujii Masao wrote:
+ * The statistics collector is started by the postmaster as soon as the
+ * startup subprocess finishes.
This comment needs to be updated? Because the stats collector can
be invoked when the startup process sends PMSIGNAL_BEGIN_HOT_STANDBY
signal.
I updated this comment in the patch.
Attached is the updated version of the patch.
Barring any objection, I'm thinking to commit this patch.
Regards,
--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION
diff --git a/src/backend/postmaster/interrupt.c
b/src/backend/postmaster/interrupt.c
index dd9136a942..d1b1f95400 100644
--- a/src/backend/postmaster/interrupt.c
+++ b/src/backend/postmaster/interrupt.c
@@ -64,9 +64,28 @@ SignalHandlerForConfigReload(SIGNAL_ARGS)
}
/*
- * Simple signal handler for exiting quickly as if due to a crash.
+ * Simple signal handler for processes which have not yet touched or do not
+ * touch shared memory to exit quickly.
*
- * Normally, this would be used for handling SIGQUIT.
+ * Note that if processes already touched shared memory, use
+ * SignalHandlerForCrashExit() instead and force the postmaster into
+ * a system reset cycle because shared memory may be corrupted.
+ */
+void
+SignalHandlerForNonCrashExit(SIGNAL_ARGS)
+{
+ /*
+ * Since we don't touch shared memory, we can just pull the plug and
exit
+ * without running any atexit handlers.
+ */
+ _exit(1);
+}
+
+/*
+ * Simple signal handler for processes which have touched shared memory to
+ * exit quickly.
+ *
+ * Normally, this would be used for handling SIGQUIT as if due to a crash.
*/
void
SignalHandlerForCrashExit(SIGNAL_ARGS)
@@ -93,9 +112,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS)
* shut down and exit.
*
* Typically, this handler would be used for SIGTERM, but some processes use
- * other signals. In particular, the checkpointer exits on SIGUSR2, the
- * stats collector on SIGQUIT, and the WAL writer exits on either SIGINT
- * or SIGTERM.
+ * other signals. In particular, the checkpointer and the stats collector exit
+ * on SIGUSR2, and the WAL writer exits on either SIGINT or SIGTERM.
*
* ShutdownRequestPending should be checked at a convenient place within the
* main loop, or else the main loop should call HandleMainLoopInterrupts.
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 60f45ccc4e..fd0af0f289 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1,7 +1,23 @@
/* ----------
* pgstat.c
*
- * All the statistics collector stuff hacked up in one big, ugly file.
+ * All the statistics collector stuff hacked up in one big, ugly file.
+ *
+ * The statistics collector is started by the postmaster as soon as the
+ * startup subprocess finishes, or as soon as the postmaster is ready
+ * to accept read only connections during archive recovery. It remains
+ * alive until the postmaster commands it to terminate. Normal
+ * termination is by SIGUSR2 after the checkpointer must exit(0),
+ * which instructs the statistics collector to save the final statistics
+ * to reuse at next startup and then exit(0).
+ * Emergency termination is by SIGQUIT; like any backend, the statistics
+ * collector will exit quickly without saving the final statistics. It's
+ * ok because the startup process will remove all statistics at next
+ * startup after emergency termination.
+ *
+ * Because the statistics collector doesn't touch shared memory, even if
+ * the statistics collector exits unexpectedly, the postmaster doesn't
+ * treat it as a crash. The postmaster will just try to restart a new one.
*
* TODO: - Separate collector, postmaster and backend stuff
* into different files.
@@ -724,6 +740,7 @@ pgstat_reset_remove_files(const char *directory)
snprintf(fname, sizeof(fname), "%s/%s", directory,
entry->d_name);
+ elog(DEBUG2, "removing stats file \"%s\"", fname);
unlink(fname);
}
FreeDir(dir);
@@ -4821,17 +4838,31 @@ PgstatCollectorMain(int argc, char *argv[])
/*
* Ignore all signals usually bound to some action in the postmaster,
- * except SIGHUP and SIGQUIT. Note we don't need a SIGUSR1 handler to
- * support latch operations, because we only use a local latch.
+ * except SIGHUP, SIGQUIT and SIGUSR2. Note we don't need a SIGUSR1
+ * handler to support latch operations, because we only use a local
latch.
+ *
+ * We deliberately ignore SIGTERM and exit in correct order because we
+ * want to collect the stats sent during the shutdown from all
processes.
+ * SIGTERM will be received during a standard Unix system shutdown cycle
+ * because init will SIGTERM all processes at once, and the postmaster
+ * will SIGTERM all processes at once when
recovery_target_action=shutdown
+ * and the startup process exits after reaching the recovery target. We
+ * want to wait for the checkpointer, which is the process sends the
stats
+ * finally, to exit, whereupon the postmaster will tell us it's okay to
+ * shut down (via SIGUSR2)
+ *
+ * If SIGQUIT is received, exit quickly without doing any additional
work,
+ * for example writing stats files. We arrange to do _exit(1) because
the
+ * stats collector doesn't touch shared memory.
*/
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGINT, SIG_IGN);
pqsignal(SIGTERM, SIG_IGN);
- pqsignal(SIGQUIT, SignalHandlerForShutdownRequest);
+ pqsignal(SIGQUIT, SignalHandlerForNonCrashExit);
pqsignal(SIGALRM, SIG_IGN);
pqsignal(SIGPIPE, SIG_IGN);
pqsignal(SIGUSR1, SIG_IGN);
- pqsignal(SIGUSR2, SIG_IGN);
+ pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);
/* Reset some signals that are accepted by postmaster but not here */
pqsignal(SIGCHLD, SIG_DFL);
PG_SETMASK(&UnBlockSig);
@@ -4852,8 +4883,8 @@ PgstatCollectorMain(int argc, char *argv[])
AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL);
/*
- * Loop to process messages until we get SIGQUIT or detect ungraceful
- * death of our parent postmaster.
+ * Loop to process messages until we get SIGUSR2, SIGQUIT or detect
+ * ungraceful death of our parent postmaster.
*
* For performance reasons, we don't want to do ResetLatch/WaitLatch
after
* every message; instead, do that only after a recv() fails to obtain a
@@ -4871,7 +4902,7 @@ PgstatCollectorMain(int argc, char *argv[])
ResetLatch(MyLatch);
/*
- * Quit if we get SIGQUIT from the postmaster.
+ * Quit if we get SIGUSR2 from the postmaster.
*/
if (ShutdownRequestPending)
break;
diff --git a/src/backend/postmaster/postmaster.c
b/src/backend/postmaster/postmaster.c
index ef0be4ca38..08792b0033 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -401,7 +401,6 @@ static void SIGHUP_handler(SIGNAL_ARGS);
static void pmdie(SIGNAL_ARGS);
static void reaper(SIGNAL_ARGS);
static void sigusr1_handler(SIGNAL_ARGS);
-static void process_startup_packet_die(SIGNAL_ARGS);
static void dummy_handler(SIGNAL_ARGS);
static void StartupPacketTimeoutHandler(void);
static void CleanupBackend(int pid, int exitstatus);
@@ -3085,7 +3084,7 @@ reaper(SIGNAL_ARGS)
* nothing left for it to do.
*/
if (PgStatPID != 0)
- signal_child(PgStatPID, SIGQUIT);
+ signal_child(PgStatPID, SIGUSR2);
}
else
{
@@ -4320,8 +4319,14 @@ BackendInitialize(Port *port)
* Exiting with _exit(1) is only possible because we have not yet
touched
* shared memory; therefore no outside-the-process state needs to get
* cleaned up.
+ *
+ * One might be tempted to try to send a message, or log one, indicating
+ * why we are disconnecting. However, that would be quite unsafe in
+ * itself. Also, it seems undesirable to provide clues about the
+ * database's state to a client that has not yet completed
authentication,
+ * or even sent us a startup packet.
*/
- pqsignal(SIGTERM, process_startup_packet_die);
+ pqsignal(SIGTERM, SignalHandlerForNonCrashExit);
/* SIGQUIT handler was already set up by InitPostmasterChild */
InitializeTimeouts(); /* establishes SIGALRM handler */
PG_SETMASK(&StartupBlockSig);
@@ -5274,25 +5279,6 @@ sigusr1_handler(SIGNAL_ARGS)
errno = save_errno;
}
-/*
- * SIGTERM while processing startup packet.
- *
- * Running proc_exit() from a signal handler would be quite unsafe.
- * However, since we have not yet touched shared memory, we can just
- * pull the plug and exit without running any atexit handlers.
- *
- * One might be tempted to try to send a message, or log one, indicating
- * why we are disconnecting. However, that would be quite unsafe in itself.
- * Also, it seems undesirable to provide clues about the database's state
- * to a client that has not yet completed authentication, or even sent us
- * a startup packet.
- */
-static void
-process_startup_packet_die(SIGNAL_ARGS)
-{
- _exit(1);
-}
-
/*
* Dummy signal handler
*
@@ -5309,7 +5295,7 @@ dummy_handler(SIGNAL_ARGS)
/*
* Timeout while processing startup packet.
- * As for process_startup_packet_die(), we exit via _exit(1).
+ * As for SignalHandlerForNonCrashExit(), we exit via _exit(1).
*/
static void
StartupPacketTimeoutHandler(void)
diff --git a/src/include/postmaster/interrupt.h
b/src/include/postmaster/interrupt.h
index 85a1293ef1..3f3dc19e24 100644
--- a/src/include/postmaster/interrupt.h
+++ b/src/include/postmaster/interrupt.h
@@ -26,6 +26,7 @@ extern PGDLLIMPORT volatile sig_atomic_t
ShutdownRequestPending;
extern void HandleMainLoopInterrupts(void);
extern void SignalHandlerForConfigReload(SIGNAL_ARGS);
+extern void SignalHandlerForNonCrashExit(SIGNAL_ARGS);
extern void SignalHandlerForCrashExit(SIGNAL_ARGS);
extern void SignalHandlerForShutdownRequest(SIGNAL_ARGS);