Attached is a patch that builds upon Florian Pflug's earlier proof of
concept program for monitoring the postmaster. The code creates a
non-blocking pipe in the postmaster that child processes block on
using a select() call. This all occurs in the latch code, which now
monitors postmaster death, but only for clients that request it (and,
almost invariably in addition to monitoring other things, like having
a timeout occur or a latch set).

I've implemented an interface originally sketched by Heikki that
allows clients to specify events to wake on, and to see what event
actually caused the wakeup when we're done by bitwise AND'ing the
returned int against various new bitmasks.

I've included my existing changes to the archiver as a convenience to
anyone that wants to quickly see the effects of the patch in action;
even though we don't have a tight loop that polls PostmasterIsAlive()
every second, we still wake up on postmaster death, so there is no
potential denial of service as previously described by Tom. This can
be easily observed by sending the postmaster SIGKILL while the
archiver is on - the archiver immediately finishes. Note that I've
deferred changing the existing call sites of WaitLatch()/
WaitLatchOrSocket(), except to make them use the new interface. Just
as before, they don't ask to be woken on postmaster death, even though
in some cases they probably should. Whether or not they should and how
they should are questions for another day though.

I expect that this patch will be split into two separate patches: The
latch patch (complete with currently missing win32 implementation) and
the archiver patch. For now, I'd like to hear thoughts on how I've
implemented the extra latch functionality.

How should I be handling the EXEC_BACKEND case?

-- 
Peter Geoghegan       http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training and Services
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e71090f..b1d38f5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -10150,7 +10150,7 @@ retry:
 					/*
 					 * Wait for more WAL to arrive, or timeout to be reached
 					 */
-					WaitLatch(&XLogCtl->recoveryWakeupLatch, 5000000L);
+					WaitLatch(&XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 5000000L);
 					ResetLatch(&XLogCtl->recoveryWakeupLatch);
 				}
 				else
diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index 6dae7c9..5bd389e 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -108,6 +108,19 @@ static void initSelfPipe(void);
 static void drainSelfPipe(void);
 static void sendSelfPipeByte(void);
 
+/* 
+ * Constants that represent which of a pair of fds given
+ * to pipe() is watched and owned in the context of 
+ * dealing with life sign file descriptors
+ */
+#define LIFESIGN_FD_WATCH 0
+#define LIFESIGN_FD_OWN 1
+
+/* 
+ * 2 file descriptors that represent postmaster lifesign.
+ * First is LIFESIGN_FD_WATCH, second is LIFESIGN_FD_OWN.
+ */
+static int life_sign_fds[2];
 
 /*
  * Initialize a backend-local latch.
@@ -188,22 +201,22 @@ DisownLatch(volatile Latch *latch)
  * backend-local latch initialized with InitLatch, or a shared latch
  * associated with the current process by calling OwnLatch.
  *
- * Returns 'true' if the latch was set, or 'false' if timeout was reached.
+ * Returns bit field indicating which condition(s) caused the wake-up.
  */
-bool
-WaitLatch(volatile Latch *latch, long timeout)
+int
+WaitLatch(volatile Latch *latch, int wakeEvents, long timeout)
 {
-	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
+	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout);
 }
 
 /*
  * Like WaitLatch, but will also return when there's data available in
- * 'sock' for reading or writing. Returns 0 if timeout was reached,
- * 1 if the latch was set, 2 if the socket became readable or writable.
+ * 'sock' for reading or writing.
+ *
+ * Returns bit field indicating which condition(s) caused the wake-up.
  */
 int
-WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
-				  bool forWrite, long timeout)
+WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, long timeout)
 {
 	struct timeval tv,
 			   *tvp = NULL;
@@ -211,12 +224,13 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
 	fd_set		output_mask;
 	int			rc;
 	int			result = 0;
+	bool		found = false;
 
 	if (latch->owner_pid != MyProcPid)
 		elog(ERROR, "cannot wait on a latch owned by another process");
 
 	/* Initialize timeout */
-	if (timeout >= 0)
+	if (timeout >= 0 && (wakeEvents & WL_TIMEOUT))
 	{
 		tv.tv_sec = timeout / 1000000L;
 		tv.tv_usec = timeout % 1000000L;
@@ -224,7 +238,7 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
 	}
 
 	waiting = true;
-	for (;;)
+	do
 	{
 		int			hifd;
 
@@ -235,16 +249,29 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
 		 * do that), and the select() will return immediately.
 		 */
 		drainSelfPipe();
-		if (latch->is_set)
-		{
-			result = 1;
-			break;
+		if (latch->is_set && (wakeEvents & WL_LATCH_SET))
+ 		{
+			result |= WL_LATCH_SET;
+			found = true;
+			/* Leave loop immediately, avoid blocking again.
+			 * Since latch is set, no other factor could have 
+			 * coincided that could make us wake up 
+			 * independently of the latch being set, so no
+			 * need to worry about having missed something.
+			 */
+			break; 
 		}
-
 		FD_ZERO(&input_mask);
 		FD_SET(selfpipe_readfd, &input_mask);
+
+		if (wakeEvents & WL_POSTMASTER_DEATH)
+		{
+			FD_SET(life_sign_fds[LIFESIGN_FD_WATCH], &input_mask);
+			if (life_sign_fds[LIFESIGN_FD_WATCH] > hifd)
+				hifd = life_sign_fds[LIFESIGN_FD_WATCH];
+		}
 		hifd = selfpipe_readfd;
-		if (sock != PGINVALID_SOCKET && forRead)
+		if (sock != PGINVALID_SOCKET && (wakeEvents & WL_SOCKET_READABLE))
 		{
 			FD_SET(sock, &input_mask);
 			if (sock > hifd)
@@ -252,7 +279,7 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
 		}
 
 		FD_ZERO(&output_mask);
-		if (sock != PGINVALID_SOCKET && forWrite)
+		if (sock != PGINVALID_SOCKET && (wakeEvents & WL_SOCKET_WRITEABLE))
 		{
 			FD_SET(sock, &output_mask);
 			if (sock > hifd)
@@ -268,20 +295,35 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
 					(errcode_for_socket_access(),
 					 errmsg("select() failed: %m")));
 		}
-		if (rc == 0)
+		if (rc == 0 && (wakeEvents & WL_TIMEOUT))
 		{
 			/* timeout exceeded */
-			result = 0;
-			break;
+			result |= WL_TIMEOUT;
+			found = true;
 		}
-		if (sock != PGINVALID_SOCKET &&
-			((forRead && FD_ISSET(sock, &input_mask)) ||
-			 (forWrite && FD_ISSET(sock, &output_mask))))
+		if (sock != PGINVALID_SOCKET)
 		{
-			result = 2;
-			break;				/* data available in socket */
+			if ((wakeEvents & WL_SOCKET_READABLE ) && FD_ISSET(sock, &input_mask))
+			{
+				result |= WL_SOCKET_READABLE;
+				found = true; /* data available in socket */
+			}
+			if ((wakeEvents & WL_SOCKET_WRITEABLE) && FD_ISSET(sock, &output_mask))
+			{
+				result |= WL_SOCKET_WRITEABLE;
+				found = true;
+			}
 		}
+		if ((wakeEvents & WL_POSTMASTER_DEATH) && 
+			 FD_ISSET(life_sign_fds[LIFESIGN_FD_WATCH], &input_mask) && 
+			 !PostmasterIsAlive(true))
+ 		{
+			result |= WL_POSTMASTER_DEATH;
+ 			found = true;
+ 		}
 	}
+	while(!found);
+
 	waiting = false;
 
 	return result;
@@ -430,3 +472,93 @@ drainSelfPipe(void)
 			elog(ERROR, "unexpected EOF on self-pipe");
 	}
 }
+
+/*
+ * Initialise a "life sign" from the postmaster; This is subsequently used 
+ * by child processes to monitor the postmaster. We open up an anoymous 
+ * pipe, and have child processes block on a select() call that examines 
+ * if the read file descriptor is ready for reading. They do so through a 
+ * latch.
+ *
+ * Child auxiliary processes are responsible for "releasing their parent's 
+ * lifesign", so that only the postmaster holds it and a select() on the fd
+ * returns upon the one and only holder (the postmaster) dying
+ *
+ * This is a trick that obviates the need for auxiliary backends to have tight 
+ * polling loops where they check if the postmaster is alive. We do this because 
+ * that pattern results in an excessive number of wakeups per second when idle.
+ */
+void 
+InitLifeSign(void)
+{
+	int flags;
+
+	/* 
+	 * Create pipe. The life sign is deemed to have vanished if
+	 * no process has the writing end (LIFESIGN_FD_OWN) open.
+	 *
+	 * In practice, this is caused by premature postmaster death.
+	 */
+	Assert(MyProcPid == PostmasterPid);
+	if (pipe(life_sign_fds)) 
+	{
+		ereport(FATAL,
+			(errcode_for_socket_access(),
+			 errmsg( "Failed to create life sign: %s", strerror(errno))));
+	}
+
+	flags = fcntl(life_sign_fds[LIFESIGN_FD_WATCH], F_GETFL);
+	if (flags < 0)
+	{
+		ereport(FATAL,
+			(errcode_for_socket_access(),
+			 errmsg("Failed to set the life sign's watching end's flags: %s", strerror(errno))));
+	}
+
+	/* 
+	 * Set FNONBLOCK to allow checking for the life sign's presence with a read() call
+	 * and FASYNC to deliver a signal to our process group if the life sign vanishes
+	 */
+	flags |= FNONBLOCK | FASYNC;
+	if (fcntl(life_sign_fds[LIFESIGN_FD_WATCH], F_SETFL, flags))
+	{	
+		ereport(FATAL,
+			(errcode_for_socket_access(),
+			 errmsg("Failed to set the life sign's watching end's flags: %s", strerror(errno))));
+	}
+ 
+	/* Send SIGIO signal to the whole process group */
+	if (fcntl(life_sign_fds[LIFESIGN_FD_WATCH], F_SETOWN, -getpgrp())) 
+	{
+		ereport(FATAL,
+			(errcode_for_socket_access(),
+			 errmsg("Failed to set the life sign's watching end's notification pid: %s", strerror(errno))));
+	}
+}
+
+/*
+ * Release ownership of parent lifesign
+ *
+ * Important: This must be called immediately after an auxiliary
+ * process forks from the postmaster. Otherwise, latch clients
+ * will not wake up on postmaster death, even if they have
+ * requested to.
+ *
+ * It is safe to not call ReleaseParentLifeSign() at that time, 
+ * provided that the process will never wait on a latch with
+ * the WL_POSTMASTER_DEATH flag set.
+ */
+void
+ReleaseParentLifesign(void)
+{
+	Assert(MyProcPid != PostmasterPid);
+	/* Release parent's life sign - only postmaster should hold it */
+	if (close(life_sign_fds[LIFESIGN_FD_OWN])) 
+	{
+		ereport(FATAL,
+			(errcode_for_socket_access(),
+			 errmsg("Failed to close file descriptor associated with Postmaster life sign in child process %d", MyProcPid)));
+	}
+	life_sign_fds[LIFESIGN_FD_OWN] = -1;
+}
+
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index b40375a..65f9ed9 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -40,6 +40,7 @@
 #include "postmaster/postmaster.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/latch.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "utils/guc.h"
@@ -87,6 +88,12 @@ static volatile sig_atomic_t got_SIGTERM = false;
 static volatile sig_atomic_t wakened = false;
 static volatile sig_atomic_t ready_to_stop = false;
 
+/*
+ * Latch that archiver loop waits on until it is awakened by 
+ * signals, each of which there is a handler for
+ */
+static volatile Latch mainloop_latch;
+
 /* ----------
  * Local function forward declarations
  * ----------
@@ -228,6 +235,8 @@ PgArchiverMain(int argc, char *argv[])
 
 	MyProcPid = getpid();		/* reset MyProcPid */
 
+	InitLatch(&mainloop_latch); /* initialise latch used in main loop, now that we are a subprocess */
+
 	MyStartTime = time(NULL);	/* record Start Time for logging */
 
 	/*
@@ -263,6 +272,12 @@ PgArchiverMain(int argc, char *argv[])
 	 */
 	init_ps_display("archiver process", "", "", "");
 
+	/* 
+	 * We still own the parent lifesign. Release - 
+	 * only the postmaster should have this.
+	 */
+	ReleaseParentLifesign();
+
 	pgarch_MainLoop();
 
 	exit(0);
@@ -282,6 +297,8 @@ ArchSigHupHandler(SIGNAL_ARGS)
 {
 	/* set flag to re-read config file at next convenient time */
 	got_SIGHUP = true;
+	/* Let the waiting loop iterate */
+	SetLatch(&mainloop_latch);
 }
 
 /* SIGTERM signal handler for archiver process */
@@ -295,6 +312,8 @@ ArchSigTermHandler(SIGNAL_ARGS)
 	 * archive commands.
 	 */
 	got_SIGTERM = true;
+	/* Let the waiting loop iterate */
+	SetLatch(&mainloop_latch);
 }
 
 /* SIGUSR1 signal handler for archiver process */
@@ -303,6 +322,8 @@ pgarch_waken(SIGNAL_ARGS)
 {
 	/* set flag that there is work to be done */
 	wakened = true;
+	/* Let the waiting loop iterate */
+	SetLatch(&mainloop_latch);
 }
 
 /* SIGUSR2 signal handler for archiver process */
@@ -311,6 +332,8 @@ pgarch_waken_stop(SIGNAL_ARGS)
 {
 	/* set flag to do a final cycle and shut down afterwards */
 	ready_to_stop = true;
+	/* Let the waiting loop iterate */
+	SetLatch(&mainloop_latch);
 }
 
 /*
@@ -334,6 +357,13 @@ pgarch_MainLoop(void)
 
 	do
 	{
+		/*
+		 * There shouldn't be anything for the archiver to do except to wait
+		 * on a latch ... however, the archiver exists to protect our data,
+		 * so she wakes up occasionally to allow herself to be proactive.
+		 */
+		ResetLatch(&mainloop_latch);
+
 		/* When we get SIGUSR2, we do one more archive cycle, then exit */
 		time_to_stop = ready_to_stop;
 
@@ -370,28 +400,28 @@ pgarch_MainLoop(void)
 			last_copy_time = time(NULL);
 		}
 
-		/*
-		 * There shouldn't be anything for the archiver to do except to wait
-		 * for a signal ... however, the archiver exists to protect our data,
-		 * so she wakes up occasionally to allow herself to be proactive.
+		/* 
+		 * Wait on latch, until various signals are received, or 
+		 * until a poll will be forced by PGARCH_AUTOWAKE_INTERVAL
+		 * having passed since last_copy_time
 		 *
-		 * On some platforms, signals won't interrupt the sleep.  To ensure we
-		 * respond reasonably promptly when someone signals us, break down the
-		 * sleep into 1-second increments, and check for interrupts after each
-		 * nap.
+		 * The caveat about signals invalidating the timeout of 
+		 * WaitLatch() on some platforms can be safely disregarded, 
+		 * because we handle all expected signals, and all handlers 
+		 * call SetLatch() where that matters anyway
 		 */
-		while (!(wakened || ready_to_stop || got_SIGHUP ||
-				 !PostmasterIsAlive(true)))
-		{
-			time_t		curtime;
 
-			pg_usleep(1000000L);
+		if (!time_to_stop) /* Don't wait during last iteration */
+		{
+			time_t		 curtime = time(NULL);	
+			unsigned int timeout_secs  = (unsigned int) PGARCH_AUTOWAKE_INTERVAL - 
+					(unsigned int) (curtime - last_copy_time);
+			WaitLatch(&mainloop_latch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, timeout_secs * 1000000L);
 			curtime = time(NULL);
 			if ((unsigned int) (curtime - last_copy_time) >=
 				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
-				wakened = true;
+				wakened = true; /* wakened by timeout - this wasn't a SIGHUP, etc */
 		}
-
 		/*
 		 * The archiver quits either when the postmaster dies (not expected)
 		 * or after completing one more archiving cycle after receiving
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 1e2aa9f..a87e05f 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -111,6 +111,7 @@
 #include "replication/walsender.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/latch.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
@@ -356,6 +357,7 @@ static void RandomSalt(char *md5Salt);
 static void signal_child(pid_t pid, int signal);
 static bool SignalSomeChildren(int signal, int targets);
 
+
 #define SignalChildren(sig)			   SignalSomeChildren(sig, BACKEND_TYPE_ALL)
 
 /*
@@ -492,6 +494,12 @@ PostmasterMain(int argc, char *argv[])
 	IsPostmasterEnvironment = true;
 
 	/*
+	 * Initialise lifesign so that waiting latch clients can know
+	 * if the postmaster has died 
+	 */
+	InitLifeSign();
+
+	/*
 	 * for security, no dir or file created can be group or other accessible
 	 */
 	umask(S_IRWXG | S_IRWXO);
diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index 1d4df8a..efda804 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -171,7 +171,7 @@ SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
 		 * postmaster death regularly while waiting. Note that timeout here
 		 * does not necessarily release from loop.
 		 */
-		WaitLatch(&MyProc->waitLatch, 60000000L);
+		WaitLatch(&MyProc->waitLatch, WL_LATCH_SET | WL_TIMEOUT, 60000000L);
 
 		/* Must reset the latch before testing state. */
 		ResetLatch(&MyProc->waitLatch);
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 470e6d1..27cc350 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -805,8 +805,9 @@ WalSndLoop(void)
 			}
 
 			/* Sleep */
-			WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
-							  true, pq_is_send_pending(),
+			WaitLatchOrSocket(&MyWalSnd->latch,
+							  WL_LATCH_SET | WL_SOCKET_READABLE | (pq_is_send_pending()? WL_SOCKET_WRITEABLE:0) |  WL_TIMEOUT,
+							  MyProcPort->sock,
 							  sleeptime * 1000L);
 
 			/* Check for replication timeout */
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 03ec071..5a8e4b4 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -38,11 +38,12 @@ extern void InitLatch(volatile Latch *latch);
 extern void InitSharedLatch(volatile Latch *latch);
 extern void OwnLatch(volatile Latch *latch);
 extern void DisownLatch(volatile Latch *latch);
-extern bool WaitLatch(volatile Latch *latch, long timeout);
-extern int WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
-				  bool forRead, bool forWrite, long timeout);
+extern int WaitLatch(volatile Latch *latch, int wakeEvents, long timeout);
+extern int WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, long timeout);
 extern void SetLatch(volatile Latch *latch);
 extern void ResetLatch(volatile Latch *latch);
+extern void InitLifeSign(void);
+extern void ReleaseParentLifesign(void);
 
 #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
 
@@ -56,4 +57,11 @@ extern void latch_sigusr1_handler(void);
 #define latch_sigusr1_handler()
 #endif
 
+/* Bitmasks for events that may wake-up WaitLatch() clients */
+#define WL_LATCH_SET         (1 << 0)
+#define WL_SOCKET_READABLE   (1 << 1)
+#define WL_SOCKET_WRITEABLE  (1 << 2)
+#define WL_TIMEOUT           (1 << 3)
+#define WL_POSTMASTER_DEATH  (1 << 4)
+
 #endif   /* LATCH_H */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to