Ok, here's a new patch, addressing the issues Fujii raised, and with a bunch of stylistic changes of my own. Also, I committed a patch to remove silent_mode, so the fork_process() changes are now gone. I'm going to sleep on this and review it once again tomorrow, and commit if it still looks good to me and no-one else reports new issues.

There are two small issues left:

I don't like the names POSTMASTER_FD_WATCH and POSTMASTER_FD_OWN. At a quick glance, it's not at all clear which is which. I couldn't come up with better names, so for now I just added some comments to clarify that. I would find WRITE/READ clearer, but to make sense of that you need to know how the pipe is used. Any suggestions or opinions on that?

The BUGS section of Linux man page for select(2) says:

       Under Linux, select() may report a socket file descriptor as "ready for
       reading",  while nevertheless a subsequent read blocks.  This could for
       example happen when data has arrived but  upon  examination  has  wrong
       checksum and is discarded.  There may be other circumstances in which a
       file descriptor is spuriously reported as ready.  Thus it may be  safer
       to use O_NONBLOCK on sockets that should not block.

So in theory, on Linux WaitLatch might sometimes incorrectly return WL_POSTMASTER_DEATH. None of the callers check for the WL_POSTMASTER_DEATH return code; they call PostmasterIsAlive() before assuming the postmaster has died, so that won't affect correctness at the moment. I doubt that scenario can even happen in our case, where select() is called on a pipe that is never written to. But maybe we should add an assertion to WaitLatch to assert that if select() reports that the postmaster pipe has been closed, PostmasterIsAlive() also returns false.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 10165,10171 **** retry:
  					/*
  					 * Wait for more WAL to arrive, or timeout to be reached
  					 */
! 					WaitLatch(&XLogCtl->recoveryWakeupLatch, 5000000L);
  					ResetLatch(&XLogCtl->recoveryWakeupLatch);
  				}
  				else
--- 10165,10171 ----
  					/*
  					 * Wait for more WAL to arrive, or timeout to be reached
  					 */
! 					WaitLatch(&XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 5000000L);
  					ResetLatch(&XLogCtl->recoveryWakeupLatch);
  				}
  				else
*** a/src/backend/port/unix_latch.c
--- b/src/backend/port/unix_latch.c
***************
*** 93,98 ****
--- 93,99 ----
  #endif
  
  #include "miscadmin.h"
+ #include "postmaster/postmaster.h"
  #include "storage/latch.h"
  #include "storage/shmem.h"
  
***************
*** 179,209 **** DisownLatch(volatile Latch *latch)
   * Wait for given latch to be set or until timeout is exceeded.
   * If the latch is already set, the function returns immediately.
   *
!  * The 'timeout' is given in microseconds, and -1 means wait forever.
!  * On some platforms, signals cause the timeout to be restarted, so beware
!  * that the function can sleep for several times longer than the specified
!  * timeout.
   *
   * The latch must be owned by the current process, ie. it must be a
   * backend-local latch initialized with InitLatch, or a shared latch
   * associated with the current process by calling OwnLatch.
   *
!  * Returns 'true' if the latch was set, or 'false' if timeout was reached.
   */
! bool
! WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  /*
!  * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading or writing. Returns 0 if timeout was reached,
!  * 1 if the latch was set, 2 if the socket became readable or writable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	struct timeval tv,
  			   *tvp = NULL;
--- 180,211 ----
   * Wait for given latch to be set or until timeout is exceeded.
   * If the latch is already set, the function returns immediately.
   *
!  * The 'timeout' is given in microseconds. It must be >= 0 if WL_TIMEOUT
!  * event is given, otherwise it is ignored. On some platforms, signals cause
!  * the timeout to be restarted, so beware that the function can sleep for
!  * several times longer than the specified timeout.
   *
   * The latch must be owned by the current process, ie. it must be a
   * backend-local latch initialized with InitLatch, or a shared latch
   * associated with the current process by calling OwnLatch.
   *
!  * Returns bit field indicating which condition(s) caused the wake-up. Note
!  * that if multiple wake-up conditions are true, there is no guarantee that
!  * we return all of them in one call, but we will return at least one.
   */
! int
! WaitLatch(volatile Latch *latch, int wakeEvents, long timeout)
  {
! 	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout);
  }
  
  /*
!  * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
!  * conditions.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
! 				  long timeout)
  {
  	struct timeval tv,
  			   *tvp = NULL;
***************
*** 212,230 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
  	int			rc;
  	int			result = 0;
  
  	if (latch->owner_pid != MyProcPid)
  		elog(ERROR, "cannot wait on a latch owned by another process");
  
  	/* Initialize timeout */
! 	if (timeout >= 0)
  	{
  		tv.tv_sec = timeout / 1000000L;
  		tv.tv_usec = timeout % 1000000L;
  		tvp = &tv;
  	}
  
  	waiting = true;
! 	for (;;)
  	{
  		int			hifd;
  
--- 214,239 ----
  	int			rc;
  	int			result = 0;
  
+ 	Assert(wakeEvents != 0);
+ 
+ 	/* Ignore WL_SOCKET_* events if no valid socket is given */
+ 	if (sock == PGINVALID_SOCKET)
+ 		wakeEvents &= ~(WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
+ 
  	if (latch->owner_pid != MyProcPid)
  		elog(ERROR, "cannot wait on a latch owned by another process");
  
  	/* Initialize timeout */
! 	if (wakeEvents & WL_TIMEOUT)
  	{
+ 		Assert(timeout >= 0);
  		tv.tv_sec = timeout / 1000000L;
  		tv.tv_usec = timeout % 1000000L;
  		tvp = &tv;
  	}
  
  	waiting = true;
! 	do
  	{
  		int			hifd;
  
***************
*** 235,250 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
  		 * do that), and the select() will return immediately.
  		 */
  		drainSelfPipe();
! 		if (latch->is_set)
  		{
! 			result = 1;
  			break;
  		}
  
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET && forRead)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
--- 244,271 ----
  		 * do that), and the select() will return immediately.
  		 */
  		drainSelfPipe();
! 		if (latch->is_set && (wakeEvents & WL_LATCH_SET))
  		{
! 			result |= WL_LATCH_SET;
! 			/*
! 			 * Leave loop immediately, avoid blocking again. We don't attempt
! 			 * to report any other events that might also be satisfied.
! 			 */
  			break;
  		}
  
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 
! 		if (wakeEvents & WL_POSTMASTER_DEATH)
! 		{
! 			FD_SET(postmaster_alive_fds[POSTMASTER_FD_WATCH], &input_mask);
! 			if (postmaster_alive_fds[POSTMASTER_FD_WATCH] > hifd)
! 				hifd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
! 		}
! 
! 		if (wakeEvents & WL_SOCKET_READABLE)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
***************
*** 252,265 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
  		}
  
  		FD_ZERO(&output_mask);
! 		if (sock != PGINVALID_SOCKET && forWrite)
  		{
  			FD_SET(sock, &output_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
  		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
--- 273,289 ----
  		}
  
  		FD_ZERO(&output_mask);
! 		if (wakeEvents & WL_SOCKET_WRITEABLE)
  		{
  			FD_SET(sock, &output_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
+ 		/* Sleep */
  		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
+ 
+ 		/* Check return code */
  		if (rc < 0)
  		{
  			if (errno == EINTR)
***************
*** 268,287 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
  					(errcode_for_socket_access(),
  					 errmsg("select() failed: %m")));
  		}
! 		if (rc == 0)
  		{
  			/* timeout exceeded */
! 			result = 0;
! 			break;
  		}
! 		if (sock != PGINVALID_SOCKET &&
! 			((forRead && FD_ISSET(sock, &input_mask)) ||
! 			 (forWrite && FD_ISSET(sock, &output_mask))))
  		{
! 			result = 2;
! 			break;				/* data available in socket */
  		}
! 	}
  	waiting = false;
  
  	return result;
--- 292,317 ----
  					(errcode_for_socket_access(),
  					 errmsg("select() failed: %m")));
  		}
! 		if (rc == 0 && (wakeEvents & WL_TIMEOUT))
  		{
  			/* timeout exceeded */
! 			result |= WL_TIMEOUT;
  		}
! 		if ((wakeEvents & WL_SOCKET_READABLE) && FD_ISSET(sock, &input_mask))
  		{
! 			/* data available in socket */
! 			result |= WL_SOCKET_READABLE;
  		}
! 		if ((wakeEvents & WL_SOCKET_WRITEABLE) && FD_ISSET(sock, &output_mask))
! 		{
! 			result |= WL_SOCKET_WRITEABLE;
! 		}
! 		if ((wakeEvents & WL_POSTMASTER_DEATH) &&
! 			 FD_ISSET(postmaster_alive_fds[POSTMASTER_FD_WATCH], &input_mask))
! 		{
! 			result |= WL_POSTMASTER_DEATH;
! 		}
! 	} while(result == 0);
  	waiting = false;
  
  	return result;
*** a/src/backend/port/win32_latch.c
--- b/src/backend/port/win32_latch.c
***************
*** 23,28 ****
--- 23,29 ----
  #include <unistd.h>
  
  #include "miscadmin.h"
+ #include "postmaster/postmaster.h"
  #include "replication/walsender.h"
  #include "storage/latch.h"
  #include "storage/shmem.h"
***************
*** 81,123 **** DisownLatch(volatile Latch *latch)
  	latch->owner_pid = 0;
  }
  
! bool
! WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	DWORD		rc;
! 	HANDLE		events[3];
  	HANDLE		latchevent;
! 	HANDLE		sockevent = WSA_INVALID_EVENT;	/* silence compiler */
  	int			numevents;
  	int			result = 0;
  
  	latchevent = latch->event;
  
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
  		int			flags = 0;
  
! 		if (forRead)
  			flags |= FD_READ;
! 		if (forWrite)
  			flags |= FD_WRITE;
  
  		sockevent = WSACreateEvent();
  		WSAEventSelect(sock, sockevent, flags);
  		events[numevents++] = sockevent;
  	}
  
! 	for (;;)
  	{
  		/*
  		 * Reset the event, and check if the latch is set already. If someone
--- 82,148 ----
  	latch->owner_pid = 0;
  }
  
! int
! WaitLatch(volatile Latch *latch, int wakeEvents, long timeout)
  {
! 	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout);
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, SOCKET sock,
! 				  long timeout)
  {
  	DWORD		rc;
! 	HANDLE		events[4];
  	HANDLE		latchevent;
! 	HANDLE		sockevent = WSA_INVALID_EVENT;
  	int			numevents;
  	int			result = 0;
+ 	int			pmdeath_eventno;
+ 	long		timeout_ms;
+ 
+ 	Assert(wakeEvents != 0);
+ 
+ 	/* Ignore WL_SOCKET_* events if no valid socket is given */
+ 	if (sock == PGINVALID_SOCKET)
+ 		wakeEvents &= ~(WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
+ 
+ 	/* Convert timeout to milliseconds for WaitForMultipleObjects() */
+ 	if (wakeEvents & WL_TIMEOUT)
+ 	{
+ 		Assert(timeout >= 0);
+ 		timeout_ms = timeout / 1000;
+ 	}
+ 	else
+ 		timeout_ms = INFINITE;
  
+ 	/* Construct an array of event handles for WaitforMultipleObjects() */
  	latchevent = latch->event;
  
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (((wakeEvents & WL_SOCKET_READABLE) ||
! 		 (wakeEvents & WL_SOCKET_WRITEABLE)))
  	{
  		int			flags = 0;
  
! 		if (wakeEvents & WL_SOCKET_READABLE)
  			flags |= FD_READ;
! 		if (wakeEvents & WL_SOCKET_WRITEABLE)
  			flags |= FD_WRITE;
  
  		sockevent = WSACreateEvent();
  		WSAEventSelect(sock, sockevent, flags);
  		events[numevents++] = sockevent;
  	}
+ 	if (wakeEvents & WL_POSTMASTER_DEATH)
+ 	{
+ 		pmdeath_eventno = numevents;
+ 		events[numevents++] = PostmasterHandle;
+ 	}
  
! 	do
  	{
  		/*
  		 * Reset the event, and check if the latch is set already. If someone
***************
*** 127,171 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
  		 */
  		if (!ResetEvent(latchevent))
  			elog(ERROR, "ResetEvent failed: error code %d", (int) GetLastError());
! 		if (latch->is_set)
  		{
! 			result = 1;
  			break;
  		}
  
! 		rc = WaitForMultipleObjects(numevents, events, FALSE,
! 							   (timeout >= 0) ? (timeout / 1000) : INFINITE);
  		if (rc == WAIT_FAILED)
  			elog(ERROR, "WaitForMultipleObjects() failed: error code %d", (int) GetLastError());
  		else if (rc == WAIT_TIMEOUT)
  		{
! 			result = 0;
! 			break;
  		}
! 		else if (rc == WAIT_OBJECT_0 + 1)
! 			pgwin32_dispatch_queued_signals();
! 		else if (rc == WAIT_OBJECT_0 + 2)
  		{
  			WSANETWORKEVENTS resEvents;
  
- 			Assert(sock != PGINVALID_SOCKET);
- 
  			ZeroMemory(&resEvents, sizeof(resEvents));
  			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
  				ereport(FATAL,
  						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
  
! 			if ((forRead && resEvents.lNetworkEvents & FD_READ) ||
! 				(forWrite && resEvents.lNetworkEvents & FD_WRITE))
! 				result = 2;
! 			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
  			elog(ERROR, "unexpected return code from WaitForMultipleObjects(): %d", (int) rc);
  	}
  
  	/* Clean up the handle we created for the socket */
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
--- 152,215 ----
  		 */
  		if (!ResetEvent(latchevent))
  			elog(ERROR, "ResetEvent failed: error code %d", (int) GetLastError());
! 		if (latch->is_set && (wakeEvents & WL_LATCH_SET))
  		{
! 			result |= WL_LATCH_SET;
! 			/*
! 			 * Leave loop immediately, avoid blocking again. We don't attempt
! 			 * to report any other events that might also be satisfied.
! 			 */
  			break;
  		}
  
! 		rc = WaitForMultipleObjects(numevents, events, FALSE, timeout_ms);
! 
  		if (rc == WAIT_FAILED)
  			elog(ERROR, "WaitForMultipleObjects() failed: error code %d", (int) GetLastError());
+ 
+ 		/* Participate in Windows signal emulation */
+ 		else if (rc == WAIT_OBJECT_0 + 1)
+ 			pgwin32_dispatch_queued_signals();
+ 
+ 		else if ((wakeEvents & WL_POSTMASTER_DEATH) &&
+ 			rc == WAIT_OBJECT_0 + pmdeath_eventno)
+ 		{
+ 			/* Postmaster died */
+ 			result |= WL_POSTMASTER_DEATH;
+ 		}
  		else if (rc == WAIT_TIMEOUT)
  		{
! 			result |= WL_TIMEOUT;
  		}
! 		else if ((wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) != 0 &&
! 				 rc == WAIT_OBJECT_0 + 2)	/* socket is at event slot 2 */
  		{
  			WSANETWORKEVENTS resEvents;
  
  			ZeroMemory(&resEvents, sizeof(resEvents));
  			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
  				ereport(FATAL,
  						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
  
! 			if ((wakeEvents & WL_SOCKET_READABLE) &&
! 				(resEvents.lNetworkEvents & FD_READ))
! 			{
! 				result |= WL_SOCKET_READABLE;
! 			}
! 			if ((wakeEvents & WL_SOCKET_WRITEABLE) &&
! 				(resEvents.lNetworkEvents & FD_WRITE))
! 			{
! 				result |= WL_SOCKET_WRITEABLE;
! 			}
  		}
+ 		/* Otherwise it must be the latch event */
  		else if (rc != WAIT_OBJECT_0)
  			elog(ERROR, "unexpected return code from WaitForMultipleObjects(): %d", (int) rc);
  	}
+ 	while(result == 0);
  
  	/* Clean up the handle we created for the socket */
! 	if (sockevent != WSA_INVALID_EVENT)
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
*** a/src/backend/postmaster/pgarch.c
--- b/src/backend/postmaster/pgarch.c
***************
*** 40,45 ****
--- 40,46 ----
  #include "postmaster/postmaster.h"
  #include "storage/fd.h"
  #include "storage/ipc.h"
+ #include "storage/latch.h"
  #include "storage/pg_shmem.h"
  #include "storage/pmsignal.h"
  #include "utils/guc.h"
***************
*** 87,92 **** static volatile sig_atomic_t got_SIGTERM = false;
--- 88,99 ----
  static volatile sig_atomic_t wakened = false;
  static volatile sig_atomic_t ready_to_stop = false;
  
+ /*
+  * Latch that archiver loop waits on until it is awakened by
+  * signals, each of which there is a handler for
+  */
+ static volatile Latch mainloop_latch;
+ 
  /* ----------
   * Local function forward declarations
   * ----------
***************
*** 228,233 **** PgArchiverMain(int argc, char *argv[])
--- 235,242 ----
  
  	MyProcPid = getpid();		/* reset MyProcPid */
  
+ 	InitLatch(&mainloop_latch); /* initialise latch used in main loop, now that we are a subprocess */
+ 
  	MyStartTime = time(NULL);	/* record Start Time for logging */
  
  	/*
***************
*** 282,287 **** ArchSigHupHandler(SIGNAL_ARGS)
--- 291,298 ----
  {
  	/* set flag to re-read config file at next convenient time */
  	got_SIGHUP = true;
+ 	/* Let the waiting loop iterate */
+ 	SetLatch(&mainloop_latch);
  }
  
  /* SIGTERM signal handler for archiver process */
***************
*** 295,300 **** ArchSigTermHandler(SIGNAL_ARGS)
--- 306,313 ----
  	 * archive commands.
  	 */
  	got_SIGTERM = true;
+ 	/* Let the waiting loop iterate */
+ 	SetLatch(&mainloop_latch);
  }
  
  /* SIGUSR1 signal handler for archiver process */
***************
*** 303,308 **** pgarch_waken(SIGNAL_ARGS)
--- 316,323 ----
  {
  	/* set flag that there is work to be done */
  	wakened = true;
+ 	/* Let the waiting loop iterate */
+ 	SetLatch(&mainloop_latch);
  }
  
  /* SIGUSR2 signal handler for archiver process */
***************
*** 311,316 **** pgarch_waken_stop(SIGNAL_ARGS)
--- 326,333 ----
  {
  	/* set flag to do a final cycle and shut down afterwards */
  	ready_to_stop = true;
+ 	/* Let the waiting loop iterate */
+ 	SetLatch(&mainloop_latch);
  }
  
  /*
***************
*** 334,339 **** pgarch_MainLoop(void)
--- 351,363 ----
  
  	do
  	{
+ 		/*
+ 		 * There shouldn't be anything for the archiver to do except to wait
+ 		 * on a latch ... however, the archiver exists to protect our data,
+ 		 * so she wakes up occasionally to allow herself to be proactive.
+ 		 */
+ 		ResetLatch(&mainloop_latch);
+ 
  		/* When we get SIGUSR2, we do one more archive cycle, then exit */
  		time_to_stop = ready_to_stop;
  
***************
*** 371,395 **** pgarch_MainLoop(void)
  		}
  
  		/*
! 		 * There shouldn't be anything for the archiver to do except to wait
! 		 * for a signal ... however, the archiver exists to protect our data,
! 		 * so she wakes up occasionally to allow herself to be proactive.
  		 *
! 		 * On some platforms, signals won't interrupt the sleep.  To ensure we
! 		 * respond reasonably promptly when someone signals us, break down the
! 		 * sleep into 1-second increments, and check for interrupts after each
! 		 * nap.
  		 */
- 		while (!(wakened || ready_to_stop || got_SIGHUP ||
- 				 !PostmasterIsAlive(true)))
- 		{
- 			time_t		curtime;
  
! 			pg_usleep(1000000L);
  			curtime = time(NULL);
  			if ((unsigned int) (curtime - last_copy_time) >=
  				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
! 				wakened = true;
  		}
  
  		/*
--- 395,421 ----
  		}
  
  		/*
! 		 * Wait on latch, until various signals are received, or
! 		 * until a poll will be forced by PGARCH_AUTOWAKE_INTERVAL
! 		 * having passed since last_copy_time, or on the postmaster's
! 		 * untimely demise.
  		 *
! 		 * The caveat about signals resetting the timeout of
! 		 * WaitLatch()/select() on some platforms can be safely disregarded,
! 		 * because we handle all expected signals, and all handlers
! 		 * call SetLatch() where that matters anyway
  		 */
  
! 		if (!time_to_stop) /* Don't wait during last iteration */
! 		{
! 			time_t		 curtime = time(NULL);
! 			unsigned int timeout_secs  = (unsigned int) PGARCH_AUTOWAKE_INTERVAL -
! 					(unsigned int) (curtime - last_copy_time);
! 			WaitLatch(&mainloop_latch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, timeout_secs * 1000000L);
  			curtime = time(NULL);
  			if ((unsigned int) (curtime - last_copy_time) >=
  				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
! 				wakened = true; /* wakened by timeout - this wasn't a SIGHUP, etc */
  		}
  
  		/*
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
***************
*** 368,373 **** static int	CountChildren(int target);
--- 368,374 ----
  static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
  static pid_t StartChildProcess(AuxProcType type);
  static void StartAutovacuumWorker(void);
+ static void InitPostmasterDeathWatchHandle(void);
  
  #ifdef EXEC_BACKEND
  
***************
*** 383,390 **** typedef struct
  	HANDLE		procHandle;
  	DWORD		procId;
  } win32_deadchild_waitinfo;
- 
- HANDLE		PostmasterHandle;
  #endif
  
  static pid_t backend_forkexec(Port *port);
--- 384,389 ----
***************
*** 439,444 **** typedef struct
--- 438,444 ----
  	HANDLE		initial_signal_pipe;
  	HANDLE		syslogPipe[2];
  #else
+ 	int			postmaster_alive_fds[2];
  	int			syslogPipe[2];
  #endif
  	char		my_exec_path[MAXPGPATH];
***************
*** 469,474 **** static void ShmemBackendArrayRemove(Backend *bn);
--- 469,484 ----
  #define EXIT_STATUS_0(st)  ((st) == 0)
  #define EXIT_STATUS_1(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 1)
  
+ /*
+  * File descriptors for pipe used to monitor if postmaster is alive.
+  * First is POSTMASTER_FD_WATCH, second is POSTMASTER_FD_OWN.
+  */
+ #ifndef WIN32
+ int postmaster_alive_fds[2] = { -1, -1 };
+ #else
+ /* Process handle of postmaster used for the same purpose on Windows */
+ HANDLE		PostmasterHandle;
+ #endif
  
  /*
   * Postmaster main entry point
***************
*** 962,969 **** PostmasterMain(int argc, char *argv[])
  	 */
  	BackendList = DLNewList();
  
! #ifdef WIN32
  
  	/*
  	 * Initialize I/O completion port used to deliver list of dead children.
  	 */
--- 972,984 ----
  	 */
  	BackendList = DLNewList();
  
! 	/*
! 	 * Initialize pipe (or process handle on Windows) that allows children to
! 	 * wake up from sleep on postmaster death.
! 	 */
! 	InitPostmasterDeathWatchHandle();
  
+ #ifdef WIN32
  	/*
  	 * Initialize I/O completion port used to deliver list of dead children.
  	 */
***************
*** 971,991 **** PostmasterMain(int argc, char *argv[])
  	if (win32ChildQueue == NULL)
  		ereport(FATAL,
  		   (errmsg("could not create I/O completion port for child queue")));
- 
- 	/*
- 	 * Set up a handle that child processes can use to check whether the
- 	 * postmaster is still running.
- 	 */
- 	if (DuplicateHandle(GetCurrentProcess(),
- 						GetCurrentProcess(),
- 						GetCurrentProcess(),
- 						&PostmasterHandle,
- 						0,
- 						TRUE,
- 						DUPLICATE_SAME_ACCESS) == 0)
- 		ereport(FATAL,
- 				(errmsg_internal("could not duplicate postmaster handle: error code %d",
- 								 (int) GetLastError())));
  #endif
  
  	/*
--- 986,991 ----
***************
*** 1965,1970 **** ClosePostmasterPorts(bool am_syslogger)
--- 1965,1983 ----
  {
  	int			i;
  
+ #ifndef WIN32
+ 	/*
+ 	 * Close the write end of postmaster death watch pipe. It's important to
+ 	 * do this as early as possible, so that if postmaster dies, others won't
+ 	 * think that it's still running because we're holding the pipe open.
+ 	 */
+ 	if (close(postmaster_alive_fds[POSTMASTER_FD_OWN]))
+ 		ereport(FATAL,
+ 			(errcode_for_file_access(),
+ 			 errmsg_internal("could not close postmaster death monitoring pipe in child process: %m")));
+ 	postmaster_alive_fds[POSTMASTER_FD_OWN] = -1;
+ #endif
+ 
  	/* Close the listen sockets */
  	for (i = 0; i < MAXLISTEN; i++)
  	{
***************
*** 4643,4648 **** save_backend_variables(BackendParameters *param, Port *port,
--- 4656,4664 ----
  								 pgwin32_create_signal_listener(childPid),
  								 childProcess))
  		return false;
+ #else
+ 	memcpy(&param->postmaster_alive_fds, &postmaster_alive_fds,
+ 		   sizeof(postmaster_alive_fds));
  #endif
  
  	memcpy(&param->syslogPipe, &syslogPipe, sizeof(syslogPipe));
***************
*** 4858,4863 **** restore_backend_variables(BackendParameters *param, Port *port)
--- 4874,4882 ----
  #ifdef WIN32
  	PostmasterHandle = param->PostmasterHandle;
  	pgwin32_initial_signal_pipe = param->initial_signal_pipe;
+ #else
+ 	memcpy(&postmaster_alive_fds, &param->postmaster_alive_fds,
+ 		   sizeof(postmaster_alive_fds));
  #endif
  
  	memcpy(&syslogPipe, &param->syslogPipe, sizeof(syslogPipe));
***************
*** 4979,4981 **** pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired)
--- 4998,5051 ----
  }
  
  #endif   /* WIN32 */
+ 
+ /*
+  * Initialize one and only handle for monitoring postmaster death.
+  *
+  * Called once in the postmaster, so that child processes can subsequently
+  * monitor if their parent is dead.
+  */
+ static void
+ InitPostmasterDeathWatchHandle(void)
+ {
+ #ifndef WIN32
+ 	/*
+ 	 * Create a pipe. Postmaster holds the write end of the pipe open
+ 	 * (POSTMASTER_FD_OWN), and children hold the read end. Children can
+ 	 * pass the read file descriptor to select() to wake up in case postmaster
+ 	 * dies. Children must close the write end as soon as possible after
+ 	 * forking, because EOF won't be signaled in the read end until all
+ 	 * processes have closed the write fd. ClosePostmasterPorts() takes care
+ 	 * of closing the write fd.
+ 	 */
+ 	Assert(MyProcPid == PostmasterPid);
+ 	if (pipe(postmaster_alive_fds))
+ 		ereport(FATAL,
+ 				(errcode_for_file_access(),
+ 				 errmsg_internal("could not create pipe to monitor postmaster death: %m")));
+ 
+ 	/*
+ 	 * Set O_NONBLOCK to allow testing for the fd's presence with a read()
+ 	 * call.
+ 	 */
+ 	if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK))
+ 		ereport(FATAL,
+ 				(errcode_for_socket_access(),
+ 				 errmsg_internal("could not set postmaster death monitoring pipe to non-blocking mode: %m")));
+ 
+ #else
+ 	/*
+ 	 * On Windows, we use a process handle for the same purpose.
+ 	 */
+ 	if (DuplicateHandle(GetCurrentProcess(),
+ 						GetCurrentProcess(),
+ 						GetCurrentProcess(),
+ 						&PostmasterHandle,
+ 						0,
+ 						TRUE,
+ 						DUPLICATE_SAME_ACCESS) == 0)
+ 		ereport(FATAL,
+ 				(errmsg_internal("could not duplicate postmaster handle: error code %d",
+ 								 (int) GetLastError())));
+ #endif   /* WIN32 */
+ }
*** a/src/backend/replication/syncrep.c
--- b/src/backend/replication/syncrep.c
***************
*** 171,177 **** SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
  		 * postmaster death regularly while waiting. Note that timeout here
  		 * does not necessarily release from loop.
  		 */
! 		WaitLatch(&MyProc->waitLatch, 60000000L);
  
  		/* Must reset the latch before testing state. */
  		ResetLatch(&MyProc->waitLatch);
--- 171,177 ----
  		 * postmaster death regularly while waiting. Note that timeout here
  		 * does not necessarily release from loop.
  		 */
! 		WaitLatch(&MyProc->waitLatch, WL_LATCH_SET | WL_TIMEOUT, 60000000L);
  
  		/* Must reset the latch before testing state. */
  		ResetLatch(&MyProc->waitLatch);
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 779,784 **** WalSndLoop(void)
--- 779,785 ----
  		{
  			TimestampTz finish_time = 0;
  			long		sleeptime;
+ 			int			wakeEvents;
  
  			/* Reschedule replication timeout */
  			if (replication_timeout > 0)
***************
*** 805,813 **** WalSndLoop(void)
  			}
  
  			/* Sleep */
! 			WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 							  true, pq_is_send_pending(),
! 							  sleeptime * 1000L);
  
  			/* Check for replication timeout */
  			if (replication_timeout > 0 &&
--- 806,816 ----
  			}
  
  			/* Sleep */
! 			wakeEvents  = WL_LATCH_SET | WL_SOCKET_READABLE | WL_TIMEOUT;
! 			if (pq_is_send_pending())
! 				wakeEvents |= WL_SOCKET_WRITEABLE;
! 			WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents,
! 							  MyProcPort->sock, sleeptime * 1000L);
  
  			/* Check for replication timeout */
  			if (replication_timeout > 0 &&
*** a/src/include/postmaster/postmaster.h
--- b/src/include/postmaster/postmaster.h
***************
*** 33,38 **** extern bool restart_after_crash;
--- 33,46 ----
  
  #ifdef WIN32
  extern HANDLE PostmasterHandle;
+ #else
+ extern int postmaster_alive_fds[2];
+ /*
+  * Constants that represent which of postmaster_alive_fds is held by
+  * postmaster, and which is used in children to check for postmaster death.
+  */
+ #define POSTMASTER_FD_WATCH		0	/* used in children to check for postmaster death */
+ #define POSTMASTER_FD_OWN		1	/* kept open by postmaster only */
  #endif
  
  extern const char *progname;
*** a/src/include/storage/latch.h
--- b/src/include/storage/latch.h
***************
*** 38,46 **** extern void InitLatch(volatile Latch *latch);
  extern void InitSharedLatch(volatile Latch *latch);
  extern void OwnLatch(volatile Latch *latch);
  extern void DisownLatch(volatile Latch *latch);
! extern bool WaitLatch(volatile Latch *latch, long timeout);
! extern int WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  bool forRead, bool forWrite, long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  
--- 38,45 ----
  extern void InitSharedLatch(volatile Latch *latch);
  extern void OwnLatch(volatile Latch *latch);
  extern void DisownLatch(volatile Latch *latch);
! extern int WaitLatch(volatile Latch *latch, int wakeEvents, long timeout);
! extern int WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  
***************
*** 56,59 **** extern void latch_sigusr1_handler(void);
--- 55,65 ----
  #define latch_sigusr1_handler()
  #endif
  
+ /* Bitmasks for events that may wake-up WaitLatch() clients */
+ #define WL_LATCH_SET         (1 << 0)
+ #define WL_SOCKET_READABLE   (1 << 1)
+ #define WL_SOCKET_WRITEABLE  (1 << 2)
+ #define WL_TIMEOUT           (1 << 3)
+ #define WL_POSTMASTER_DEATH  (1 << 4)
+ 
  #endif   /* LATCH_H */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to