On 2015-12-26 12:22:48 +0100, Andres Freund wrote:
> On 2015-12-25 16:29:53 -0500, Tom Lane wrote:
> > Andres Freund <and...@anarazel.de> writes:
> > > There's a couple solutions I can think of to that problem:
> > > 1) Use epoll()/kqueue, or other similar interfaces that don't require
> > >    re-registering fds at every invocation. My guess is that that'd be
> > >    desirable for performance anyway.
> >
> > Portability, on the other hand, would be problematic.
>
> Indeed. But we might be able to get away with it because there's
> realistically just one platform on which people run four-socket
> servers. Obviously we'd leave poll and select support in place.  It'd be
> a genuine improvement for less extreme loads on linux, too.

I finally got back to working on this. Attached is a WIP patch series
implementing:
0001: Allow to easily choose between the readiness primitives in unix_latch.c
      Pretty helpful for testing, not useful for anything else.
0002: Error out if waiting on socket readiness without a specified socket.
0003: Only clear unix_latch.c's self-pipe if it actually contains data.
      ~2% on high qps workloads
0004: Support using epoll as the polling primitive in unix_latch.c.
      ~3% on high qps workloads, massive scalability improvements (x3)
      on very large machines.

With 0004 obviously being the relevant bit for this thread. I verified
that using epoll addresses the performance problem, using the hardware
the OP noticed the performance problem on.

The reason I went with using epoll over the PR_SET_PDEATHSIG approach is
that it provides semantics that are more similar to the other platforms,
while being just as platform dependent as PR_SET_PDEATHSIG. It also is
actually measurably faster, at least here.

0004 currently contains one debatable optimization, which I'd like to
discuss: Currently the 'sock' passed to WaitLatchOrSocket is not
removed/added to the epoll fd, if it's the numerically same as in the
last call. That's good for performance, but would be wrong if the socket
were closed and a new one with the same value were waited on.  I
think a big warning sign somewhere is sufficient to deal with that
problem - it's not something we're likely to start doing. And even if
it's done at some point, we can just offer an API to reset the last used
socket fd.


Unless somebody comes up with a platform independent way of addressing
this, I'm inclined to press forward using epoll(). Opinions?

Andres
>From fb67ecf2f6f65525af1ed7c5d5e5dd46e8fa6fc4 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 14 Jan 2016 14:17:43 +0100
Subject: [PATCH 1/4] Make it easier to choose the used waiting primitive in
 unix_latch.c.

---
 src/backend/port/unix_latch.c | 50 +++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index 2ad609c..f52704b 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -56,6 +56,22 @@
 #include "storage/pmsignal.h"
 #include "storage/shmem.h"
 
+/*
+ * Select the fd readiness primitive to use. Normally the "most modern"
+ * primitive supported by the OS will be used, but for testing it can be
+ * useful to manually specify the used primitive.  If desired, just add a
+ * define somewhere before this block.
+ */
+#if defined(LATCH_USE_POLL) || defined(LATCH_USE_SELECT)
+/* don't overwrite manual choice */
+#elif defined(HAVE_POLL)
+#define LATCH_USE_POLL
+#elif HAVE_SYS_SELECT_H
+#define LATCH_USE_SELECT
+#else
+#error "no latch implementation available"
+#endif
+
 /* Are we currently in WaitLatch? The signal handler would like to know. */
 static volatile sig_atomic_t waiting = false;
 
@@ -215,10 +231,10 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 				cur_time;
 	long		cur_timeout;
 
-#ifdef HAVE_POLL
+#if defined(LATCH_USE_POLL)
 	struct pollfd pfds[3];
 	int			nfds;
-#else
+#elif defined(LATCH_USE_SELECT)
 	struct timeval tv,
 			   *tvp;
 	fd_set		input_mask;
@@ -247,7 +263,7 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		Assert(timeout >= 0 && timeout <= INT_MAX);
 		cur_timeout = timeout;
 
-#ifndef HAVE_POLL
+#ifdef LATCH_USE_SELECT
 		tv.tv_sec = cur_timeout / 1000L;
 		tv.tv_usec = (cur_timeout % 1000L) * 1000L;
 		tvp = &tv;
@@ -257,7 +273,7 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 	{
 		cur_timeout = -1;
 
-#ifndef HAVE_POLL
+#ifdef LATCH_USE_SELECT
 		tvp = NULL;
 #endif
 	}
@@ -291,16 +307,10 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		}
 
 		/*
-		 * Must wait ... we use poll(2) if available, otherwise select(2).
-		 *
-		 * On at least older linux kernels select(), in violation of POSIX,
-		 * doesn't reliably return a socket as writable if closed - but we
-		 * rely on that. So far all the known cases of this problem are on
-		 * platforms that also provide a poll() implementation without that
-		 * bug.  If we find one where that's not the case, we'll need to add a
-		 * workaround.
+		 * Must wait ... we use the polling interface determined at the top of
+		 * this file to do so.
 		 */
-#ifdef HAVE_POLL
+#if defined(LATCH_USE_POLL)
 		nfds = 0;
 		if (wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
 		{
@@ -396,8 +406,16 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 					result |= WL_POSTMASTER_DEATH;
 			}
 		}
-#else							/* !HAVE_POLL */
+#elif defined(LATCH_USE_SELECT)
 
+		/*
+		 * On at least older linux kernels select(), in violation of POSIX,
+		 * doesn't reliably return a socket as writable if closed - but we
+		 * rely on that. So far all the known cases of this problem are on
+		 * platforms that also provide a poll() implementation without that
+		 * bug.  If we find one where that's not the case, we'll need to add a
+		 * workaround.
+		 */
 		FD_ZERO(&input_mask);
 		FD_ZERO(&output_mask);
 
@@ -477,7 +495,7 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 					result |= WL_POSTMASTER_DEATH;
 			}
 		}
-#endif   /* HAVE_POLL */
+#endif   /* LATCH_USE_SELECT */
 
 		/* If we're not done, update cur_timeout for next iteration */
 		if (result == 0 && (wakeEvents & WL_TIMEOUT))
@@ -490,7 +508,7 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 				/* Timeout has expired, no need to continue looping */
 				result |= WL_TIMEOUT;
 			}
-#ifndef HAVE_POLL
+#ifdef LATCH_USE_SELECT
 			else
 			{
 				tv.tv_sec = cur_timeout / 1000L;
-- 
2.5.0.400.gff86faf.dirty

>From cd5a66b55a00ba70613cfbe45be758a64d2112f8 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 14 Jan 2016 14:24:09 +0100
Subject: [PATCH 2/4] Error out if waiting on socket readiness without a
 specified socket.

---
 src/backend/port/unix_latch.c  | 7 ++++---
 src/backend/port/win32_latch.c | 6 ++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index f52704b..ad621ea 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -242,9 +242,10 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 	int			hifd;
 #endif
 
-	/* Ignore WL_SOCKET_* events if no valid socket is given */
-	if (sock == PGINVALID_SOCKET)
-		wakeEvents &= ~(WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
+	/* waiting for socket readiness without a socket indicates a bug */
+	if (sock == PGINVALID_SOCKET &&
+		(wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) != 0)
+		elog(ERROR, "cannot wait on socket events without a socket");
 
 	Assert(wakeEvents != 0);	/* must have at least one wake event */
 
diff --git a/src/backend/port/win32_latch.c b/src/backend/port/win32_latch.c
index 80adc13..e101acf 100644
--- a/src/backend/port/win32_latch.c
+++ b/src/backend/port/win32_latch.c
@@ -119,8 +119,10 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 
 	Assert(wakeEvents != 0);	/* must have at least one wake event */
 
-	if ((wakeEvents & WL_LATCH_SET) && latch->owner_pid != MyProcPid)
-		elog(ERROR, "cannot wait on a latch owned by another process");
+	/* waiting for socket readiness without a socket indicates a bug */
+	if (sock == PGINVALID_SOCKET &&
+		(wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) != 0)
+		elog(ERROR, "cannot wait on socket events without a socket");
 
 	/*
 	 * Initialize timeout if requested.  We must record the current time so
-- 
2.5.0.400.gff86faf.dirty

>From 162f66f7fccc335d8caad7bb15be1c2030ec838e Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 14 Jan 2016 15:15:17 +0100
Subject: [PATCH 3/4] Only clear unix_latch.c's self-pipe if it actually
 contains data.

This avoids a good number of, individually quite fast, system calls in
scenarios with many quick queries. Besides the aesthetic benefit of
seeing fewer superfluous system calls with strace, it also improves
performance by ~2% measured by pgbench -M prepared -c 96 -j 8 -S (scale
100).
---
 src/backend/port/unix_latch.c | 77 ++++++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index ad621ea..03bca68 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -283,27 +283,27 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 	do
 	{
 		/*
-		 * Clear the pipe, then check if the latch is set already. If someone
-		 * sets the latch between this and the poll()/select() below, the
-		 * setter will write a byte to the pipe (or signal us and the signal
-		 * handler will do that), and the poll()/select() will return
-		 * immediately.
+		 * Check if the latch is set already. If so, leave loop immediately,
+		 * avoid blocking again. We don't attempt to report any other events
+		 * that might also be satisfied.
+		 *
+		 * If someone sets the latch between this and the poll()/select()
+		 * below, the setter will write a byte to the pipe (or signal us and
+		 * the signal handler will do that), and the poll()/select() will
+		 * return immediately.
+		 *
+		 * If there's a pending byte in the self pipe, we'll notice whenever
+		 * blocking. Only clearing the pipe in that case avoids having to
+		 * drain it every time WaitLatchOrSocket() is used.
 		 *
 		 * Note: we assume that the kernel calls involved in drainSelfPipe()
 		 * and SetLatch() will provide adequate synchronization on machines
 		 * with weak memory ordering, so that we cannot miss seeing is_set if
 		 * the signal byte is already in the pipe when we drain it.
 		 */
-		drainSelfPipe();
-
 		if ((wakeEvents & WL_LATCH_SET) && latch->is_set)
 		{
 			result |= WL_LATCH_SET;
-
-			/*
-			 * Leave loop immediately, avoid blocking again. We don't attempt
-			 * to report any other events that might also be satisfied.
-			 */
 			break;
 		}
 
@@ -313,24 +313,26 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		 */
 #if defined(LATCH_USE_POLL)
 		nfds = 0;
+
+		/* selfpipe is always in pfds[0] */
+		pfds[0].fd = selfpipe_readfd;
+		pfds[0].events = POLLIN;
+		pfds[0].revents = 0;
+		nfds++;
+
 		if (wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
 		{
-			/* socket, if used, is always in pfds[0] */
-			pfds[0].fd = sock;
-			pfds[0].events = 0;
+			/* socket, if used, is always in pfds[1] */
+			pfds[1].fd = sock;
+			pfds[1].events = 0;
 			if (wakeEvents & WL_SOCKET_READABLE)
-				pfds[0].events |= POLLIN;
+				pfds[1].events |= POLLIN;
 			if (wakeEvents & WL_SOCKET_WRITEABLE)
-				pfds[0].events |= POLLOUT;
-			pfds[0].revents = 0;
+				pfds[1].events |= POLLOUT;
+			pfds[1].revents = 0;
 			nfds++;
 		}
 
-		pfds[nfds].fd = selfpipe_readfd;
-		pfds[nfds].events = POLLIN;
-		pfds[nfds].revents = 0;
-		nfds++;
-
 		if (wakeEvents & WL_POSTMASTER_DEATH)
 		{
 			/* postmaster fd, if used, is always in pfds[nfds - 1] */
@@ -364,19 +366,26 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		else
 		{
 			/* at least one event occurred, so check revents values */
+
+			if (pfds[0].revents & POLLIN)
+			{
+				/* There's data in the self-pipe, clear it. */
+				drainSelfPipe();
+			}
+
 			if ((wakeEvents & WL_SOCKET_READABLE) &&
-				(pfds[0].revents & POLLIN))
+				(pfds[1].revents & POLLIN))
 			{
 				/* data available in socket, or EOF/error condition */
 				result |= WL_SOCKET_READABLE;
 			}
 			if ((wakeEvents & WL_SOCKET_WRITEABLE) &&
-				(pfds[0].revents & POLLOUT))
+				(pfds[1].revents & POLLOUT))
 			{
 				/* socket is writable */
 				result |= WL_SOCKET_WRITEABLE;
 			}
-			if (pfds[0].revents & (POLLHUP | POLLERR | POLLNVAL))
+			if (pfds[1].revents & (POLLHUP | POLLERR | POLLNVAL))
 			{
 				/* EOF/error condition */
 				if (wakeEvents & WL_SOCKET_READABLE)
@@ -468,6 +477,11 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		else
 		{
 			/* at least one event occurred, so check masks */
+			if (FD_ISSET(selfpipe_readfd, &input_mask))
+			{
+				/* There's data in the self-pipe, clear it. */
+				drainSelfPipe();
+			}
 			if ((wakeEvents & WL_SOCKET_READABLE) && FD_ISSET(sock, &input_mask))
 			{
 				/* data available in socket, or EOF */
@@ -498,6 +512,17 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		}
 #endif   /* LATCH_USE_SELECT */
 
+		/*
+		 * Check again whether the latch is set; the arrival of a signal/self-byte
+		 * might be what stopped our sleep. It's not required for correctness
+		 * to signal the latch as being set (we'd just loop if there's no
+		 * other event), but it seems good to report an arrived latch asap.
+		 */
+		if ((wakeEvents & WL_LATCH_SET) && latch->is_set)
+		{
+			result |= WL_LATCH_SET;
+		}
+
 		/* If we're not done, update cur_timeout for next iteration */
 		if (result == 0 && (wakeEvents & WL_TIMEOUT))
 		{
-- 
2.5.0.400.gff86faf.dirty

>From fe417866a7132b1ee65e2ed96f79fbaad7922435 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 14 Jan 2016 15:24:15 +0100
Subject: [PATCH 4/4] Support using epoll as the polling primitive in
 unix_latch.c.

epoll(2) has the advantage of being able to reuse the wait datastructure
from previous calls when waiting the next time, on the same
events. Especially when waiting on a socket used by many processes like
the postmaster_alive_fd, that's good for scalability.
---
 configure                     |   2 +-
 configure.in                  |   2 +-
 src/backend/port/unix_latch.c | 228 +++++++++++++++++++++++++++++++++++++++++-
 src/include/pg_config.h.in    |   3 +
 src/include/storage/latch.h   |   4 +
 5 files changed, 234 insertions(+), 5 deletions(-)

diff --git a/configure b/configure
index 3dd1b15..d65e0b4 100755
--- a/configure
+++ b/configure
@@ -10144,7 +10144,7 @@ fi
 ## Header files
 ##
 
-for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.in b/configure.in
index 9398482..d24b7e8 100644
--- a/configure.in
+++ b/configure.in
@@ -1163,7 +1163,7 @@ AC_SUBST(UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index 03bca68..5e0edf6 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -38,6 +38,9 @@
 #include <unistd.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#ifdef HAVE_SYS_EPOLL_H
+#include <sys/epoll.h>
+#endif
 #ifdef HAVE_POLL_H
 #include <poll.h>
 #endif
@@ -62,8 +65,10 @@
  * useful to manually specify the used primitive.  If desired, just add a
  * define somewhere before this block.
  */
-#if defined(LATCH_USE_POLL) || defined(LATCH_USE_SELECT)
+#if defined(LATCH_USE_EPOLL) || defined(LATCH_USE_POLL) || defined(LATCH_USE_SELECT)
 /* don't overwrite manual choice */
+#elif defined(HAVE_SYS_EPOLL_H)
+#define LATCH_USE_EPOLL
 #elif defined(HAVE_POLL)
 #define LATCH_USE_POLL
 #elif HAVE_SYS_SELECT_H
@@ -82,6 +87,9 @@ static int	selfpipe_writefd = -1;
 /* Private function prototypes */
 static void sendSelfPipeByte(void);
 static void drainSelfPipe(void);
+#ifdef LATCH_USE_EPOLL
+static void initEpoll(volatile Latch *latch);
+#endif
 
 
 /*
@@ -127,6 +135,10 @@ InitLatch(volatile Latch *latch)
 	latch->is_set = false;
 	latch->owner_pid = MyProcPid;
 	latch->is_shared = false;
+
+#ifdef LATCH_USE_EPOLL
+	initEpoll(latch);
+#endif
 }
 
 /*
@@ -174,6 +186,10 @@ OwnLatch(volatile Latch *latch)
 		elog(ERROR, "latch already owned");
 
 	latch->owner_pid = MyProcPid;
+
+#ifdef LATCH_USE_EPOLL
+	initEpoll(latch);
+#endif
 }
 
 /*
@@ -186,6 +202,14 @@ DisownLatch(volatile Latch *latch)
 	Assert(latch->owner_pid == MyProcPid);
 
 	latch->owner_pid = 0;
+
+#ifdef LATCH_USE_EPOLL
+	if (latch->epollfd >= 0)
+	{
+		close(latch->epollfd);
+		latch->epollfd = -1;
+	}
+#endif
 }
 
 /*
@@ -231,7 +255,9 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 				cur_time;
 	long		cur_timeout;
 
-#if defined(LATCH_USE_POLL)
+#if defined(LATCH_USE_EPOLL)
+	struct epoll_event events[1];
+#elif defined(LATCH_USE_POLL)
 	struct pollfd pfds[3];
 	int			nfds;
 #elif defined(LATCH_USE_SELECT)
@@ -311,7 +337,175 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		 * Must wait ... we use the polling interface determined at the top of
 		 * this file to do so.
 		 */
-#if defined(LATCH_USE_POLL)
+#if defined(LATCH_USE_EPOLL)
+		if (wakeEvents != latch->lastmask || latch->lastwatchfd != sock)
+		{
+			bool sockfd_changed = latch->lastwatchfd != sock;
+
+			if (latch->lastwatchfd != -1 && sockfd_changed)
+			{
+				struct epoll_event data;
+
+				/*
+				 * Unnecessarily pass data for delete due to a bug erroneously
+				 * requiring it in the past.
+				 */
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_DEL,
+							   latch->lastwatchfd, &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+
+				latch->lastwatchfd = -1;
+			}
+
+			if (sock != -1 && sockfd_changed)
+			{
+				struct epoll_event data;
+				data.events = 0;
+				data.data.fd = sock;
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_ADD, sock, &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+
+				latch->lastwatchfd = sock;
+			}
+
+			if (sock != -1 && (
+					sockfd_changed ||
+					(wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) !=
+					(latch->lastmask & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))))
+			{
+				struct epoll_event data;
+
+				data.events = EPOLLRDHUP | EPOLLERR | EPOLLHUP;
+				data.data.fd = sock;
+
+				if (wakeEvents & WL_SOCKET_READABLE)
+					data.events |= EPOLLIN;
+				if (wakeEvents & WL_SOCKET_WRITEABLE)
+					data.events |= EPOLLOUT;
+
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_MOD, sock, &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+			}
+
+			if ((latch->lastmask & WL_POSTMASTER_DEATH) &&
+				!(wakeEvents & WL_POSTMASTER_DEATH))
+			{
+				struct epoll_event data;
+
+				/*
+				 * Unnecessarily pass data for delete due to a bug erroneously
+				 * requiring it in the past.
+				 */
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_DEL,
+							   postmaster_alive_fds[POSTMASTER_FD_WATCH],
+							   &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+			}
+
+
+			if (!(latch->lastmask & WL_POSTMASTER_DEATH) &&
+				(wakeEvents & WL_POSTMASTER_DEATH))
+			{
+				struct epoll_event data;
+
+				data.events = EPOLLIN | EPOLLHUP | EPOLLRDHUP | EPOLLERR;
+				data.data.fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
+
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_ADD,
+							   postmaster_alive_fds[POSTMASTER_FD_WATCH],
+							   &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+			}
+
+			latch->lastmask = wakeEvents;
+		}
+
+		rc = epoll_wait(latch->epollfd, events, 1, cur_timeout);
+		if (rc < 0)
+		{
+			/* EINTR is okay, otherwise complain */
+			if (errno != EINTR)
+			{
+				waiting = false;
+				ereport(ERROR,
+						(errcode_for_socket_access(),
+						 errmsg("epoll_wait() failed: %m")));
+			}
+		}
+		else if (rc == 0)
+		{
+			/* timeout exceeded */
+			if (wakeEvents & WL_TIMEOUT)
+				result |= WL_TIMEOUT;
+		}
+		else
+		{
+			if (events[0].data.fd == sock)
+			{
+				/* data available in socket */
+				if (events[0].events & EPOLLIN)
+					result |= WL_SOCKET_READABLE;
+
+				/* socket is writable */
+				if (events[0].events & EPOLLOUT)
+					result |= WL_SOCKET_WRITEABLE;
+
+				/* EOF/error condition */
+				if (events[0].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP))
+				{
+					if (wakeEvents & WL_SOCKET_READABLE)
+						result |= WL_SOCKET_READABLE;
+					if (wakeEvents & WL_SOCKET_WRITEABLE)
+						result |= WL_SOCKET_WRITEABLE;
+				}
+			}
+
+			if (events[0].data.fd == postmaster_alive_fds[POSTMASTER_FD_WATCH] &&
+				events[0].events & (EPOLLIN | EPOLLHUP | EPOLLERR | EPOLLRDHUP))
+			{
+				/* check comment for the corresponding LATCH_USE_POLL case */
+				Assert(!PostmasterIsAlive());
+				result |= WL_POSTMASTER_DEATH;
+			}
+
+			if (events[0].data.fd == selfpipe_readfd &&
+				events[0].events & EPOLLIN)
+			{
+				/* There's data in the self-pipe, clear it. */
+				drainSelfPipe();
+			}
+		}
+#elif defined(LATCH_USE_POLL)
 		nfds = 0;
 
 		/* selfpipe is always in pfds[0] */
@@ -725,3 +919,31 @@ drainSelfPipe(void)
 		/* else buffer wasn't big enough, so read again */
 	}
 }
+
+#ifdef LATCH_USE_EPOLL
+/*
+ * Create the epoll fd used to wait for readiness. Needs to be called whenever
+ * owning a latch, be it a shared or a backend-local one.
+ */
+static void
+initEpoll(volatile Latch *latch)
+{
+	struct epoll_event data;
+	int rc;
+
+	/* one each for selfpipe, socket, postmaster alive fd */
+	latch->epollfd = epoll_create(3);
+	if (latch->epollfd < 0)
+		elog(FATAL, "epoll_create failed: %m");
+
+	/* always want to be notified of writes into the self-pipe */
+	data.events = EPOLLIN;
+	data.data.fd = selfpipe_readfd;
+	rc = epoll_ctl(latch->epollfd, EPOLL_CTL_ADD, selfpipe_readfd, &data);
+	if (rc < 0)
+		elog(FATAL, "epoll_ctl failed: %m");
+
+	latch->lastwatchfd = -1;
+	latch->lastmask = 0;
+}
+#endif
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 16a272e..0fc4ce2 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -530,6 +530,9 @@
 /* Define to 1 if you have the syslog interface. */
 #undef HAVE_SYSLOG
 
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+#undef HAVE_SYS_EPOLL_H
+
 /* Define to 1 if you have the <sys/ioctl.h> header file. */
 #undef HAVE_SYS_IOCTL_H
 
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index e77491e..3666352 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -92,6 +92,10 @@ typedef struct Latch
 	int			owner_pid;
 #ifdef WIN32
 	HANDLE		event;
+#elif defined(HAVE_SYS_EPOLL_H)
+	int			epollfd;
+	int			lastwatchfd;
+	int			lastmask;
 #endif
 } Latch;
 
-- 
2.5.0.400.gff86faf.dirty

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to