From 4bfc80ac0a43131d3c8923a16a69ac7b46ddc6f7 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@enterprisedb.com>
Date: Thu, 22 Jun 2017 18:40:27 +1200
Subject: [PATCH] Add kqueue(2) support for WaitEventSet.

Add a kqueue(2) based implementation of WaitEventSet for use on BSD
family systems including FreeBSD, NetBSD, OpenBSD and macOS.  This
is similar to the epoll(2) implementation for Linux.

Unlike the epoll(2) implementation, the kqueue(2) implementation
doesn't need to use the postmaster_alive_fds pipe.  Instead, it
requests notification of process exit.

Author: Thomas Munro
Reviewed-By: Andres Freund, Marko Tiikkaja
Tested-By: Mateusz Guzik, Matteo Beccati, Keith Fiske, Heikki Linnakangas
Discussion: https://postgr.es/m/CAEepm%3D37oF84-iXDTQ9MrGjENwVGds%2B5zTr38ca73kWR7ez_tA%40mail.gmail.com
---
 configure                       |   4 +-
 configure.in                    |   2 +
 src/backend/storage/ipc/latch.c | 298 +++++++++++++++++++++++++++++++-
 src/include/pg_config.h.in      |   6 +
 src/include/pg_config.h.win32   |   9 +
 5 files changed, 316 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 9de50377ff..e3dd9b8191 100755
--- a/configure
+++ b/configure
@@ -12758,7 +12758,7 @@ $as_echo "#define HAVE_STDBOOL_H 1" >>confdefs.h
 fi
 
 
-for ac_header in atomic.h copyfile.h execinfo.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h sys/epoll.h sys/ipc.h sys/prctl.h sys/procctl.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/sockio.h sys/tas.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h copyfile.h execinfo.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h sys/epoll.h sys/event.h sys/ipc.h sys/prctl.h sys/procctl.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/sockio.h sys/tas.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -14994,7 +14994,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols cbrt clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memset_s memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale utime utimes wcstombs_l
+for ac_func in backtrace_symbols cbrt clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale utime utimes wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.in b/configure.in
index 9c5e5e7f8c..afd069f321 100644
--- a/configure.in
+++ b/configure.in
@@ -1287,6 +1287,7 @@ AC_CHECK_HEADERS(m4_normalize([
 	mbarrier.h
 	poll.h
 	sys/epoll.h
+	sys/event.h
 	sys/ipc.h
 	sys/prctl.h
 	sys/procctl.h
@@ -1627,6 +1628,7 @@ AC_CHECK_FUNCS(m4_normalize([
 	getifaddrs
 	getpeerucred
 	getrlimit
+	kqueue
 	mbstowcs_l
 	memset_s
 	memmove
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 2426cbcf8e..3e79525a3c 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -39,6 +39,9 @@
 #ifdef HAVE_SYS_EPOLL_H
 #include <sys/epoll.h>
 #endif
+#ifdef HAVE_SYS_EVENT_H
+#include <sys/event.h>
+#endif
 #ifdef HAVE_POLL_H
 #include <poll.h>
 #endif
@@ -60,10 +63,12 @@
  * define somewhere before this block.
  */
 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
-	defined(WAIT_USE_WIN32)
+	defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
 /* don't overwrite manual choice */
 #elif defined(HAVE_SYS_EPOLL_H)
 #define WAIT_USE_EPOLL
+#elif defined(HAVE_KQUEUE)
+#define WAIT_USE_KQUEUE
 #elif defined(HAVE_POLL)
 #define WAIT_USE_POLL
 #elif WIN32
@@ -93,6 +98,9 @@ struct WaitEventSet
 	Latch	   *latch;
 	int			latch_pos;
 
+	/* A pointer to an event deferred until the next wait. */
+	WaitEvent  *pending_event;
+
 	/*
 	 * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
 	 * is set so that we'll exit immediately if postmaster death is detected,
@@ -104,6 +112,10 @@ struct WaitEventSet
 	int			epoll_fd;
 	/* epoll_wait returns events in a user provided arrays, allocate once */
 	struct epoll_event *epoll_ret_events;
+#elif defined(WAIT_USE_KQUEUE)
+	int			kqueue_fd;
+	/* kevent returns events in a user provided arrays, allocate once */
+	struct kevent *kqueue_ret_events;
 #elif defined(WAIT_USE_POLL)
 	/* poll expects events to be waited on every poll() call, prepare once */
 	struct pollfd *pollfds;
@@ -136,6 +148,8 @@ static void drainSelfPipe(void);
 
 #if defined(WAIT_USE_EPOLL)
 static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
+#elif defined(WAIT_USE_KQUEUE)
+static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
 #elif defined(WAIT_USE_POLL)
 static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
 #elif defined(WAIT_USE_WIN32)
@@ -556,6 +570,8 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 
 #if defined(WAIT_USE_EPOLL)
 	sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+	sz += MAXALIGN(sizeof(struct kevent) * nevents);
 #elif defined(WAIT_USE_POLL)
 	sz += MAXALIGN(sizeof(struct pollfd) * nevents);
 #elif defined(WAIT_USE_WIN32)
@@ -574,6 +590,9 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 #if defined(WAIT_USE_EPOLL)
 	set->epoll_ret_events = (struct epoll_event *) data;
 	data += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+	set->kqueue_ret_events = (struct kevent *) data;
+	data += MAXALIGN(sizeof(struct kevent) * nevents);
 #elif defined(WAIT_USE_POLL)
 	set->pollfds = (struct pollfd *) data;
 	data += MAXALIGN(sizeof(struct pollfd) * nevents);
@@ -599,6 +618,12 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 	if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
 		elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
 #endif							/* EPOLL_CLOEXEC */
+#elif defined(WAIT_USE_KQUEUE)
+	set->kqueue_fd = kqueue();
+	if (set->kqueue_fd < 0)
+		elog(ERROR, "kqueue failed: %m");
+	if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
+		elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
 #elif defined(WAIT_USE_WIN32)
 
 	/*
@@ -631,6 +656,8 @@ FreeWaitEventSet(WaitEventSet *set)
 {
 #if defined(WAIT_USE_EPOLL)
 	close(set->epoll_fd);
+#elif defined(WAIT_USE_KQUEUE)
+	close(set->kqueue_fd);
 #elif defined(WAIT_USE_WIN32)
 	WaitEvent  *cur_event;
 
@@ -747,6 +774,8 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 	/* perform wait primitive specific initialization, if needed */
 #if defined(WAIT_USE_EPOLL)
 	WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
+#elif defined(WAIT_USE_KQUEUE)
+	WaitEventAdjustKqueue(set, event, 0);
 #elif defined(WAIT_USE_POLL)
 	WaitEventAdjustPoll(set, event);
 #elif defined(WAIT_USE_WIN32)
@@ -766,10 +795,16 @@ void
 ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 {
 	WaitEvent  *event;
+#if defined(WAIT_USE_KQUEUE)
+	int old_events;
+#endif
 
 	Assert(pos < set->nevents);
 
 	event = &set->events[pos];
+#if defined(WAIT_USE_KQUEUE)
+	old_events = event->events;
+#endif
 
 	/*
 	 * If neither the event mask nor the associated latch changes, return
@@ -803,6 +838,8 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 
 #if defined(WAIT_USE_EPOLL)
 	WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
+#elif defined(WAIT_USE_KQUEUE)
+	WaitEventAdjustKqueue(set, event, old_events);
 #elif defined(WAIT_USE_POLL)
 	WaitEventAdjustPoll(set, event);
 #elif defined(WAIT_USE_WIN32)
@@ -895,6 +932,129 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
 }
 #endif
 
+#if defined(WAIT_USE_KQUEUE)
+
+/*
+ * On most BSD family systems, the udata member of struct kevent is of type
+ * void *, so we could directly convert to/from WaitEvent *.  Unfortunately,
+ * NetBSD has it as intptr_t, so here we wallpaper over that difference with
+ * an lvalue cast.
+ */
+#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
+
+static inline void
+WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
+						 WaitEvent *event)
+{
+	k_ev->ident = event->fd;
+	k_ev->filter = filter;
+	k_ev->flags = action | EV_CLEAR;
+	k_ev->fflags = 0;
+	k_ev->data = 0;
+	AccessWaitEvent(k_ev) = event;
+}
+
+static inline void
+WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
+{
+	/* For now postmaster death can only be added, not removed. */
+	k_ev->ident = PostmasterPid;
+	k_ev->filter = EVFILT_PROC;
+	k_ev->flags = EV_ADD | EV_CLEAR;
+	k_ev->fflags = NOTE_EXIT;
+	k_ev->data = 0;
+	AccessWaitEvent(k_ev) = event;
+}
+
+/*
+ * old_events is the previous event mask, used to compute what has changed.
+ */
+static void
+WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
+{
+	int rc;
+	struct kevent k_ev[2];
+	int count = 0;
+	bool new_filt_read = false;
+	bool old_filt_read = false;
+	bool new_filt_write = false;
+	bool old_filt_write = false;
+
+	if (old_events == event->events)
+		return;
+
+	Assert(event->events != WL_LATCH_SET || set->latch != NULL);
+	Assert(event->events == WL_LATCH_SET ||
+		   event->events == WL_POSTMASTER_DEATH ||
+		   (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+
+	if (event->events == WL_POSTMASTER_DEATH)
+	{
+		/*
+		 * Unlike all the other implementations, we detect postmaster death
+		 * using process notification instead of waiting on the postmaster
+		 * alive pipe.
+		 */
+		WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
+	}
+	else
+	{
+		/*
+		 * We need to compute the adds and deletes required to get from the
+		 * old event mask to the new event mask, since kevent treats readable
+		 * and writable as separate events.
+		 */
+		if (old_events == WL_LATCH_SET ||
+			(old_events & WL_SOCKET_READABLE))
+				old_filt_read = true;
+		if (event->events == WL_LATCH_SET ||
+			(event->events & WL_SOCKET_READABLE))
+			new_filt_read = true;
+		if (old_events & WL_SOCKET_WRITEABLE)
+			old_filt_write = true;
+		if (event->events & WL_SOCKET_WRITEABLE)
+			new_filt_write = true;
+		if (old_filt_read && !new_filt_read)
+			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
+									 event);
+		else if (!old_filt_read && new_filt_read)
+			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
+									 event);
+		if (old_filt_write && !new_filt_write)
+			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
+									 event);
+		else if (!old_filt_write && new_filt_write)
+			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
+									 event);
+	}
+
+	Assert(count > 0);
+	Assert(count <= 2);
+
+	rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
+
+	if (rc < 0)
+	{
+		if (event->events == WL_POSTMASTER_DEATH && errno == ESRCH)
+		{
+			/*
+			 * The postmaster is already dead.  Defer reporting this to the
+			 * caller until wait time, for compatibility with the other
+			 * implementations.
+			 */
+			set->pending_event = event;
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode_for_socket_access(),
+					 errmsg("kevent() failed: %m")));
+		}
+	}
+}
+
+#endif
+
 #if defined(WAIT_USE_WIN32)
 static void
 WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
@@ -961,6 +1121,14 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
 
 	Assert(nevents > 0);
 
+	/* Return an event that was deferred until the next wait. */
+	if (unlikely(set->pending_event))
+	{
+		occurred_events[0] = *set->pending_event;
+		set->pending_event = NULL;
+		return 1;
+	}
+
 	/*
 	 * Initialize timeout if requested.  We must record the current time so
 	 * that we can determine the remaining timeout if interrupted.
@@ -1186,6 +1354,134 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
 	return returned_events;
 }
 
+#elif defined(WAIT_USE_KQUEUE)
+
+/*
+ * Wait using FreeBSD kqueue(2)/kevent(2).  Also available on other BSD-family
+ * systems including macOS.
+ *
+ * This is the preferrable wait method for systems that have it, as several
+ * readiness notifications are delivered, without having to iterate through
+ * all of set->events.
+ *
+ * For now this mirrors the epoll code, but in future it could modify the fd
+ * set in the same call to kevent as it uses for waiting instead of doing that
+ * with separate system calls.
+ */
+static int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+					  WaitEvent *occurred_events, int nevents)
+{
+	int			returned_events = 0;
+	int			rc;
+	WaitEvent  *cur_event;
+	struct kevent *cur_kqueue_event;
+	struct timespec timeout;
+	struct timespec *timeout_p;
+
+	if (cur_timeout < 0)
+		timeout_p = NULL;
+	else
+	{
+		timeout.tv_sec = cur_timeout / 1000;
+		timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
+		timeout_p = &timeout;
+	}
+
+	/* Sleep */
+	rc = kevent(set->kqueue_fd, NULL, 0,
+				set->kqueue_ret_events, nevents,
+				timeout_p);
+
+	/* Check return code */
+	if (rc < 0)
+	{
+		/* EINTR is okay, otherwise complain */
+		if (errno != EINTR)
+		{
+			waiting = false;
+			ereport(ERROR,
+					(errcode_for_socket_access(),
+					 errmsg("kevent() failed while trying to wait: %m")));
+		}
+		return 0;
+	}
+	else if (rc == 0)
+	{
+		/* timeout exceeded */
+		return -1;
+	}
+
+	/*
+	 * At least one event occurred, iterate over the returned kqueue events
+	 * until they're either all processed, or we've returned all the events
+	 * the caller desired.
+	 */
+	for (cur_kqueue_event = set->kqueue_ret_events;
+		 cur_kqueue_event < (set->kqueue_ret_events + rc) &&
+		 returned_events < nevents;
+		 cur_kqueue_event++)
+	{
+		/* kevent's udata points to the associated WaitEvent */
+		cur_event = AccessWaitEvent(cur_kqueue_event);
+
+		occurred_events->pos = cur_event->pos;
+		occurred_events->user_data = cur_event->user_data;
+		occurred_events->events = 0;
+
+		if (cur_event->events == WL_LATCH_SET &&
+			cur_kqueue_event->filter == EVFILT_READ)
+		{
+			/* There's data in the self-pipe, clear it. */
+			drainSelfPipe();
+
+			if (set->latch->is_set)
+			{
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_LATCH_SET;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events == WL_POSTMASTER_DEATH &&
+				 cur_kqueue_event->filter == EVFILT_PROC &&
+				 (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
+		{
+			occurred_events->fd = PGINVALID_SOCKET;
+			occurred_events->events = WL_POSTMASTER_DEATH;
+			occurred_events++;
+			returned_events++;
+		}
+		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+		{
+			Assert(cur_event->fd >= 0);
+
+			if ((cur_event->events & WL_SOCKET_READABLE) &&
+				(cur_kqueue_event->filter == EVFILT_READ))
+			{
+				/* readable, or EOF */
+				occurred_events->events |= WL_SOCKET_READABLE;
+			}
+
+			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+				(cur_kqueue_event->filter == EVFILT_WRITE))
+			{
+				/* writable, or EOF */
+				occurred_events->events |= WL_SOCKET_WRITEABLE;
+			}
+
+			if (occurred_events->events != 0)
+			{
+				occurred_events->fd = cur_event->fd;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+	}
+
+	return returned_events;
+}
+
 #elif defined(WAIT_USE_POLL)
 
 /*
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 050c48b108..54ac286e9b 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -331,6 +331,9 @@
 /* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
 #undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
 
+/* Define to 1 if you have the `kqueue' function. */
+#undef HAVE_KQUEUE
+
 /* Define to 1 if you have the <langinfo.h> header file. */
 #undef HAVE_LANGINFO_H
 
@@ -602,6 +605,9 @@
 /* Define to 1 if you have the <sys/epoll.h> header file. */
 #undef HAVE_SYS_EPOLL_H
 
+/* Define to 1 if you have the <sys/event.h> header file. */
+#undef HAVE_SYS_EVENT_H
+
 /* Define to 1 if you have the <sys/ipc.h> header file. */
 #undef HAVE_SYS_IPC_H
 
diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32
index 808f5abcdb..dd261fc65a 100644
--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@@ -231,6 +231,9 @@
 /* Define to 1 if you have isinf(). */
 #define HAVE_ISINF 1
 
+/* Define to 1 if you have the `kqueue' function. */
+#undef HAVE_KQUEUE
+
 /* Define to 1 if you have the <langinfo.h> header file. */
 /* #undef HAVE_LANGINFO_H */
 
@@ -443,6 +446,12 @@
 /* Define to 1 if you have the syslog interface. */
 /* #undef HAVE_SYSLOG */
 
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+/* #undef HAVE_SYS_EPOLL_H */
+
+/* Define to 1 if you have the <sys/event.h> header file. */
+/* #undef HAVE_SYS_EVENT_H */
+
 /* Define to 1 if you have the <sys/ipc.h> header file. */
 /* #undef HAVE_SYS_IPC_H */
 
-- 
2.23.0

