From: Anton Ivanov <anton.iva...@cambridgegreys.com>
Switch FDs that are marked as persistent in persistent
poll loops to use epoll instead of poll.
Signed-off-by: Anton Ivanov <anton.iva...@cambridgegreys.com>
---
lib/poll-loop.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++-
lib/timeval.c | 86 ++++++++++++++++++++++++++++++++++++++++
lib/timeval.h | 7 ++++
3 files changed, 194 insertions(+), 2 deletions(-)
diff --git a/lib/poll-loop.c b/lib/poll-loop.c
index 68e44eba2..5ce5977d5 100644
--- a/lib/poll-loop.c
+++ b/lib/poll-loop.c
@@ -38,6 +38,14 @@ VLOG_DEFINE_THIS_MODULE(poll_loop);
COVERAGE_DEFINE(poll_create_node);
COVERAGE_DEFINE(poll_zero_timeout);
+#define MAX_EPOLL_EVENTS 64
+
+#ifdef __linux__
+#define USE_EPOLL
+#include <errno.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#endif
+
struct poll_node {
struct hmap_node hmap_node;
struct pollfd pollfd; /* Events to pass to time_poll(). */
@@ -45,7 +53,6 @@ struct poll_node {
const char *where; /* Where poll_node was created. */
bool valid; /* Marked invalid if we got a HUP/NVAL from
poll */
};
-
struct poll_loop {
/* All active poll waiters. */
struct hmap poll_nodes;
@@ -55,10 +62,52 @@ struct poll_loop {
long long int timeout_when; /* In msecs as returned by time_msec(). */
const char *timeout_where; /* Where 'timeout_when' was set. */
bool persist;
+#ifdef USE_EPOLL
+ int epoll_fd;
+ struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+#endif
};
static struct poll_loop *poll_loop(void);
+#ifdef USE_EPOLL
+/* Converts a poll() 'events' request mask into the event mask to register
+ * with epoll_ctl().
+ *
+ * POLLIN and POLLOUT become EPOLLIN and EPOLLOUT.  POLLERR is not
+ * translated.  A request for POLLHUP or POLLNVAL becomes
+ * EPOLLHUP | EPOLLRDHUP.  Returns the resulting epoll mask. */
+static inline int poll_to_epoll_events(short events) {
+    int epoll_mask = 0;
+
+    epoll_mask |= (events & POLLIN) ? EPOLLIN : 0;
+    epoll_mask |= (events & POLLOUT) ? EPOLLOUT : 0;
+
+    /* epoll always listens for errors, so POLLERR needs no translation.
+     * epoll has separate HUP and RDHUP bits and no NVAL, so a request for
+     * either POLLHUP or POLLNVAL asks for both hangup bits. */
+    if (events & (POLLHUP | POLLNVAL)) {
+        epoll_mask |= EPOLLHUP | EPOLLRDHUP;
+    }
+    return epoll_mask;
+}
+
+/* Converts the 'events' mask reported by epoll_wait() into the equivalent
+ * poll() 'revents' mask.
+ *
+ * EPOLLIN and EPOLLOUT become POLLIN and POLLOUT.  EPOLLERR becomes POLLERR:
+ * epoll reports errors whether or not they were requested, so the bit has to
+ * be carried over here or the caller would see a wakeup with no indication
+ * of the error.  EPOLLHUP and EPOLLRDHUP are both folded into POLLHUP.
+ * epoll has no counterpart of POLLNVAL, so that bit is never set.
+ *
+ * Returns the resulting poll mask. */
+static inline short epoll_to_poll_events(int events) {
+    short ret = 0;
+
+    if (events & EPOLLIN) {
+        ret |= POLLIN;
+    }
+    if (events & EPOLLOUT) {
+        ret |= POLLOUT;
+    }
+    if (events & EPOLLERR) {
+        ret |= POLLERR;
+    }
+    if (events & (EPOLLHUP | EPOLLRDHUP)) {
+        ret |= POLLHUP;
+    }
+    return ret;
+}
+#endif
+
/* Look up the node with same fd or wevent. */
static struct poll_node *
find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
@@ -106,6 +155,9 @@ static struct poll_node
{
struct poll_loop *loop = poll_loop();
struct poll_node *node;
+#ifdef USE_EPOLL
+ struct epoll_event event;
+#endif
COVERAGE_INC(poll_create_node);
@@ -115,6 +167,13 @@ static struct poll_node
/* Check for duplicate. If found, "or" the events. */
node = find_poll_node(loop, fd, wevent);
if (node) {
+#ifdef USE_EPOLL
+ if (loop->persist && (node->pollfd.events != events)) {
+ event.events = poll_to_epoll_events(node->pollfd.events | events);
+ event.data.ptr = node;
+ epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, fd, &event);
+ }
+#endif
node->pollfd.events |= events;
} else {
node = xzalloc(sizeof *node);
@@ -130,6 +189,13 @@ static struct poll_node
node->wevent = wevent;
node->where = where;
node->valid = true;
+#ifdef USE_EPOLL
+ if (loop->persist) {
+ event.events = poll_to_epoll_events(events);
+ event.data.ptr = node;
+ epoll_ctl(loop->epoll_fd, EPOLL_CTL_ADD, fd, &event);
+ }
+#endif
}
return node;
}
@@ -186,6 +252,11 @@ poll_fd_deregister_at(int fd, const char *where) {
node = find_poll_node(loop, fd, 0);
if (node) {
+#ifdef USE_EPOLL
+ if (loop->persist) {
+ epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
+ }
+#endif
hmap_remove(&loop->poll_nodes, &node->hmap_node);
}
}
@@ -344,6 +415,11 @@ free_poll_nodes(struct poll_loop *loop)
HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
hmap_remove(&loop->poll_nodes, &node->hmap_node);
+#ifdef USE_EPOLL
+ if (loop->persist) {
+ epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
+ }
+#endif
#ifdef _WIN32
if (node->wevent && node->pollfd.fd) {
WSAEventSelect(node->pollfd.fd, NULL, 0);
@@ -455,6 +531,7 @@ persist_poll_block(struct poll_loop *loop)
/* Populate with all the fds and events. */
counter = 0;
+#ifndef USE_EPOLL
HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
if (node->pollfd.events && node->valid) {
pollfds[counter] = node->pollfd;
@@ -478,6 +555,12 @@ persist_poll_block(struct poll_loop *loop)
retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,
loop->timeout_when, &elapsed);
+#else
+ retval = time_epoll_wait(loop->epoll_fd,
+ (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS,
loop->timeout_when, &elapsed);
+ counter = retval;
+#endif
+
if (retval < 0) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
@@ -485,7 +568,13 @@ persist_poll_block(struct poll_loop *loop)
log_wakeup(loop->timeout_where, NULL, elapsed);
} else {
for (i = 0; i < counter; i++) {
+#ifdef USE_EPOLL
+ node = loop->epoll_events[i].data.ptr;
+ pollfds[i] = node->pollfd;
+ pollfds[i].revents =
epoll_to_poll_events(loop->epoll_events[i].events);
+#else
node = find_poll_node(loop, pollfds[i].fd, 0);
+#endif
if (!node) {
VLOG_FATAL("poll: persistence state corrupted, no hash entry for
%d", pollfds[i].fd);
}
@@ -546,12 +635,19 @@ free_poll_loop(void *loop_)
free_poll_nodes(loop);
hmap_destroy(&loop->poll_nodes);
+#ifdef USE_EPOLL
+    /* Close the epoll instance while 'loop' is still valid: reading
+     * 'loop->epoll_fd' after free(loop) would be a use-after-free.  The fd
+     * is negative when no epoll instance was ever created. */
+    if (loop->epoll_fd >= 0) {
+        close(loop->epoll_fd);
+    }
+#endif
free(loop);
}
void poll_enable_persist(void) {
struct poll_loop *loop = poll_loop();
-
loop->persist = true;
+#ifdef USE_EPOLL
+ loop->epoll_fd = epoll_create(MAX_EPOLL_EVENTS);
+#endif
}
static struct poll_loop *
@@ -573,6 +669,9 @@ poll_loop(void)
hmap_init(&loop->poll_nodes);
xpthread_setspecific(key, loop);
loop->persist = false;
+#ifdef USE_EPOLL
+ loop->epoll_fd = -1;
+#endif
}
return loop;
}
diff --git a/lib/timeval.c b/lib/timeval.c
index 193c7bab1..6b1f1cf5a 100644
--- a/lib/timeval.c
+++ b/lib/timeval.c
@@ -38,6 +38,9 @@
#include "unixctl.h"
#include "util.h"
#include "openvswitch/vlog.h"
+#ifdef __linux__
+#include <sys/epoll.h>
+#endif
VLOG_DEFINE_THIS_MODULE(timeval);
@@ -270,6 +273,89 @@ time_alarm(unsigned int secs)
deadline = now < LLONG_MAX - msecs ? now + msecs : LLONG_MAX;
}
+#ifdef __linux__
+
+/* Like epoll_wait(), except:
+ *
+ * - The timeout is specified as an absolute time, as defined by
+ * time_msec(), instead of a duration.
+ *
+ * - On error, returns a negative error code (instead of setting errno).
+ *
+ * - If interrupted by a signal, retries automatically until the original
+ * timeout is reached. (Because of this property, this function will
+ * never return -EINTR.)
+ *
+ * - The wait never extends past 'deadline', the alarm time set by
+ * time_alarm(). Once that time has passed, fatal_signal_handler(SIGALRM)
+ * is invoked and a pending negative result is replaced by 0.
+ *
+ * - Unless ovsrcu_is_quiescent() already reports true, each wait is
+ * bracketed by ovsrcu_quiesce_start()/ovsrcu_quiesce_end(), or preceded
+ * by a single ovsrcu_quiesce() when the computed timeout is zero.
+ *
+ * 'epoll_fd', 'events' and 'max' are passed unchanged to epoll_wait().
+ * On success returns epoll_wait()'s nonnegative result.
+ *
+ * Stores the number of milliseconds elapsed during the wait in '*elapsed'. */
+int
+time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
+ long long int timeout_when, int *elapsed)
+{
+ long long int *last_wakeup = last_wakeup_get();
+ long long int start;
+ bool quiescent;
+ int retval = 0;
+
+ time_init();
+ coverage_clear();
+ coverage_run();
+ if (*last_wakeup && !thread_is_pmd()) {
+ log_poll_interval(*last_wakeup);
+ }
+ start = time_msec(); /* Reference point for '*elapsed'. */
+
+ timeout_when = MIN(timeout_when, deadline); /* Cap the wait at the alarm deadline. */
+ quiescent = ovsrcu_is_quiescent();
+
+ for (;;) {
+ long long int now = time_msec();
+ int time_left; /* Relative timeout in ms for epoll_wait(), clamped to [0, INT_MAX]. */
+
+ if (now >= timeout_when) {
+ time_left = 0;
+ } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
+ time_left = INT_MAX;
+ } else {
+ time_left = timeout_when - now;
+ }
+
+ if (!quiescent) { /* Only touch the quiescent state if the caller had not already entered it. */
+ if (!time_left) {
+ ovsrcu_quiesce();
+ } else {
+ ovsrcu_quiesce_start();
+ }
+ }
+
+ retval = epoll_wait(epoll_fd, events, max, time_left);
+ if (retval < 0) {
+ retval = -errno; /* Report failures as a negated errno value. */
+ }
+
+ if (!quiescent && time_left) { /* Pairs with ovsrcu_quiesce_start() above, which is only called for a nonzero timeout. */
+ ovsrcu_quiesce_end();
+ }
+
+ if (deadline <= time_msec()) { /* The alarm deadline has passed. */
+ fatal_signal_handler(SIGALRM);
+ if (retval < 0) {
+ retval = 0; /* Any error, including -EINTR, is reported as 0 in this case. */
+ }
+ break;
+ }
+
+ if (retval != -EINTR) { /* Only an interrupted wait is retried; every other result ends the loop. */
+ break;
+ }
+ }
+ *last_wakeup = time_msec();
+ refresh_rusage();
+ *elapsed = *last_wakeup - start;
+ return retval;
+}
+#endif
+
+
/* Like poll(), except:
*
* - The timeout is specified as an absolute time, as defined by
diff --git a/lib/timeval.h b/lib/timeval.h
index 502f703d4..d640eab17 100644
--- a/lib/timeval.h
+++ b/lib/timeval.h
@@ -20,6 +20,9 @@
#include <time.h>
#include "openvswitch/type-props.h"
#include "util.h"
+#ifdef __linux__
+#include <sys/epoll.h>
+#endif
#ifdef __cplusplus
extern "C" {
@@ -59,6 +62,10 @@ long long int time_wall_usec(void);
void time_timespec(struct timespec *);
void time_wall_timespec(struct timespec *);
void time_alarm(unsigned int secs);
+#ifdef __linux__
+int time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
+ long long int timeout_when, int *elapsed);
+#endif
int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
long long int timeout_when, int *elapsed);