From: Willem de Bruijn <[email protected]>

The underlying hrtimer is programmed with nanosecond resolution.

Use cases such as datacenter networking operate on timescales well
below milliseconds. Setting shorter timeouts bounds tail latency.

Add an epoll_create1 flag, EPOLL_NSTIMEO. When it is passed, the timeout
argument of epoll_wait is interpreted in nanoseconds instead of milliseconds.

The new eventpoll state fits in existing 4B of padding when busy poll
is compiled in (the default), and reads the same cacheline.

Signed-off-by: Willem de Bruijn <[email protected]>

---

A selftest is available on GitHub for now; a follow-up can add it to kselftests.
https://github.com/wdebruij/kerneltools/blob/master/tests/epoll_nstimeo.c
---
 fs/eventpoll.c                 | 26 +++++++++++++++++++-------
 include/uapi/linux/eventpoll.h |  1 +
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4df61129566d..1216b909d155 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -225,6 +225,9 @@ struct eventpoll {
        unsigned int napi_id;
 #endif
 
+       /* Accept timeout in ns resolution (EPOLL_NSTIMEO) */
+       unsigned int nstimeout:1;
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* tracks wakeup nests for lockdep validation */
        u8 nests;
@@ -1787,17 +1790,20 @@ static int ep_send_events(struct eventpoll *ep,
        return esed.res;
 }
 
-static inline struct timespec64 ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_nstimeout(long ns)
 {
-       struct timespec64 now, ts = {
-               .tv_sec = ms / MSEC_PER_SEC,
-               .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
-       };
+       struct timespec64 now, ts;
 
+       ts = ns_to_timespec64(ns);
        ktime_get_ts64(&now);
        return timespec64_add_safe(now, ts);
 }
 
+static inline struct timespec64 ep_set_mstimeout(long ms)
+{
+       return ep_set_nstimeout(NSEC_PER_MSEC * ms);
+}
+
 /**
  * ep_poll - Retrieves ready events, and delivers them to the caller supplied
  *           event buffer.
@@ -1826,7 +1832,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
        lockdep_assert_irqs_enabled();
 
        if (timeout > 0) {
-               struct timespec64 end_time = ep_set_mstimeout(timeout);
+               struct timespec64 end_time;
+
+               end_time = ep->nstimeout ? ep_set_nstimeout(timeout) :
+                                          ep_set_mstimeout(timeout);
 
                slack = select_estimate_accuracy(&end_time);
                to = &expires;
@@ -2046,7 +2055,7 @@ static int do_epoll_create(int flags)
        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
 
-       if (flags & ~EPOLL_CLOEXEC)
+       if (flags & ~(EPOLL_CLOEXEC | EPOLL_NSTIMEO))
                return -EINVAL;
        /*
         * Create the internal data structure ("struct eventpoll").
@@ -2054,6 +2063,9 @@ static int do_epoll_create(int flags)
        error = ep_alloc(&ep);
        if (error < 0)
                return error;
+
+       ep->nstimeout = !!(flags & EPOLL_NSTIMEO);
+
        /*
         * Creates all the items needed to setup an eventpoll file. That is,
         * a file structure and a free file descriptor.
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..f6ef9c9f8ac2 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -21,6 +21,7 @@
 
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
+#define EPOLL_NSTIMEO 0x1
 
 /* Valid opcodes to issue to sys_epoll_ctl() */
 #define EPOLL_CTL_ADD 1
-- 
2.29.0.rc1.297.gfa9743e501-goog

Reply via email to