Hello,

As we understand the `epoll_modify` function has an optimization that saves
a `epoll_ctl(EPOLL_CTL_DEL)` by not removing a triggered fd. This holds for
cases when notification from a file descriptor is removed by a thread that
runs `ev_run`. However, for the cases when it is removed by other thread
the following happens:

1) Consider that we have a read-ready `io_watcher`
2) `ev_run` gets from a `epoll_wait` a read-ready fd and fires the
`io_watcher`
3) callback does something application specific (notifies other thread to
do reading?) and stops the `io_watcher`
4) `epoll_modify` optimization does not remove `fd` from `epoll`
5) `ev_run` again gets from a `epoll_wait` a read-ready fd (that was not
removed from `epoll` on previous step)
6) `epoll_ctl(EPOLL_CTL_DEL)`. `ev_run` goes into `epoll_wait` again

In the above case there's an additional `epoll_wait` in step 5) and
`epoll_ctl(EPOLL_CTL_DEL)` is called anyway.

For use cases when fd is read from other thread we propose to disable
delayed `epoll_ctl(EPOLL_CTL_DEL)` optimization (see the patch in
attachment).

Some measures for the patch.

Without it:
```
twrk -c 1 -t 1 -d 10s http://localhost:8080/hello
153175 requests
            309426 syscalls:sys_enter_epoll_ctl

            479451 syscalls:sys_enter_epoll_wait
```

With it:
```
twrk -c 1 -t 1 -d 10s http://localhost:8080/hello
148106 requests
            299008 syscalls:sys_enter_epoll_ctl

            314399 syscalls:sys_enter_epoll_wait <== significant reduction
of syscalls
```

What do you think of the idea?
--- libev/ev.c  (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev.c  (working tree)
@@ -3751,6 +3751,14 @@ ev_pending_count (EV_P) EV_NOEXCEPT
   return count;
 }
 
+void
+ev_set_io_pessimistic_remove (EV_P) EV_NOEXCEPT
+{
+#if EV_USE_EPOLL || EV_GENWRAP
+  epoll_pessimistic_remove = 1;
+#endif
+}
+
 ecb_noinline
 void
 ev_invoke_pending (EV_P)
--- libev/ev.h  (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev.h  (working tree)
@@ -676,6 +676,10 @@ EV_API_DECL void ev_set_loop_release_cb (EV_P_ void 
(*release)(EV_P) EV_NOEXCEPT
 
 EV_API_DECL unsigned int ev_pending_count (EV_P) EV_NOEXCEPT; /* number of 
pending events, if any */
 
+#define EV_HAS_IO_PESSIMISTIC_REMOVE 1
+/* Disable an optimistics epoll_ctl(..., EPOLL_CTL_DEL, ...) optimization for 
epoll backend, does nothing for other backends */
+EV_API_DECL void ev_set_io_pessimistic_remove (EV_P) EV_NOEXCEPT;
+
 /*
  * stop/start the timer handling.
  */
--- libev/ev_epoll.c    (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev_epoll.c    (working tree)
@@ -80,9 +80,24 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
    * event in epoll_poll.
    * if the fd is added again, we try to ADD it, and, if that
    * fails, we assume it still has the same eventmask.
+   *
+   * However, if the select-ready notification is removed by another
+   * thread and not the thread that does ev_run, this would be a pessimization,
+   * condition it on epoll_pessimistic_remove.
    */
   if (!nev)
-    return;
+    {
+      if (epoll_pessimistic_remove)
+        {
+          /* pre-2.6.9 kernels require a non-null pointer with EPOLL_CTL_DEL, 
*/
+          /* which is fortunately easy to do for us. */
+          if (ecb_expect_true (!epoll_ctl(backend_fd, EPOLL_CTL_DEL, fd, &ev)))
+            return;
+
+          postfork |= 2; /* an error occurred, recreate kernel state */
+        }
+      return;
+    }
 
   oldmask = anfds [fd].emask;
   anfds [fd].emask = nev;
--- libev/ev_vars.h     (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev_vars.h     (working tree)
@@ -105,6 +105,7 @@ VARx(int, epoll_eventmax)
 VARx(int *, epoll_eperms)
 VARx(int, epoll_epermcnt)
 VARx(int, epoll_epermmax)
+VARx(char, epoll_pessimistic_remove)
 #endif
 
 #if EV_USE_LINUXAIO || EV_GENWRAP
--- libev/ev_wrap.h     (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev_wrap.h     (working tree)
@@ -26,6 +26,7 @@
 #define epoll_eperms ((loop)->epoll_eperms)
 #define epoll_eventmax ((loop)->epoll_eventmax)
 #define epoll_events ((loop)->epoll_events)
+#define epoll_pessimistic_remove ((loop)->epoll_pessimistic_remove)
 #define evpipe ((loop)->evpipe)
 #define fdchangecnt ((loop)->fdchangecnt)
 #define fdchangemax ((loop)->fdchangemax)
_______________________________________________
libev mailing list
[email protected]
http://lists.schmorp.de/mailman/listinfo/libev

Reply via email to