Hello,
As we understand the `epoll_modify` function has an optimization that saves
a `epoll_ctl(EPOLL_CTL_DEL)` by not removing a triggered fd. This holds for
cases when notification from a file descriptor is removed by a thread that
runs `ev_run`. However, for the cases when it is removed by other thread
the following happens:
1) Consider that we have a read-ready `io_watcher`
2) `ev_run` gets from a `epoll_wait` a read-ready fd and fires the
`io_watcher`
3) callback does something application specific (notifies other thread to
do reading?) and stops the `io_watcher`
4) `epoll_modify` optimization does not remove `fd` from `epoll`
5) `ev_run` again gets from a `epoll_wait` a read-ready fd (that was not
removed from `epoll` on previous step)
6) `epoll_ctl(EPOLL_CTL_DEL)`. `ev_run` goes into `epoll_wait` again
In the above case there's an additional `epoll_wait` in step 5) and
`epoll_ctl(EPOLL_CTL_DEL)` is called anyway.
For use cases when fd is read from other thread we propose to disable
delayed `epoll_ctl(EPOLL_CTL_DEL)` optimization (see the patch in
attachment).
Some measures for the patch.
Without it:
```
twrk -c 1 -t 1 -d 10s http://localhost:8080/hello
153175 requests
309426 syscalls:sys_enter_epoll_ctl
479451 syscalls:sys_enter_epoll_wait
```
With it:
```
twrk -c 1 -t 1 -d 10s http://localhost:8080/hello
148106 requests
299008 syscalls:sys_enter_epoll_ctl
314399 syscalls:sys_enter_epoll_wait <== significant reduction
of syscalls
```
What do you think of the idea?
--- libev/ev.c (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev.c (working tree)
@@ -3751,6 +3751,14 @@ ev_pending_count (EV_P) EV_NOEXCEPT
return count;
}
+void
+ev_set_io_pessimistic_remove (EV_P) EV_NOEXCEPT
+{
+#if EV_USE_EPOLL || EV_GENWRAP
+ epoll_pessimistic_remove = 1;
+#endif
+}
+
ecb_noinline
void
ev_invoke_pending (EV_P)
--- libev/ev.h (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev.h (working tree)
@@ -676,6 +676,10 @@ EV_API_DECL void ev_set_loop_release_cb (EV_P_ void
(*release)(EV_P) EV_NOEXCEPT
EV_API_DECL unsigned int ev_pending_count (EV_P) EV_NOEXCEPT; /* number of
pending events, if any */
+#define EV_HAS_IO_PESSIMISTIC_REMOVE 1
+/* Disable an optimistics epoll_ctl(..., EPOLL_CTL_DEL, ...) optimization for
epoll backend, does nothing for other backends */
+EV_API_DECL void ev_set_io_pessimistic_remove (EV_P) EV_NOEXCEPT;
+
/*
* stop/start the timer handling.
*/
--- libev/ev_epoll.c (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev_epoll.c (working tree)
@@ -80,9 +80,24 @@ epoll_modify (EV_P_ int fd, int oev, int nev)
* event in epoll_poll.
* if the fd is added again, we try to ADD it, and, if that
* fails, we assume it still has the same eventmask.
+ *
+ * However, if the select-ready notification is removed by another
+ * thread and not the thread that does ev_run, this would be a pessimization,
+ * condition it on epoll_pessimistic_remove.
*/
if (!nev)
- return;
+ {
+ if (epoll_pessimistic_remove)
+ {
+ /* pre-2.6.9 kernels require a non-null pointer with EPOLL_CTL_DEL,
*/
+ /* which is fortunately easy to do for us. */
+ if (ecb_expect_true (!epoll_ctl(backend_fd, EPOLL_CTL_DEL, fd, &ev)))
+ return;
+
+ postfork |= 2; /* an error occurred, recreate kernel state */
+ }
+ return;
+ }
oldmask = anfds [fd].emask;
anfds [fd].emask = nev;
--- libev/ev_vars.h (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev_vars.h (working tree)
@@ -105,6 +105,7 @@ VARx(int, epoll_eventmax)
VARx(int *, epoll_eperms)
VARx(int, epoll_epermcnt)
VARx(int, epoll_epermmax)
+VARx(char, epoll_pessimistic_remove)
#endif
#if EV_USE_LINUXAIO || EV_GENWRAP
--- libev/ev_wrap.h (3645c03fc23f076ed7d607eabf6cf886919bfce0)
+++ libev/ev_wrap.h (working tree)
@@ -26,6 +26,7 @@
#define epoll_eperms ((loop)->epoll_eperms)
#define epoll_eventmax ((loop)->epoll_eventmax)
#define epoll_events ((loop)->epoll_events)
+#define epoll_pessimistic_remove ((loop)->epoll_pessimistic_remove)
#define evpipe ((loop)->evpipe)
#define fdchangecnt ((loop)->fdchangecnt)
#define fdchangemax ((loop)->fdchangemax)
_______________________________________________
libev mailing list
[email protected]
http://lists.schmorp.de/mailman/listinfo/libev