On 09/11/2015 11:08, Stefan Hajnoczi wrote: > From: Fam Zheng <f...@redhat.com> > > To minimize code duplication, epoll is hooked into aio-posix's > aio_poll() instead of rolling its own. This approach also has both > compile-time and run-time switchability. > > 1) When QEMU starts with a small number of fds in the event loop, ppoll > is used. > > 2) When QEMU starts with a big number of fds, or when more devices are > hot plugged, epoll kicks in when the number of fds hits the threshold. > > 3) Some fds may not support epoll, such as tty based stdio. In this > case, it falls back to ppoll. > > A rough benchmark with scsi-disk on virtio-scsi dataplane (epoll gets > enabled from 64 onward). Numbers are in MB/s. > > =============================================== > | master | epoll > | | > scsi disks # | read randrw | read randrw > -------------|----------------|---------------- > 1 | 86 36 | 92 45 > 8 | 87 43 | 86 41 > 64 | 71 32 | 70 38 > 128 | 48 24 | 58 31 > 256 | 37 19 | 57 28 > =============================================== > > To comply with aio_{disable,enable}_external, we always use ppoll when > aio_external_disabled() is true. > > [Removed #ifdef CONFIG_EPOLL around AioContext epollfd field declaration > since the field is also referenced outside CONFIG_EPOLL code. > --Stefan] > > Signed-off-by: Fam Zheng <f...@redhat.com> > Message-id: 1446177989-6702-4-git-send-email-f...@redhat.com > Signed-off-by: Stefan Hajnoczi <stefa...@redhat.com> > --- > aio-posix.c | 184 > +++++++++++++++++++++++++++++++++++++++++++++++++++- > include/block/aio.h | 5 ++ > 2 files changed, 188 insertions(+), 1 deletion(-) > > diff --git a/aio-posix.c b/aio-posix.c > index 5bff3cd..06148a9 100644 > --- a/aio-posix.c > +++ b/aio-posix.c > @@ -17,6 +17,9 @@ > #include "block/block.h" > #include "qemu/queue.h" > #include "qemu/sockets.h" > +#ifdef CONFIG_EPOLL > +#include <sys/epoll.h> > +#endif > > struct AioHandler > { > @@ -29,6 +32,162 @@ struct AioHandler > QLIST_ENTRY(AioHandler) node; > }; > > +#ifdef CONFIG_EPOLL > + > +/* The fd number threashold to switch to epoll */ > +#define EPOLL_ENABLE_THRESHOLD 64 > + > +static void aio_epoll_disable(AioContext *ctx) > +{ > + ctx->epoll_available = false; > + if (!ctx->epoll_enabled) { > + return; > + } > + ctx->epoll_enabled = false; > + close(ctx->epollfd); > +} > + > +static inline int epoll_events_from_pfd(int pfd_events) > +{ > + return (pfd_events & G_IO_IN ? EPOLLIN : 0) | > + (pfd_events & G_IO_OUT ? EPOLLOUT : 0) | > + (pfd_events & G_IO_HUP ? EPOLLHUP : 0) | > + (pfd_events & G_IO_ERR ? EPOLLERR : 0); > +} > + > +static bool aio_epoll_try_enable(AioContext *ctx) > +{ > + AioHandler *node; > + struct epoll_event event; > + > + QLIST_FOREACH(node, &ctx->aio_handlers, node) { > + int r; > + if (node->deleted || !node->pfd.events) { > + continue; > + } > + event.events = epoll_events_from_pfd(node->pfd.events); > + event.data.ptr = node; > + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event); > + if (r) { > + return false; > + } > + } > + ctx->epoll_enabled = true; > + return true; > +} > + > +static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) > +{ > + struct epoll_event event; > + int r; > + > + if (!ctx->epoll_enabled) { > + return; > + } > + if (!node->pfd.events) {
Coverity says that node might have been freed by the time you call aio_epoll_update. You need to pass node->pfd.fd and node->pfd.events by value instead, I think, or move the call earlier in aio_set_fd_handler. Paolo > + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event); > + if (r) { > + aio_epoll_disable(ctx); > + } > + } else { > + event.data.ptr = node; > + event.events = epoll_events_from_pfd(node->pfd.events); > + if (is_new) { > + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event); > + if (r) { > + aio_epoll_disable(ctx); > + } > + } else { > + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event); > + if (r) { > + aio_epoll_disable(ctx); > + } > + } > + } > +} > + > +static int aio_epoll(AioContext *ctx, GPollFD *pfds, > + unsigned npfd, int64_t timeout) > +{ > + AioHandler *node; > + int i, ret = 0; > + struct epoll_event events[128]; > + > + assert(npfd == 1); > + assert(pfds[0].fd == ctx->epollfd); > + if (timeout > 0) { > + ret = qemu_poll_ns(pfds, npfd, timeout); > + } > + if (timeout <= 0 || ret > 0) { > + ret = epoll_wait(ctx->epollfd, events, > + sizeof(events) / sizeof(events[0]), > + timeout); > + if (ret <= 0) { > + goto out; > + } > + for (i = 0; i < ret; i++) { > + int ev = events[i].events; > + node = events[i].data.ptr; > + node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) | > + (ev & EPOLLOUT ? G_IO_OUT : 0) | > + (ev & EPOLLHUP ? G_IO_HUP : 0) | > + (ev & EPOLLERR ? G_IO_ERR : 0); > + } > + } > +out: > + return ret; > +} > + > +static bool aio_epoll_enabled(AioContext *ctx) > +{ > + /* Fall back to ppoll when external clients are disabled. */ > + return !aio_external_disabled(ctx) && ctx->epoll_enabled; > +} > + > +static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds, > + unsigned npfd, int64_t timeout) > +{ > + if (!ctx->epoll_available) { > + return false; > + } > + if (aio_epoll_enabled(ctx)) { > + return true; > + } > + if (npfd >= EPOLL_ENABLE_THRESHOLD) { > + if (aio_epoll_try_enable(ctx)) { > + return true; > + } else { > + aio_epoll_disable(ctx); > + } > + } > + return false; > +} > + > +#else > + > +static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) > +{ > +} > + > +static int aio_epoll(AioContext *ctx, GPollFD *pfds, > + unsigned npfd, int64_t timeout) > +{ > + assert(false); > +} > + > +static bool aio_epoll_enabled(AioContext *ctx) > +{ > + return false; > +} > + > +static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds, > + unsigned npfd, int64_t timeout) > +{ > + return false; > +} > + > +#endif > + > static AioHandler *find_aio_handler(AioContext *ctx, int fd) > { > AioHandler *node; > @@ -50,6 +209,7 @@ void aio_set_fd_handler(AioContext *ctx, > void *opaque) > { > AioHandler *node; > + bool is_new = false; > > node = find_aio_handler(ctx, fd); > > @@ -79,6 +239,7 @@ void aio_set_fd_handler(AioContext *ctx, > QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node); > > g_source_add_poll(&ctx->source, &node->pfd); > + is_new = true; > } > /* Update handler with latest information */ > node->io_read = io_read; > @@ -90,6 +251,7 @@ void aio_set_fd_handler(AioContext *ctx, > node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0); > } > > + aio_epoll_update(ctx, node, is_new); > aio_notify(ctx); > } > > @@ -262,6 +424,7 @@ bool aio_poll(AioContext *ctx, bool blocking) > /* fill pollfds */ > QLIST_FOREACH(node, &ctx->aio_handlers, node) { > if (!node->deleted && node->pfd.events > + && !aio_epoll_enabled(ctx) > && aio_node_check(ctx, node->is_external)) { > add_pollfd(node); > } > @@ -273,7 +436,17 @@ bool aio_poll(AioContext *ctx, bool blocking) > if (timeout) { > aio_context_release(ctx); > } > - ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout); > + if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) { > + AioHandler epoll_handler; > + > + epoll_handler.pfd.fd = ctx->epollfd; > + epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR; > + npfd = 0; > + add_pollfd(&epoll_handler); > + ret = aio_epoll(ctx, pollfds, npfd, timeout); > + } else { > + ret = qemu_poll_ns(pollfds, npfd, timeout); > + } > if (blocking) { > atomic_sub(&ctx->notify_me, 2); > } > @@ -305,4 +478,13 @@ bool aio_poll(AioContext *ctx, bool blocking) > > void aio_context_setup(AioContext *ctx, Error **errp) > { > +#ifdef CONFIG_EPOLL > + assert(!ctx->epollfd); > + ctx->epollfd = epoll_create1(EPOLL_CLOEXEC); > + if (ctx->epollfd == -1) { > + ctx->epoll_available = false; > + } else { > + ctx->epoll_available = true; > + } > +#endif > } > diff --git a/include/block/aio.h b/include/block/aio.h > index 9ffecf7..e086e3b 100644 > --- a/include/block/aio.h > +++ b/include/block/aio.h > @@ -124,6 +124,11 @@ struct AioContext { > QEMUTimerListGroup tlg; > > int external_disable_cnt; > + > + /* epoll(7) state used when built with CONFIG_EPOLL */ > + int epollfd; > + bool epoll_enabled; > + bool epoll_available; > }; > > /** >