On Mon, 09/29 13:26, Fam Zheng wrote: > A new implementation for qemu_poll_ns based on epoll is introduced here > to address the slowness of g_poll and ppoll when the number of fds are > high. > > On my laptop this would reduce the virtio-blk on top of null-aio > device's response time from 32 us to 29 us with few fds (~10), and 48 us > to 32 us with more fds (for example when virtio-serial is plugged and > ~64 more io handlers are enabled). > > Signed-off-by: Fam Zheng <f...@redhat.com> > --- > Makefile.objs | 1 + > include/qemu/main-loop.h | 1 + > qemu-epoll.c | 165 > +++++++++++++++++++++++++++++++++++++++++++++++ > qemu-timer.c | 4 +- > tests/Makefile | 2 +- > 5 files changed, 171 insertions(+), 2 deletions(-) > create mode 100644 qemu-epoll.c > > diff --git a/Makefile.objs b/Makefile.objs > index 97db978..52ee086 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o > qapi-event.o > block-obj-y = async.o thread-pool.o > block-obj-y += nbd.o block.o blockjob.o > block-obj-y += main-loop.o iohandler.o qemu-timer.o > +block-obj-$(CONFIG_LINUX) += qemu-epoll.o > block-obj-$(CONFIG_POSIX) += aio-posix.o > block-obj-$(CONFIG_WIN32) += aio-win32.o > block-obj-y += block/ > diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h > index 62c68c0..eb01b95 100644 > --- a/include/qemu/main-loop.h > +++ b/include/qemu/main-loop.h > @@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc); > > QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque); > void qemu_bh_schedule_idle(QEMUBH *bh); > +int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout); > > #endif > diff --git a/qemu-epoll.c b/qemu-epoll.c > new file mode 100644 > index 0000000..89ec12a > --- /dev/null > +++ b/qemu-epoll.c > @@ -0,0 +1,165 @@ > +/* > + * QEMU Event Loop > + * > + * Copyright (c) 2014 Red Hat, Inc. 
> + * > + * Authors: > + * Fam Zheng <f...@redhat.com> > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > copy > + * of this software and associated documentation files (the "Software"), to > deal > + * in the Software without restriction, including without limitation the > rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > +#include <sys/epoll.h> > +#include "qemu/main-loop.h" > + > +static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a, > + const GPollFD *fds_b, const guint nfds_b) > +{ > + int i; > + > + if (nfds_a != nfds_b) { > + return true; > + } > + if (!!fds_a != !!fds_b) { > + return true; > + } > + for (i = 0; i < nfds_a; i++) { > + if (fds_a[i].fd != fds_b[i].fd || > + fds_a[i].events != fds_b[i].events) { > + return true; > + } > + } > + return false; > +} > + > +static inline int g_io_condition_from_epoll_events(int e) > +{ > + return (e & EPOLLIN ? G_IO_IN : 0) | > + (e & EPOLLOUT ? G_IO_OUT : 0) | > + (e & EPOLLERR ? G_IO_ERR : 0) | > + (e & EPOLLHUP ? 
G_IO_HUP : 0); > +} > + > +static inline void epoll_event_from_g_poll_fd(struct epoll_event *event, > + GPollFD *fd) > +{ > + int e = fd->events; > + > + event->events = (e & G_IO_IN ? EPOLLIN : 0) | > + (e & G_IO_OUT ? EPOLLOUT : 0) | > + (e & G_IO_ERR ? EPOLLERR : 0) | > + (e & G_IO_HUP ? EPOLLHUP : 0); > + event->data.ptr = fd; > +} > + > +static int epoll_prepare(int epollfd, > + GPollFD *fds, guint nfds, > + GPollFD **g_poll_fds, > + guint *g_poll_nfds, > + int **g_poll_fd_idx) > +{ > + int i; > + > + GPollFD *pfds = NULL; > + int npfds = 0; > + int *idx = NULL; > + > + for (i = 0; i < nfds; i++) { > + int r; > + struct epoll_event event; > + epoll_event_from_g_poll_fd(&event, &fds[i]); > + > + r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event); > + if (r) { > + /* Some fds may not support epoll, fall back and add them to > + * ppoll_fds */ > + pfds = g_renew(GPollFD, pfds, npfds + 1); > + pfds[npfds] = fds[i]; > + idx = g_renew(int, idx, npfds + 1); > + idx[npfds] = i; > + npfds++; > + } > + } > + > + g_free(*g_poll_fds); > + *g_poll_fds = pfds; > + *g_poll_nfds = npfds; > + *g_poll_fd_idx = idx; > + > + return epollfd; > +} > + > +int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout) > +{ > + /* A copy of last fd array, used to skip epoll_prepare when nothing > + * changed. */ > + static GPollFD *last_fds; > + static guint last_nfds; > + /* An array of fds that failed epoll_ctl and fall back to ppoll. Rare > case > + * too. 
*/ > + static GPollFD *g_poll_fds; > + static guint g_poll_nfds; > + static int *g_poll_fd_idx; > + static int epollfd = -1; > + const int max_events = 40; > + struct epoll_event events[max_events]; > + int ret = 0; > + int r, i; > + > + if (!last_fds || g_poll_fds_changed(fds, nfds, last_fds, last_nfds)) { > + if (epollfd >= 0) { > + close(epollfd); > + } > + epollfd = epoll_create(1); > + if (epollfd < 0) { > + perror("epoll_create"); > + abort(); > + } > + epollfd = epoll_prepare(epollfd, fds, nfds, &g_poll_fds, > &g_poll_nfds, > + &g_poll_fd_idx); > + last_fds = g_memdup(fds, nfds * sizeof(GPollFD));
g_poll_fd_idx and last_fds are both leaked: epoll_prepare() frees the old *g_poll_fds but overwrites *g_poll_fd_idx without freeing the previous index array, and last_fds is reassigned from g_memdup() here without freeing the previous copy. Fam > + last_nfds = nfds; > + } > + if (g_poll_nfds) { > + ret = g_poll(g_poll_fds, g_poll_nfds, > qemu_timeout_ns_to_ms(timeout)); > + if (ret < 0) { > + return ret; > + } > + /* Sync revents back to original fds */ > + for (i = 0; i < ret; i++) { > + GPollFD *fd = &fds[g_poll_fd_idx[i]]; > + assert(fd->fd == g_poll_fds[i].fd); > + fd->revents = g_poll_fds[i].revents; > + } > + } > + > + r = epoll_wait(epollfd, events, max_events, > + qemu_timeout_ns_to_ms(timeout)); > + if (r < 0) { > + return r; > + } > + > + for (i = 0; i < r; i++) { > + GPollFD *gpfd = events[i].data.ptr; > + gpfd->revents = g_io_condition_from_epoll_events(events[i].events); > + } > + > + ret += r; > + return ret; > +}