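This is Anthony's net-tap-zero-copy.patch, which eliminates a copy on the
host->guest data path with virtio_net.

When the tap device's only peer on the VLAN provides the new fd_read_zc
handler, tap_send() hands that peer a callback that readv()s from the tap fd
directly into the virtio-net receive queue's scatter-gather buffers, instead
of memcpy()ing the packet out of an intermediate buffer.
---
 qemu/hw/virtio-net.c |   76 ++++++++++++++++++++++++++++++++++++-------------
 qemu/net.h           |    3 ++
 qemu/vl.c            |   50 +++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 20 deletions(-)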
diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c index a681a7e..5e71afe 100644 --- a/qemu/hw/virtio-net.c +++ b/qemu/hw/virtio-net.c @@ -70,6 +70,8 @@ typedef struct VirtIONet VLANClientState *vc; QEMUTimer *tx_timer; int tx_timer_active; + int last_elem_valid; + VirtQueueElement last_elem; } VirtIONet; /* TODO @@ -153,47 +155,80 @@ static int virtio_net_can_receive(void *opaque) return 1; } -static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) +static void virtio_net_receive_zc(void *opaque, IOZeroCopyHandler *zc, void *data) { VirtIONet *n = opaque; - VirtQueueElement elem; + VirtQueueElement *elem = &n->last_elem; struct virtio_net_hdr *hdr; - int offset, i; - int total; + ssize_t err; + int idx; - if (virtqueue_pop(n->rx_vq, &elem) == 0) + if (!n->last_elem_valid && virtqueue_pop(n->rx_vq, elem) == 0) return; - if (elem.in_num < 1 || elem.in_sg[0].iov_len != sizeof(*hdr)) { + if (elem->in_num < 1 || elem->in_sg[0].iov_len != sizeof(*hdr)) { fprintf(stderr, "virtio-net header not in first element\n"); exit(1); } - hdr = (void *)elem.in_sg[0].iov_base; + n->last_elem_valid = 1; + + hdr = (void *)elem->in_sg[0].iov_base; hdr->flags = 0; hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - offset = 0; - total = sizeof(*hdr); + idx = tap_has_offload(n->vc->vlan->first_client) ? 
0 : 1; + + do { + err = zc(data, &elem->in_sg[idx], elem->in_num - idx); + } while (err == -1 && errno == EINTR); + + if (err == -1 && errno == EAGAIN) + return; - if (tap_has_offload(n->vc->vlan->first_client)) { - memcpy(hdr, buf, sizeof(*hdr)); - offset += total; + if (err < 0) { + fprintf(stderr, "virtio_net: error during IO\n"); + return; } + /* signal other side */ + n->last_elem_valid = 0; + virtqueue_push(n->rx_vq, elem, sizeof(*hdr) + err); + virtio_notify(&n->vdev, n->rx_vq); +} + +struct compat_data +{ + const uint8_t *buf; + int size; +}; + +static ssize_t compat_copy(void *opaque, struct iovec *iov, int iovcnt) +{ + struct compat_data *compat = opaque; + int offset, i; + /* copy in packet. ugh */ - i = 1; - while (offset < size && i < elem.in_num) { - int len = MIN(elem.in_sg[i].iov_len, size - offset); - memcpy(elem.in_sg[i].iov_base, buf + offset, len); + offset = 0; + i = 0; + while (offset < compat->size && i < iovcnt) { + int len = MIN(iov[i].iov_len, compat->size - offset); + memcpy(iov[i].iov_base, compat->buf + offset, len); offset += len; - total += len; i++; } - /* signal other side */ - virtqueue_push(n->rx_vq, &elem, total); - virtio_notify(&n->vdev, n->rx_vq); + return offset; +} + +static void virtio_net_receive(void *opaque, const uint8_t *buf, int size) +{ + struct compat_data compat; + + compat.buf = buf; + compat.size = size; + + virtio_net_receive_zc(opaque, compat_copy, &compat); } /* TX */ @@ -310,6 +345,7 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn) memcpy(n->mac, nd->macaddr, 6); n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive, virtio_net_can_receive, n); + n->vc->fd_read_zc = virtio_net_receive_zc; n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); n->tx_timer_active = 0; diff --git a/qemu/net.h b/qemu/net.h index 6cfd8ce..aca50e9 100644 --- a/qemu/net.h +++ b/qemu/net.h @@ -6,6 +6,8 @@ /* VLANs support */ typedef ssize_t (IOReadvHandler)(void *, const struct iovec *, int); +typedef 
ssize_t (IOZeroCopyHandler)(void *, struct iovec *, int); +typedef void (IOReadZCHandler)(void *, IOZeroCopyHandler *, void *); typedef struct VLANClientState VLANClientState; @@ -14,6 +16,7 @@ typedef void (SetOffload)(VLANClientState *, int, int, int, int); struct VLANClientState { IOReadHandler *fd_read; IOReadvHandler *fd_readv; + IOReadZCHandler *fd_read_zc; /* Packets may still be sent if this returns zero. It's used to rate-limit the slirp code. */ IOCanRWHandler *fd_can_read; diff --git a/qemu/vl.c b/qemu/vl.c index de92848..bc5b151 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -4204,6 +4204,7 @@ typedef struct TAPState { char buf[TAP_BUFSIZE]; int size; int offload; + int received_eagain; } TAPState; static void tap_receive(void *opaque, const uint8_t *buf, int size) @@ -4232,6 +4233,48 @@ static ssize_t tap_readv(void *opaque, const struct iovec *iov, return len; } +static VLANClientState *tap_can_zero_copy(TAPState *s) +{ + VLANClientState *vc, *vc1 = NULL; + int vc_count = 0; + + for (vc = s->vc->vlan->first_client; vc; vc = vc->next) { + if (vc == s->vc) + continue; + + if (!vc->fd_read_zc || vc_count) + return NULL; + + vc_count++; + vc1 = vc; + } + + return vc1; +} + +static ssize_t tap_sendv(void *opaque, struct iovec *iov, int iovcnt) +{ + TAPState *s = opaque; + ssize_t ret; + + kvm_sleep_begin(); + ret = readv(s->fd, iov, iovcnt); + kvm_sleep_end(); + if (ret == -1 && errno == EAGAIN) + s->received_eagain = 1; + + return ret; +} + +static void tap_send_zero_copy(TAPState *s, VLANClientState *vc) +{ + s->received_eagain = 0; + while (s->received_eagain == 0 && + (!vc->fd_can_read || vc->fd_can_read(vc->opaque))) { + vc->fd_read_zc(vc->opaque, tap_sendv, s); + } +} + static int tap_can_send(void *opaque) { TAPState *s = opaque; @@ -4261,6 +4304,13 @@ static int tap_can_send(void *opaque) static void tap_send(void *opaque) { TAPState *s = opaque; + VLANClientState *zc; + + zc = tap_can_zero_copy(s); + if (zc) { + tap_send_zero_copy(s, zc); + return; + } 
/* First try to send any buffered packet */ if (s->size > 0) { -- 1.5.4.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html