Re: [PATCHv11] vhost: vhost TX zero-copy support
From: "Michael S. Tsirkin" Date: Mon, 18 Jul 2011 16:48:46 +0300 >>From: Shirley Ma > > This adds experimental zero copy support in vhost-net, > disabled by default. To enable, set > experimental_zcopytx module option to 1. > > This patch maintains the outstanding userspace buffers in the > sequence it is delivered to vhost. The outstanding userspace buffers > will be marked as done once the lower device buffers DMA has finished. > This is monitored through last reference of kfree_skb callback. Two > buffer indices are used for this purpose. > > The vhost-net device passes the userspace buffers info to lower device > skb through message control. DMA done status check and guest > notification are handled by handle_tx: in the worst case is all buffers > in the vq are in pending/done status, so we need to notify guest to > release DMA done buffers first before we get any new buffers from the > vq. > > One known problem is that if the guest stops submitting > buffers, buffers might never get used until some > further action, e.g. device reset. This does not > seem to affect linux guests. > > Signed-off-by: Shirley > Signed-off-by: Michael S. Tsirkin Applied, thanks! ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
RE: [PATCH 1/1] Staging: hv: Integrate the time source driver with Hyper-V detection code
> -Original Message- > From: K. Y. Srinivasan [mailto:k...@microsoft.com] > Sent: Friday, July 15, 2011 9:01 PM > To: gre...@suse.de; linux-ker...@vger.kernel.org; > de...@linuxdriverproject.org; virtualizat...@lists.osdl.org; > t...@linutronix.de > Cc: KY Srinivasan; Haiyang Zhang > Subject: [PATCH 1/1] Staging: hv: Integrate the time source driver with > Hyper-V > detection code > > The Hyper-V timesource driver is best integrated with Hyper-V detection code > since: (a) Linux guests running on Hyper-V need this timesource and (b) > by integrating with Hyper-V detection, we could significantly minimize the > code in the timesource driver. Thomas, A number of weeks ago you had reviewed the time Hyper-V time source driver code and recommended that I merge it with Hyper-V detection code. That is exactly what I have done here. Is this what you had in mind; your feedback would be greatly appreciated. I had sent this patch a couple of weeks ago and got no response. As an X86 maintainer, would you be willing to take this patch. Regards, K. Y > > Signed-off-by: K. Y. Srinivasan > Signed-off-by: Haiyang Zhang > --- > arch/x86/kernel/cpu/mshyperv.c | 24 > drivers/staging/hv/Makefile|2 +- > drivers/staging/hv/hv_timesource.c | 102 > > 3 files changed, 25 insertions(+), 103 deletions(-) > delete mode 100644 drivers/staging/hv/hv_timesource.c > > diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c > index d944bf6..c97f88d 100644 > --- a/arch/x86/kernel/cpu/mshyperv.c > +++ b/arch/x86/kernel/cpu/mshyperv.c > @@ -11,6 +11,8 @@ > */ > > #include > +#include > +#include > #include > #include > #include > @@ -36,6 +38,25 @@ static bool __init ms_hyperv_platform(void) > !memcmp("Microsoft Hv", hyp_signature, 12); > } > > +static cycle_t read_hv_clock(struct clocksource *arg) > +{ > + cycle_t current_tick; > + /* > + * Read the partition counter to get the current tick count. This count > + * is set to 0 when the partition is created and is incremented in > + * 100 nanosecond units. > + */ > + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); > + return current_tick; > +} > + > +static struct clocksource hyperv_cs = { > + .name = "hyperv_clocksource", > + .rating = 400, /* use this when running on Hyperv*/ > + .read = read_hv_clock, > + .mask = CLOCKSOURCE_MASK(64), > +}; > + > static void __init ms_hyperv_init_platform(void) > { > /* > @@ -46,6 +67,9 @@ static void __init ms_hyperv_init_platform(void) > > printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", > ms_hyperv.features, ms_hyperv.hints); > + > + > + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); > } > > const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { > diff --git a/drivers/staging/hv/Makefile b/drivers/staging/hv/Makefile > index bd176b1..3e0d7e6 100644 > --- a/drivers/staging/hv/Makefile > +++ b/drivers/staging/hv/Makefile > @@ -1,4 +1,4 @@ > -obj-$(CONFIG_HYPERV) += hv_vmbus.o hv_timesource.o > +obj-$(CONFIG_HYPERV) += hv_vmbus.o > obj-$(CONFIG_HYPERV_STORAGE) += hv_storvsc.o > obj-$(CONFIG_HYPERV_NET) += hv_netvsc.o > obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o > diff --git a/drivers/staging/hv/hv_timesource.c > b/drivers/staging/hv/hv_timesource.c > deleted file mode 100644 > index 0efb049..000 > --- a/drivers/staging/hv/hv_timesource.c > +++ /dev/null > @@ -1,102 +0,0 @@ > -/* > - * A clocksource for Linux running on HyperV. > - * > - * > - * Copyright (C) 2010, Novell, Inc. > - * Author : K. Y. Srinivasan > - * > - * This program is free software; you can redistribute it and/or modify > - * it under the terms of the GNU General Public License version 2 as > - * published by the Free Software Foundation. > - * > - * This program is distributed in the hope that it will be useful, but > - * WITHOUT ANY WARRANTY; without even the implied warranty of > - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or > - * NON INFRINGEMENT. See the GNU General Public License for more > - * details. > - * > - * You should have received a copy of the GNU General Public License > - * along with this program; if not, write to the Free Software > - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. > - * > - */ > -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > - > -#include > -#include > -#include > -#include > -#include > -#include > -#include > -#include > -#include > - > -#define HV_CLOCK_SHIFT 22 > - > -static cycle_t read_hv_clock(struct clocksource *arg) > -{ > - cycle_t current_tick; > - /* > - * Read the partition counter to get the current tick count. This count > - * is set to 0 when the partition is created and is incremented in > - * 100 nanosecond units. > - *
[PATCHv11] vhost: vhost TX zero-copy support
>From: Shirley Ma This adds experimental zero copy support in vhost-net, disabled by default. To enable, set experimental_zcopytx module option to 1. This patch maintains the outstanding userspace buffers in the sequence it is delivered to vhost. The outstanding userspace buffers will be marked as done once the lower device buffers DMA has finished. This is monitored through last reference of kfree_skb callback. Two buffer indices are used for this purpose. The vhost-net device passes the userspace buffers info to lower device skb through message control. DMA done status check and guest notification are handled by handle_tx: in the worst case is all buffers in the vq are in pending/done status, so we need to notify guest to release DMA done buffers first before we get any new buffers from the vq. One known problem is that if the guest stops submitting buffers, buffers might never get used until some further action, e.g. device reset. This does not seem to affect linux guests. Signed-off-by: Shirley Signed-off-by: Michael S. Tsirkin --- I run some light tests and this seems to work fine for me. Given that the new functionality is disabled by default, it seems a pretty safe patch to merge for 3.1. Changes from v10: don't leak ubuf_info don't allocate ubuf_info for non zero copy vqs Changes from v9: coding style prettification suggested by Jesper Juhl drivers/vhost/net.c | 77 +- drivers/vhost/vhost.c | 128 +++-- drivers/vhost/vhost.h | 31 3 files changed, 220 insertions(+), 16 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e224a92..f0fd52c 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,10 +29,18 @@ #include "vhost.h" +static int experimental_zcopytx; +module_param(experimental_zcopytx, int, 0444); +MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX"); + /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x8 +/* MAX number of TX used buffers for outstanding zerocopy */ +#define VHOST_MAX_PEND 128 +#define VHOST_GOODCOPY_LEN 256 + enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, @@ -54,6 +63,12 @@ struct vhost_net { enum vhost_net_poll_state tx_poll_state; }; +static bool vhost_sock_zcopy(struct socket *sock) +{ + return unlikely(experimental_zcopytx) && + sock_flag(sock->sk, SOCK_ZEROCOPY); +} + /* Pop first len bytes from iovec. Return number of segments used. */ static int move_iovec_hdr(struct iovec *from, struct iovec *to, size_t len, int iov_count) @@ -129,6 +144,8 @@ static void handle_tx(struct vhost_net *net) int err, wmem; size_t hdr_size; struct socket *sock; + struct vhost_ubuf_ref *uninitialized_var(ubufs); + bool zcopy; /* TODO: check that we are running from vhost_worker? */ sock = rcu_dereference_check(vq->private_data, 1); @@ -149,8 +166,13 @@ static void handle_tx(struct vhost_net *net) if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); hdr_size = vq->vhost_hlen; + zcopy = vhost_sock_zcopy(sock); for (;;) { + /* Release DMAs done buffers first */ + if (zcopy) + vhost_zerocopy_signal_used(vq); + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, @@ -166,6 +188,13 @@ static void handle_tx(struct vhost_net *net) set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; } + /* If more outstanding DMAs, queue the work */ + if (unlikely(vq->upend_idx - vq->done_idx > +VHOST_MAX_PEND)) { + tx_poll_start(net, sock); + set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + break; + } if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; @@ -188,9 +217,39 @@ static void handle_tx(struct vhost_net *net) iov_length(vq->hdr, s), hdr_size); break; } + /* use msg_control to pass vhost zerocopy ubuf info to skb */ + if (zcopy) { + vq->heads[vq->upend_idx].id = head; + if (len < VHOST_GOODCOPY_LEN) { +
Re: [PATCHv10] vhost: vhost TX zero-copy support
On Mon, Jul 18, 2011 at 01:59:05PM +0300, Michael S. Tsirkin wrote: > This adds experimental zero copy support in vhost-net, > disabled by default. Looks like there was a memory leak in the patch, I'll address that and post a new version shortly. -- MST ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
[PATCHv10] vhost: vhost TX zero-copy support
This adds experimental zero copy support in vhost-net, disabled by default. To enable, set the zerocopytx module option to 1. This patch maintains the outstanding userspace buffers in the sequence it is delivered to vhost. The outstanding userspace buffers will be marked as done once the lower device buffers DMA has finished. This is monitored through last reference of kfree_skb callback. Two buffer indices are used for this purpose. The vhost-net device passes the userspace buffers info to lower device skb through message control. DMA done status check and guest notification are handled by handle_tx: in the worst case is all buffers in the vq are in pending/done status, so we need to notify guest to release DMA done buffers first before we get any new buffers from the vq. One known problem is that if the guest stops submitting buffers, buffers might never get used until some further action, e.g. device reset. This does not seem to affect linux guests. Signed-off-by: Shirley Signed-off-by: Michael S. Tsirkin --- Changes from v9: coding style prettification suggested by Jesper Juhl. drivers/vhost/net.c | 73 ++- drivers/vhost/vhost.c | 84 + drivers/vhost/vhost.h | 29 + 3 files changed, 185 insertions(+), 1 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e224a92..226ca6b 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,10 +29,18 @@ #include "vhost.h" +static int zcopytx; +module_param(zcopytx, int, 0444); +MODULE_PARM_DESC(lnksts, "Enable Zero Copy Transmit"); + /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x8 +/* MAX number of TX used buffers for outstanding zerocopy */ +#define VHOST_MAX_PEND 128 +#define VHOST_GOODCOPY_LEN 256 + enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, @@ -54,6 +63,11 @@ struct vhost_net { enum vhost_net_poll_state tx_poll_state; }; +static bool vhost_sock_zcopy(struct socket *sock) +{ + return unlikely(zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY); +} + /* Pop first len bytes from iovec. Return number of segments used. */ static int move_iovec_hdr(struct iovec *from, struct iovec *to, size_t len, int iov_count) @@ -129,6 +143,8 @@ static void handle_tx(struct vhost_net *net) int err, wmem; size_t hdr_size; struct socket *sock; + struct vhost_ubuf_ref *uninitialized_var(ubufs); + bool zcopy; /* TODO: check that we are running from vhost_worker? */ sock = rcu_dereference_check(vq->private_data, 1); @@ -149,8 +165,13 @@ static void handle_tx(struct vhost_net *net) if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); hdr_size = vq->vhost_hlen; + zcopy = vhost_sock_zcopy(sock); for (;;) { + /* Release DMAs done buffers first */ + if (zcopy) + vhost_zerocopy_signal_used(vq); + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, @@ -166,6 +187,12 @@ static void handle_tx(struct vhost_net *net) set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; } + /* If more outstanding DMAs, queue the work */ + if (vq->upend_idx - vq->done_idx > VHOST_MAX_PEND) { + tx_poll_start(net, sock); + set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + break; + } if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; @@ -188,9 +215,39 @@ static void handle_tx(struct vhost_net *net) iov_length(vq->hdr, s), hdr_size); break; } + /* use msg_control to pass vhost zerocopy ubuf info to skb */ + if (zcopy) { + vq->heads[vq->upend_idx].id = head; + if (len < VHOST_GOODCOPY_LEN) { + /* copy don't need to wait for DMA done */ + vq->heads[vq->upend_idx].len = + VHOST_DMA_DONE_LEN; + msg.msg_control = NULL; + msg.msg_controllen = 0; + ubufs = NULL; + } else { +
Re: [PATCHv9] vhost: experimental tx zero-copy support
On Sun, Jul 17, 2011 at 10:01:41PM +0200, Jesper Juhl wrote: > > @@ -28,10 +29,18 @@ > > > > #include "vhost.h" > > > > +static int zcopytx; > > +module_param(zcopytx, int, 0444); > > Should everyone be able to read this? How about "0440" just to be > paranoid? or? I find it very helpful to have the parameter visible in sysfs. Given that: [mst@tuck linux-2.6]$ grep module_param drivers/net/*c|grep [64]44|wc -l 14 [mst@tuck linux-2.6]$ grep module_param drivers/net/*c|grep [64]40|wc -l 0 [mst@tuck linux-2.6]$ grep module_param drivers/net/*c|grep [64]00|wc -l 7 So at least the precedent is against 0440. What do you think? -- MST ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization