On 10/9/19 3:38 PM, Marvin Liu wrote:
> Disable software pre-fetch actions on Skylake and later platforms.
> Hardware can fetch needed data for vhost, additional software pre-fetch
> will impact performance.
>
> Signed-off-by: Marvin Liu <yong....@intel.com>
>
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index 30839a001..5f3b42e56 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -16,6 +16,12 @@ CFLAGS += -I vhost_user
> CFLAGS += -fno-strict-aliasing
> LDLIBS += -lpthread
>
> +AVX512_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null |grep AVX512F)
> +
> +ifneq ($(AVX512_SUPPORT),)
> +CFLAGS += -DDISABLE_SWPREFETCH
> +endif
That's problematic I think, because the machine running the lib may be
different from the machine building it, for example distros.
In this case, a Skylake or later may be used to build the package, but
with passing "-march=haswell". It would end-up prefetching being
disabled whereas we would expect it to be enabled.
I see several solutions:
- Check for CONFIG_RTE_ENABLE_AVX512 flag.
- Keep prefetch instructions (what would be the impact on Skylake and
later?)
- Remove prefetch instructions (what would be the impact on pre-
Skylake?)
But really, I think we need some figures before applying such a patch.
What performance gain do you measure with this patch?
> ifeq ($(RTE_TOOLCHAIN), gcc)
> ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
> CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA
> diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
> index ddf0ee579..5c6f0c0b4 100644
> --- a/lib/librte_vhost/meson.build
> +++ b/lib/librte_vhost/meson.build
> @@ -15,6 +15,10 @@ elif (toolchain == 'clang' and
> cc.version().version_compare('>=3.7.0'))
> elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0'))
> cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA'
> endif
> +r = run_command(toolchain, '-march=native', '-dM', '-E', '-', '</dev/null',
> '|', 'grep AVX512F')
> +if (r.stdout().strip() != '')
> + cflags += '-DDISABLE_SWPREFETCH'
> +endif
> dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY',
> cc.has_header('linux/userfaultfd.h'))
> version = 4
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 56c2080fb..046e497c2 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1075,7 +1075,9 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>
> UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM)
> for (i = 0; i < PACKED_BATCH_SIZE; i++) {
> +#ifndef DISABLE_SWPREFETCH
> rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> +#endif
> hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
> (uintptr_t)desc_addrs[i];
> lens[i] = pkts[i]->pkt_len + dev->vhost_hlen;
> @@ -1144,7 +1146,9 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> uint32_t remained = count;
>
> do {
> +#ifndef DISABLE_SWPREFETCH
> rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
> +#endif
>
> if (remained >= PACKED_BATCH_SIZE) {
> if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) {
> @@ -1790,7 +1794,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>
> UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM)
> for (i = 0; i < PACKED_BATCH_SIZE; i++) {
> +#ifndef DISABLE_SWPREFETCH
> rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> +#endif
> rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
> (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
> pkts[i]->pkt_len);
> @@ -2046,7 +2052,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> uint32_t remained = count;
>
> do {
> +#ifndef DISABLE_SWPREFETCH
> rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
> +#endif
>
> if (remained >= PACKED_BATCH_SIZE) {
> if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool,
>