> -----Original Message-----
> From: Fu, Patrick <patrick...@intel.com>
> Sent: Tuesday, July 7, 2020 1:07 PM
> To: dev@dpdk.org; maxime.coque...@redhat.com; Xia, Chenbo
> <chenbo....@intel.com>; Wang, Zhihong <zhihong.w...@intel.com>
> Cc: Fu, Patrick <patrick...@intel.com>; Wang, Yinan <yinan.w...@intel.com>;
> Jiang, Cheng1 <cheng1.ji...@intel.com>; Liang, Cunming
> <cunming.li...@intel.com>
> Subject: [PATCH v6 1/2] vhost: introduce async enqueue registration API
> 
> From: Patrick Fu <patrick...@intel.com>
> 
> Performing large memory copies usually consumes a major share of CPU cycles
> and becomes the hot spot of the vhost-user enqueue operation. To offload
> these large copies from the CPU to DMA devices, asynchronous APIs are
> introduced, with which the CPU merely submits copy jobs to the DMA engine
> without waiting for copy completion. There is then no CPU intervention
> during data transfer, which saves precious CPU cycles and improves overall
> throughput for vhost-user based applications. This patch introduces the
> registration/un-registration APIs for vhost async data enqueue operation.
> Together with the implementation of the registration APIs, the data
> structures and the prototypes of the async callback functions required by
> the async enqueue data path are also defined.
> 
> Signed-off-by: Patrick Fu <patrick...@intel.com>
> ---
>  lib/librte_vhost/Makefile              |   2 +-
>  lib/librte_vhost/meson.build           |   2 +-
>  lib/librte_vhost/rte_vhost.h           |   1 +
>  lib/librte_vhost/rte_vhost_async.h     | 136 +++++++++++++++++++++++++
>  lib/librte_vhost/rte_vhost_version.map |   4 +
>  lib/librte_vhost/socket.c              |  27 +++++
>  lib/librte_vhost/vhost.c               | 129 +++++++++++++++++++++++-
>  lib/librte_vhost/vhost.h               |  30 +++++-
>  lib/librte_vhost/vhost_user.c          |  23 ++++-
>  9 files changed, 347 insertions(+), 7 deletions(-)
>  create mode 100644 lib/librte_vhost/rte_vhost_async.h
> 
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index b7ff7dc4b..4f2f3e47d 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -42,7 +42,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
> 
>  # install includes
>  SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h \
> -                                             rte_vdpa_dev.h
> +                                             rte_vdpa_dev.h rte_vhost_async.h
> 
>  # only compile vhost crypto when cryptodev is enabled
>  ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
> diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
> index 882a0eaf4..cc9aa65c6 100644
> --- a/lib/librte_vhost/meson.build
> +++ b/lib/librte_vhost/meson.build
> @@ -22,5 +22,5 @@ sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vdpa.c',
>               'vhost.c', 'vhost_user.c',
>               'virtio_net.c', 'vhost_crypto.c')
>  headers = files('rte_vhost.h', 'rte_vdpa.h', 'rte_vdpa_dev.h',
> -             'rte_vhost_crypto.h')
> +             'rte_vhost_crypto.h', 'rte_vhost_async.h')
>  deps += ['ethdev', 'cryptodev', 'hash', 'pci']
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index 8a5c332c8..f93f9595a 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -35,6 +35,7 @@ extern "C" {
>  #define RTE_VHOST_USER_EXTBUF_SUPPORT        (1ULL << 5)
>  /* support only linear buffers (no chained mbufs) */
>  #define RTE_VHOST_USER_LINEARBUF_SUPPORT     (1ULL << 6)
> +#define RTE_VHOST_USER_ASYNC_COPY    (1ULL << 7)
> 
>  /* Features. */
>  #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
> diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
> new file mode 100644
> index 000000000..d5a59279a
> --- /dev/null
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -0,0 +1,136 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _RTE_VHOST_ASYNC_H_
> +#define _RTE_VHOST_ASYNC_H_
> +
> +#include "rte_vhost.h"
> +
> +/**
> + * iovec iterator
> + */
> +struct rte_vhost_iov_iter {
> +     /** offset to the first byte of interesting data */
> +     size_t offset;
> +     /** total bytes of data in this iterator */
> +     size_t count;
> +     /** pointer to the iovec array */
> +     struct iovec *iov;
> +     /** number of iovec in this iterator */
> +     unsigned long nr_segs;
> +};
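
To make the iterator layout concrete: a two-segment source buffer would be
described roughly as in the sketch below (buf0/buf1 and the lengths are
hypothetical):

    struct iovec segs[2] = {
        { .iov_base = buf0, .iov_len = len0 },
        { .iov_base = buf1, .iov_len = len1 },
    };
    struct rte_vhost_iov_iter src = {
        .offset = 0,            /* begin at the first byte of segs[0] */
        .count = len0 + len1,   /* total payload bytes in the iterator */
        .iov = segs,
        .nr_segs = 2,
    };
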
> +
> +/**
> + * dma transfer descriptor pair
> + */
> +struct rte_vhost_async_desc {
> +     /** source memory iov_iter */
> +     struct rte_vhost_iov_iter *src;
> +     /** destination memory iov_iter */
> +     struct rte_vhost_iov_iter *dst;
> +};
> +
> +/**
> + * dma transfer status
> + */
> +struct rte_vhost_async_status {
> +     /** An array of application specific data for source memory */
> +     uintptr_t *src_opaque_data;
> +     /** An array of application specific data for destination memory */
> +     uintptr_t *dst_opaque_data;
> +};
> +
> +/**
> + * dma operation callbacks to be implemented by applications
> + */
> +struct rte_vhost_async_channel_ops {
> +     /**
> +      * instruct async engines to perform copies for a batch of packets
> +      *
> +      * @param vid
> +      *  id of vhost device to perform data copies
> +      * @param queue_id
> +      *  queue id to perform data copies
> +      * @param descs
> +      *  an array of DMA transfer memory descriptors
> +      * @param opaque_data
> +      *  opaque data pair sending to DMA engine
> +      * @param count
> +      *  number of elements in the "descs" array
> +      * @return
> +      *  -1 on failure, number of descs processed on success
> +      */
> +     int (*transfer_data)(int vid, uint16_t queue_id,
> +             struct rte_vhost_async_desc *descs,
> +             struct rte_vhost_async_status *opaque_data,
> +             uint16_t count);
> +     /**
> +      * check copy-completed packets from the async engine
> +      * @param vid
> +      *  id of vhost device to check copy completion
> +      * @param queue_id
> +      *  queue id to check copy completion
> +      * @param opaque_data
> +      *  buffer to receive the opaque data pair from DMA engine
> +      * @param max_packets
> +      *  max number of packets that could be completed
> +      * @return
> +      *  -1 on failure, number of iov segments completed on success
> +      */
> +     int (*check_completed_copies)(int vid, uint16_t queue_id,
> +             struct rte_vhost_async_status *opaque_data,
> +             uint16_t max_packets);
> +};
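
To illustrate how these two callbacks cooperate, here is a minimal
software-backed sketch that stands in for a real DMA engine: it copies
synchronously with memcpy() and "completes" instantly. It assumes one iovec
segment per packet and a zero iov_iter offset, and keeps one process-wide
counter rather than per-(vid, queue_id) state; the sw_* names are
hypothetical:

    #include <string.h>
    #include <rte_common.h>
    #include <rte_vhost_async.h>

    static uint16_t n_done; /* copies finished but not yet reported */

    static int
    sw_transfer_data(int vid, uint16_t queue_id,
            struct rte_vhost_async_desc *descs,
            struct rte_vhost_async_status *opaque_data, uint16_t count)
    {
        uint16_t i;
        unsigned long seg;

        RTE_SET_USED(vid);
        RTE_SET_USED(queue_id);
        RTE_SET_USED(opaque_data);

        for (i = 0; i < count; i++) {
            /* a real engine would post these segments to hardware */
            for (seg = 0; seg < descs[i].src->nr_segs; seg++)
                memcpy(descs[i].dst->iov[seg].iov_base,
                       descs[i].src->iov[seg].iov_base,
                       descs[i].src->iov[seg].iov_len);
            n_done++;
        }
        return count; /* all descriptors accepted */
    }

    static int
    sw_check_completed_copies(int vid, uint16_t queue_id,
            struct rte_vhost_async_status *opaque_data, uint16_t max_packets)
    {
        uint16_t done = RTE_MIN(n_done, max_packets);

        RTE_SET_USED(vid);
        RTE_SET_USED(queue_id);
        RTE_SET_USED(opaque_data);

        n_done -= done;
        /* equals segments under the one-segment-per-packet assumption */
        return done;
    }
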
> +
> +/**
> + *  dma channel feature bit definition
> + */
> +struct rte_vhost_async_features {
> +     union {
> +             uint32_t intval;
> +             struct {
> +                     uint32_t async_inorder:1;
> +                     uint32_t resvd_0:15;
> +                     uint32_t async_threshold:12;
> +                     uint32_t resvd_1:4;
> +             };
> +     };
> +};
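
For instance, requesting in-order completion with a 256-byte offload
threshold (an arbitrary value picked for this sketch) would look like:

    struct rte_vhost_async_features f = { .intval = 0 };

    f.async_inorder = 1;     /* b0: engine completes copies in order */
    f.async_threshold = 256; /* b16-b27: offload packets above 256 bytes */

    /* f.intval is the 32-bit word the registration API takes */
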
> +
> +/**
> + * register an async channel for vhost
> + *
> + * @param vid
> + *  id of vhost device to attach the async channel to
> + * @param queue_id
> + *  id of vhost queue to attach the async channel to
> + * @param features
> + *  DMA channel feature bits
> + *    b0       : DMA supports inorder data transfer
> + *    b1  - b15: reserved
> + *    b16 - b27: packet length threshold for DMA transfer
> + *    b28 - b31: reserved
> + * @param ops
> + *  DMA operation callbacks
> + * @return
> + *  0 on success, -1 on failure
> + */
> +__rte_experimental
> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> +     uint32_t features, struct rte_vhost_async_channel_ops *ops);
> +
> +/**
> + * unregister an async channel for vhost
> + *
> + * @param vid
> + *  id of vhost device to detach the async channel from
> + * @param queue_id
> + *  id of vhost queue to detach the async channel from
> + * @return
> + *  0 on success, -1 on failure
> + */
> +__rte_experimental
> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
> +
> +#endif /* _RTE_VHOST_ASYNC_H_ */
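
Putting the pieces together, attaching a channel from an application could
look like the sketch below, reusing the hypothetical sw_* callbacks and
feature word from above (error handling trimmed). Registration is only
meaningful on a device created with RTE_VHOST_USER_ASYNC_COPY, e.g. called
from the new_device or vring_state_changed callback:

    static struct rte_vhost_async_channel_ops sw_ops = {
        .transfer_data = sw_transfer_data,
        .check_completed_copies = sw_check_completed_copies,
    };

    static int
    attach_async_channel(int vid, uint16_t queue_id)
    {
        struct rte_vhost_async_features f = { .intval = 0 };

        f.async_inorder = 1;
        f.async_threshold = 256;

        return rte_vhost_async_channel_register(vid, queue_id,
                                                f.intval, &sw_ops);
    }
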
> diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
> index 86784405a..13ec53b63 100644
> --- a/lib/librte_vhost/rte_vhost_version.map
> +++ b/lib/librte_vhost/rte_vhost_version.map
> @@ -71,4 +71,8 @@ EXPERIMENTAL {
>       rte_vdpa_get_queue_num;
>       rte_vdpa_get_features;
>       rte_vdpa_get_protocol_features;
> +     rte_vhost_async_channel_register;
> +     rte_vhost_async_channel_unregister;
> +     rte_vhost_submit_enqueue_burst;
> +     rte_vhost_poll_enqueue_completed;
>  };
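
For completeness: the RTE_VHOST_USER_ASYNC_COPY flag wired up in socket.c
below is supplied when the vhost driver is registered. A minimal sketch,
with an arbitrary socket path:

    #include <rte_vhost.h>

    static int
    start_async_vhost(const char *path)
    {
        /* async copy is mutually exclusive with IOMMU and post-copy
         * support, and VHOST_F_LOG_ALL (live migration) is masked off */
        return rte_vhost_driver_register(path, RTE_VHOST_USER_ASYNC_COPY);
    }

rte_vhost_driver_start() would then follow as in any vhost-user application.
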
> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
> index 49267cebf..c4626d2c4 100644
> --- a/lib/librte_vhost/socket.c
> +++ b/lib/librte_vhost/socket.c
> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>       bool use_builtin_virtio_net;
>       bool extbuf;
>       bool linearbuf;
> +     bool async_copy;
> 
>       /*
>        * The "supported_features" indicates the feature bits the
> @@ -205,6 +206,7 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
>       size_t size;
>       struct vhost_user_connection *conn;
>       int ret;
> +     struct virtio_net *dev;
> 
>       if (vsocket == NULL)
>               return;
> @@ -236,6 +238,13 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
>       if (vsocket->linearbuf)
>               vhost_enable_linearbuf(vid);
> 
> +     if (vsocket->async_copy) {
> +             dev = get_device(vid);
> +
> +             if (dev)
> +                     dev->async_copy = 1;
> +     }
> +
>       VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> 
>       if (vsocket->notify_ops->new_connection) {
> @@ -881,6 +890,17 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
>               goto out_mutex;
>       }
> 
> +     vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> +
> +     if (vsocket->async_copy &&
> +             (flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
> +             RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
> +             VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
> +                     "or post-copy feature simultaneously is not "
> +                     "supported\n");
> +             goto out_mutex;
> +     }
> +
>       /*
>        * Set the supported features correctly for the builtin vhost-user
>        * net driver.
> @@ -931,6 +951,13 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
>                       ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
>       }
> 
> +     if (vsocket->async_copy) {
> +             vsocket->supported_features &= ~(1ULL << VHOST_F_LOG_ALL);
> +             vsocket->features &= ~(1ULL << VHOST_F_LOG_ALL);
> +             VHOST_LOG_CONFIG(INFO,
> +                     "Logging feature is disabled in async copy mode\n");
> +     }
> +
>       /*
>        * We'll not be able to receive a buffer from guest in linear mode
>        * without external buffer if it will not fit in a single mbuf, which is
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 0d822d6a3..a11385f39 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -332,8 +332,13 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
>       if (vq_is_packed(dev))
>               rte_free(vq->shadow_used_packed);
> -     else
> +     else {
>               rte_free(vq->shadow_used_split);
> +             if (vq->async_pkts_pending)
> +                     rte_free(vq->async_pkts_pending);
> +             if (vq->async_pending_info)
> +                     rte_free(vq->async_pending_info);
> +     }
>       rte_free(vq->batch_copy_elems);
>       rte_mempool_free(vq->iotlb_pool);
>       rte_free(vq);
> @@ -1522,3 +1527,125 @@ RTE_INIT(vhost_log_init)
>       if (vhost_data_log_level >= 0)
>               rte_log_set_level(vhost_data_log_level, RTE_LOG_WARNING);
>  }
> +
> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> +                                     uint32_t features,
> +                                     struct rte_vhost_async_channel_ops *ops)
> +{
> +     struct vhost_virtqueue *vq;
> +     struct virtio_net *dev = get_device(vid);
> +     struct rte_vhost_async_features f;
> +     int ret = -1;
> +
> +     if (dev == NULL || ops == NULL)
> +             return -1;
> +
> +     f.intval = features;
> +
> +     vq = dev->virtqueue[queue_id];
> +
> +     if (unlikely(vq == NULL || !dev->async_copy))
> +             return -1;
> +
> +     /* packed queue is not supported */
> +     if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
> +             VHOST_LOG_CONFIG(ERR,
> +                     "async copy is not supported on packed queue or non-inorder mode "
> +                     "(vid %d, qid: %d)\n", vid, queue_id);
> +             return -1;
> +     }
> +
> +     if (unlikely(ops->check_completed_copies == NULL ||
> +             ops->transfer_data == NULL))
> +             return -1;
> +
> +     rte_spinlock_lock(&vq->access_lock);
> +
> +     if (unlikely(vq->async_registered)) {
> +             VHOST_LOG_CONFIG(ERR,
> +                     "async register failed: channel already registered "
> +                     "(vid %d, qid: %d)\n", vid, queue_id);
> +             goto reg_out;
> +     }
> +
> +     vq->async_pkts_pending = rte_malloc(NULL,
> +                     vq->size * sizeof(uintptr_t),
> +                     RTE_CACHE_LINE_SIZE);
> +     vq->async_pending_info = rte_malloc(NULL,
> +                     vq->size * sizeof(uint64_t),
> +                     RTE_CACHE_LINE_SIZE);
> +     if (!vq->async_pkts_pending || !vq->async_pending_info) {
> +             rte_free(vq->async_pkts_pending);
> +             vq->async_pkts_pending = NULL;
> +
> +             rte_free(vq->async_pending_info);
> +             vq->async_pending_info = NULL;
> +
> +             VHOST_LOG_CONFIG(ERR,
> +                     "async register failed: cannot allocate memory for vq data "
> +                     "(vid %d, qid: %d)\n", vid, queue_id);
> +             goto reg_out;
> +     }
> +
> +     vq->async_ops.check_completed_copies = ops->check_completed_copies;
> +     vq->async_ops.transfer_data = ops->transfer_data;
> +
> +     vq->async_inorder = f.async_inorder;
> +     vq->async_threshold = f.async_threshold;
> +
> +     vq->async_registered = true;
> +     ret = 0;
> +
> +reg_out:
> +     rte_spinlock_unlock(&vq->access_lock);
> +
> +     return ret;
> +}
> +
> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
> +{
> +     struct vhost_virtqueue *vq;
> +     struct virtio_net *dev = get_device(vid);
> +     int ret = -1;
> +
> +     if (dev == NULL)
> +             return ret;
> +
> +     vq = dev->virtqueue[queue_id];
> +
> +     if (vq == NULL)
> +             return ret;
> +
> +     ret = 0;
> +     rte_spinlock_lock(&vq->access_lock);
> +
> +     if (!vq->async_registered)
> +             goto out;
> +
> +     if (vq->async_pkts_inflight_n) {
> +             VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
> +                     "async inflight packets must be completed before unregistration.\n");
> +             ret = -1;
> +             goto out;
> +     }
> +
> +     if (vq->async_pkts_pending) {
> +             rte_free(vq->async_pkts_pending);
> +             vq->async_pkts_pending = NULL;
> +     }
> +
> +     if (vq->async_pending_info) {
> +             rte_free(vq->async_pending_info);
> +             vq->async_pending_info = NULL;
> +     }
> +
> +     vq->async_ops.transfer_data = NULL;
> +     vq->async_ops.check_completed_copies = NULL;
> +     vq->async_registered = false;
> +
> +out:
> +     rte_spinlock_unlock(&vq->access_lock);
> +
> +     return ret;
> +}
> +
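
Since unregistration (and vring disable, below in vhost_user.c) is refused
while async_pkts_inflight_n is non-zero, the caller is expected to drain
completions first. A sketch, assuming the rte_vhost_poll_enqueue_completed()
data-path API from patch 2/2 of this series (already listed in the version
map above); a real application would bound the retry loop:

    #include <rte_mbuf.h>
    #include <rte_vhost_async.h>

    #define APP_BURST 32

    static void
    drain_and_unregister(int vid, uint16_t queue_id)
    {
        struct rte_mbuf *pkts[APP_BURST];
        uint16_t n, i;

        /* reap completed copies until the channel can be detached */
        while (rte_vhost_async_channel_unregister(vid, queue_id) != 0) {
            n = rte_vhost_poll_enqueue_completed(vid, queue_id,
                                                 pkts, APP_BURST);
            for (i = 0; i < n; i++)
                rte_pktmbuf_free(pkts[i]);
        }
    }
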
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 034463699..f3731982b 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -24,6 +24,8 @@
>  #include "rte_vdpa.h"
>  #include "rte_vdpa_dev.h"
> 
> +#include "rte_vhost_async.h"
> +
>  /* Used to indicate that the device is running on a data core */
>  #define VIRTIO_DEV_RUNNING 1
>  /* Used to indicate that the device is ready to operate */
> @@ -40,6 +42,11 @@
> 
>  #define VHOST_LOG_CACHE_NR 32
> 
> +#define MAX_PKT_BURST 32
> +
> +#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
> +#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
> +
>  #define PACKED_DESC_ENQUEUE_USED_FLAG(w)     \
>       ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
>               VRING_DESC_F_WRITE)
> @@ -202,6 +209,25 @@ struct vhost_virtqueue {
>       TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
>       int                             iotlb_cache_nr;
>       TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
> +
> +     /* operation callbacks for async dma */
> +     struct rte_vhost_async_channel_ops      async_ops;
> +
> +     struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
> +     struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
> +
> +     /* async data transfer status */
> +     uintptr_t       **async_pkts_pending;
> +     #define         ASYNC_PENDING_INFO_N_MSK 0xFFFF
> +     #define         ASYNC_PENDING_INFO_N_SFT 16
> +     uint64_t        *async_pending_info;
> +     uint16_t        async_pkts_idx;
> +     uint16_t        async_pkts_inflight_n;
> +
> +     /* vq async features */
> +     bool            async_inorder;
> +     bool            async_registered;
> +     uint16_t        async_threshold;
>  } __rte_cache_aligned;
> 
>  #define VHOST_MAX_VRING                      0x100
> @@ -338,6 +364,7 @@ struct virtio_net {
>       int16_t                 broadcast_rarp;
>       uint32_t                nr_vring;
>       int                     dequeue_zero_copy;
> +     int                     async_copy;
>       int                     extbuf;
>       int                     linearbuf;
>       struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> @@ -683,7 +710,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>       /* Don't kick guest if we don't reach index specified by guest. */
>       if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
>               uint16_t old = vq->signalled_used;
> -             uint16_t new = vq->last_used_idx;
> +             uint16_t new = vq->async_pkts_inflight_n ?
> +                     vq->used->idx : vq->last_used_idx;
>               bool signalled_used_valid = vq->signalled_used_valid;
> 
>               vq->signalled_used = new;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index 6039a8fdb..aa8605523 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -476,12 +476,14 @@ vhost_user_set_vring_num(struct virtio_net **pdev,
>       } else {
>               if (vq->shadow_used_split)
>                       rte_free(vq->shadow_used_split);
> +
>               vq->shadow_used_split = rte_malloc(NULL,
>                               vq->size * sizeof(struct vring_used_elem),
>                               RTE_CACHE_LINE_SIZE);
> +
>               if (!vq->shadow_used_split) {
>                       VHOST_LOG_CONFIG(ERR,
> -                                     "failed to allocate memory for shadow used ring.\n");
> +                                     "failed to allocate memory for vq internal data.\n");
>                       return RTE_VHOST_MSG_RESULT_ERR;
>               }
>       }
> @@ -1166,7 +1168,8 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
>                       goto err_mmap;
>               }
> 
> -             populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
> +             populate = (dev->dequeue_zero_copy || dev->async_copy) ?
> +                     MAP_POPULATE : 0;
>               mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
>                                MAP_SHARED | populate, fd, 0);
> 
> @@ -1181,7 +1184,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
>               reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
>                                     mmap_offset;
> 
> -             if (dev->dequeue_zero_copy)
> +             if (dev->dequeue_zero_copy || dev->async_copy)
>                       if (add_guest_pages(dev, reg, alignment) < 0) {
>                               VHOST_LOG_CONFIG(ERR,
>                                       "adding guest pages to region %u failed.\n",
> @@ -1979,6 +1982,12 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
>       } else {
>               rte_free(vq->shadow_used_split);
>               vq->shadow_used_split = NULL;
> +             if (vq->async_pkts_pending)
> +                     rte_free(vq->async_pkts_pending);
> +             if (vq->async_pending_info)
> +                     rte_free(vq->async_pending_info);
> +             vq->async_pkts_pending = NULL;
> +             vq->async_pending_info = NULL;
>       }
> 
>       rte_free(vq->batch_copy_elems);
> @@ -2012,6 +2021,14 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
>               "set queue enable: %d to qp idx: %d\n",
>               enable, index);
> 
> +     if (!enable && dev->virtqueue[index]->async_registered) {
> +             if (dev->virtqueue[index]->async_pkts_inflight_n) {
> +                     VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
> +                             "async inflight packets must be completed first\n");
> +                     return RTE_VHOST_MSG_RESULT_ERR;
> +             }
> +     }
> +
>       /* On disable, rings have to be stopped being processed. */
>       if (!enable && dev->dequeue_zero_copy)
>               drain_zmbuf_list(dev->virtqueue[index]);
> --
> 2.18.4

Reviewed-by: Chenbo Xia <chenbo....@intel.com>
