Merge vhost mergeable RX support.
For vhost TX, the earlier mergeable-buffers patch introduced
virtio_dev_merge_tx and dispatched to either virtio_dev_tx or
virtio_dev_merge_tx depending on whether the vhost device supports the
mergeable feature.
There is no "merge TX" as such: that change is really a fix for the
memcpy from a chained vring descriptor to a chained mbuf.
Use virtio_dev_merge_tx as the base for vhost TX.

Signed-off-by: Huawei Xie <huawei.xie@intel.com>
---
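Reviewer note: with this change rte_vhost_enqueue_burst() reduces to a
feature-bit dispatch. The sketch below is condensed from the final hunk
of this patch; the real functions carry the full descriptor-reservation
and copy logic.

    uint32_t
    rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
            struct rte_mbuf **pkts, uint32_t count)
    {
            /*
             * Mergeable RX buffers negotiated: take the scatter-RX path,
             * which may spread one packet across several descriptor
             * chains and reports the chain count via the
             * virtio_net_hdr_mrg_rxbuf header.
             */
            if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
                    return virtio_dev_merge_rx(dev, queue_id, pkts, count);
            /* Otherwise each packet fits one descriptor chain. */
            return virtio_dev_rx(dev, queue_id, pkts, count);
    }
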
 lib/librte_vhost/rte_virtio_net.h |  16 +-
 lib/librte_vhost/vhost_rxtx.c     | 568 +++++++++++++++++++++++++++++++++-----
 2 files changed, 511 insertions(+), 73 deletions(-)

diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
index 08dc6f4..99ddfc1 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -53,9 +53,18 @@
 /* Enum for virtqueue management. */
 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

-
-/*
- * Structure contains variables relevant to TX/RX virtqueues.
+#define BUF_VECTOR_MAX 256
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+       uint64_t buf_addr;
+       uint32_t buf_len;
+       uint32_t desc_idx;
+};
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
  */
 struct vhost_virtqueue
 {
@@ -69,6 +78,7 @@ struct vhost_virtqueue
        volatile uint16_t       last_used_idx_res;      /* Used for multiple devices reserving buffers. */
        eventfd_t                       callfd;                         /* Currently unused as polling mode is enabled. */
        eventfd_t                       kickfd;                         /* Used to notify the guest (trigger interrupt). */
+       struct buf_vector    buf_vec[BUF_VECTOR_MAX]; /**< for scatter RX. */
 } __rte_cache_aligned;


diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 0d96c43..81368e6 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -49,8 +49,8 @@
  * count is returned to indicate the number of packets that were successfully
  * added to the RX queue. This function works when mergeable is disabled.
  */
-uint32_t
-rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count)
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count)
 {
        struct vhost_virtqueue *vq;
        struct vring_desc *desc;
@@ -61,7 +61,6 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb
        uint64_t buff_hdr_addr = 0;
        uint32_t head[VHOST_MAX_PKT_BURST], packet_len = 0;
        uint32_t head_idx, packet_success = 0;
-       uint32_t mergeable, mrg_count = 0;
        uint16_t avail_idx, res_cur_idx;
        uint16_t res_base_idx, res_end_idx;
        uint16_t free_entries;
@@ -101,9 +100,6 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb
        /* Prefetch available ring to retrieve indexes. */
        rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

-       /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
-       mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
-
        /* Retrieve all of the head indexes first to avoid caching issues. */
        for (head_idx = 0; head_idx < count; head_idx++)
                head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
@@ -122,27 +118,23 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb
                /* Prefetch buffer address. */
                rte_prefetch0((void*)(uintptr_t)buff_addr);

-               if (mergeable && (mrg_count != 0)) {
-                       desc->len = packet_len = rte_pktmbuf_data_len(buff);
+               /* Copy virtio_hdr to packet and increment buffer address */
+               buff_hdr_addr = buff_addr;
+               packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
+
+               /*
+                * If the descriptors are chained the header and data are placed in
+                * separate buffers.
+                */
+               if (desc->flags & VRING_DESC_F_NEXT) {
+                       desc->len = vq->vhost_hlen;
+                       desc = &vq->desc[desc->next];
+                       /* Buffer address translation. */
+                       buff_addr = gpa_to_vva(dev, desc->addr);
+                       desc->len = rte_pktmbuf_data_len(buff);
                } else {
-                       /* Copy virtio_hdr to packet and increment buffer address */
-                       buff_hdr_addr = buff_addr;
-                       packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
-
-                       /*
-                        * If the descriptors are chained the header and data are placed in
-                        * separate buffers.
-                        */
-                       if (desc->flags & VRING_DESC_F_NEXT) {
-                               desc->len = vq->vhost_hlen;
-                               desc = &vq->desc[desc->next];
-                               /* Buffer address translation. */
-                               buff_addr = gpa_to_vva(dev, desc->addr);
-                               desc->len = rte_pktmbuf_data_len(buff);
-                       } else {
-                               buff_addr += vq->vhost_hlen;
-                               desc->len = packet_len;
-                       }
+                       buff_addr += vq->vhost_hlen;
+                       desc->len = packet_len;
                }


@@ -161,21 +153,9 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb
                res_cur_idx++;
                packet_success++;

-               /* If mergeable is disabled then a header is required per buffer. */
-               if (!mergeable) {
-                       rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
-                       VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
-               } else {
-                       mrg_count++;
-                       /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
-                       if ((mrg_count == VHOST_MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
-                               virtio_hdr.num_buffers = mrg_count;
-                               LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
-                               rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
-                               VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
-                               mrg_count = 0;
-                       }
-               }
+               rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr,
+                               vq->vhost_hlen);
+               VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
                if (res_cur_idx < res_end_idx) {
                        /* Prefetch descriptor index. */
                        rte_prefetch0(&vq->desc[head[packet_success]]);
@@ -197,18 +177,357 @@ rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mb
        return count;
 }

+static inline uint32_t __attribute__((always_inline))
+copy_from_mbuf_to_vring(struct virtio_net *dev, uint16_t res_base_idx,
+       uint16_t res_end_idx, struct rte_mbuf *pkt)
+{
+       uint32_t vec_idx = 0;
+       uint32_t entry_success = 0;
+       struct vhost_virtqueue *vq;
+       /* The virtio_hdr is initialised to 0. */
+       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
+               {0, 0, 0, 0, 0, 0}, 0};
+       uint16_t cur_idx = res_base_idx;
+       uint64_t vb_addr = 0;
+       uint64_t vb_hdr_addr = 0;
+       uint32_t seg_offset = 0;
+       uint32_t vb_offset = 0;
+       uint32_t seg_avail;
+       uint32_t vb_avail;
+       uint32_t cpy_len, entry_len;
+
+       if (pkt == NULL)
+               return 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
+               "End Index %d\n",
+               dev->device_fh, cur_idx, res_end_idx);
+
+       /*
+        * Convert from gpa to vva
+        * (guest physical addr -> vhost virtual addr)
+        */
+       vq = dev->virtqueue[VIRTIO_RXQ];
+       vb_addr =
+               gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+       vb_hdr_addr = vb_addr;
+
+       /* Prefetch buffer address. */
+       rte_prefetch0((void *)(uintptr_t)vb_addr);
+
+       virtio_hdr.num_buffers = res_end_idx - res_base_idx;
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
+               dev->device_fh, virtio_hdr.num_buffers);
+
+       rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
+               (const void *)&virtio_hdr, vq->vhost_hlen);
+
+       VHOST_PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
+
+       seg_avail = rte_pktmbuf_data_len(pkt);
+       vb_offset = vq->vhost_hlen;
+       vb_avail =
+               vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
+
+       entry_len = vq->vhost_hlen;
+
+       if (vb_avail == 0) {
+               uint32_t desc_idx =
+                       vq->buf_vec[vec_idx].desc_idx;
+               vq->desc[desc_idx].len = vq->vhost_hlen;
+
+               if ((vq->desc[desc_idx].flags
+                       & VRING_DESC_F_NEXT) == 0) {
+                       /* Update used ring with desc information */
+                       vq->used->ring[cur_idx & (vq->size - 1)].id
+                               = vq->buf_vec[vec_idx].desc_idx;
+                       vq->used->ring[cur_idx & (vq->size - 1)].len
+                               = entry_len;
+
+                       entry_len = 0;
+                       cur_idx++;
+                       entry_success++;
+               }
+
+               vec_idx++;
+               vb_addr =
+                       gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+
+               /* Prefetch buffer address. */
+               rte_prefetch0((void *)(uintptr_t)vb_addr);
+               vb_offset = 0;
+               vb_avail = vq->buf_vec[vec_idx].buf_len;
+       }
+
+       cpy_len = RTE_MIN(vb_avail, seg_avail);
+
+       while (cpy_len > 0) {
+               /* Copy mbuf data to vring buffer */
+               rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
+                       (const void *)(rte_pktmbuf_mtod(pkt, char *) + seg_offset),
+                       cpy_len);
+
+               VHOST_PRINT_PACKET(dev,
+                       (uintptr_t)(vb_addr + vb_offset),
+                       cpy_len, 0);
+
+               seg_offset += cpy_len;
+               vb_offset += cpy_len;
+               seg_avail -= cpy_len;
+               vb_avail -= cpy_len;
+               entry_len += cpy_len;
+
+               if (seg_avail != 0) {
+                       /*
+                        * The virtio buffer in this vring entry
+                        * reaches its end, but the mbuf segment
+                        * is not yet complete.
+                        */
+                       if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
+                               VRING_DESC_F_NEXT) == 0) {
+                               /* Update used ring with desc information */
+                               vq->used->ring[cur_idx & (vq->size - 1)].id
+                                       = vq->buf_vec[vec_idx].desc_idx;
+                               vq->used->ring[cur_idx & (vq->size - 1)].len
+                                       = entry_len;
+                               entry_len = 0;
+                               cur_idx++;
+                               entry_success++;
+                       }
+
+                       vec_idx++;
+                       vb_addr = gpa_to_vva(dev,
+                               vq->buf_vec[vec_idx].buf_addr);
+                       vb_offset = 0;
+                       vb_avail = vq->buf_vec[vec_idx].buf_len;
+                       cpy_len = RTE_MIN(vb_avail, seg_avail);
+               } else {
+                       /*
+                        * The current segment is complete; continue to
+                        * check whether the whole packet is complete.
+                        */
+                       pkt = pkt->pkt.next;
+                       if (pkt != NULL) {
+                               /*
+                                * There are more segments.
+                                */
+                               if (vb_avail == 0) {
+                                       /*
+                                        * The current vring buffer is
+                                        * used up; fetch the next buffer
+                                        * from buf_vec.
+                                        */
+                                       uint32_t desc_idx =
+                                               vq->buf_vec[vec_idx].desc_idx;
+                                       vq->desc[desc_idx].len = vb_offset;
+
+                                       if ((vq->desc[desc_idx].flags &
+                                               VRING_DESC_F_NEXT) == 0) {
+                                               uint16_t wrapped_idx =
+                                                       cur_idx & (vq->size - 1);
+                                               /*
+                                                * Update used ring with the
+                                                * descriptor information
+                                                */
+                                               vq->used->ring[wrapped_idx].id
+                                                       = desc_idx;
+                                               vq->used->ring[wrapped_idx].len
+                                                       = entry_len;
+                                               entry_success++;
+                                               entry_len = 0;
+                                               cur_idx++;
+                                       }
+
+                                       /* Get next buffer from buf_vec. */
+                                       vec_idx++;
+                                       vb_addr = gpa_to_vva(dev,
+                                               vq->buf_vec[vec_idx].buf_addr);
+                                       vb_avail =
+                                               vq->buf_vec[vec_idx].buf_len;
+                                       vb_offset = 0;
+                               }
+
+                               seg_offset = 0;
+                               seg_avail = rte_pktmbuf_data_len(pkt);
+                               cpy_len = RTE_MIN(vb_avail, seg_avail);
+                       } else {
+                               /*
+                                * The whole packet is complete.
+                                */
+                               uint32_t desc_idx =
+                                       vq->buf_vec[vec_idx].desc_idx;
+                               vq->desc[desc_idx].len = vb_offset;
+
+                               while (vq->desc[desc_idx].flags &
+                                       VRING_DESC_F_NEXT) {
+                                       desc_idx = vq->desc[desc_idx].next;
+                                       vq->desc[desc_idx].len = 0;
+                               }
+
+                               /* Update used ring with desc information */
+                               vq->used->ring[cur_idx & (vq->size - 1)].id
+                                       = vq->buf_vec[vec_idx].desc_idx;
+                               vq->used->ring[cur_idx & (vq->size - 1)].len
+                                       = entry_len;
+                               entry_len = 0;
+                               cur_idx++;
+                               entry_success++;
+                               seg_avail = 0;
+                               cpy_len = RTE_MIN(vb_avail, seg_avail);
+                       }
+               }
+       }
+
+       return entry_success;
+}
+
+/*
+ * This function works for mergeable RX.
+ */
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts,
+       uint32_t count)
+{
+       struct vhost_virtqueue *vq;
+       uint32_t pkt_idx = 0, entry_success = 0;
+       uint16_t avail_idx, res_cur_idx;
+       uint16_t res_base_idx, res_end_idx;
+       uint8_t success = 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
+               dev->device_fh);
+       if (unlikely(queue_id != VIRTIO_RXQ)) {
+               LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
+       }
+
+       vq = dev->virtqueue[VIRTIO_RXQ];
+       count = RTE_MIN((uint32_t)VHOST_MAX_PKT_BURST, count);
+
+       if (count == 0)
+               return 0;
+
+       for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+               uint32_t secure_len = 0;
+               uint16_t need_cnt;
+               uint32_t vec_idx = 0;
+               uint32_t pkt_len = pkts[pkt_idx]->pkt.pkt_len + vq->vhost_hlen;
+               uint16_t i, id;
+
+               do {
+                       /*
+                        * As many data cores may want access to available
+                        * buffers, they need to be reserved.
+                        */
+                       res_base_idx = vq->last_used_idx_res;
+                       res_cur_idx = res_base_idx;
+
+                       do {
+                               avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+                               if (unlikely(res_cur_idx == avail_idx)) {
+                                       LOG_DEBUG(VHOST_DATA,
+                                               "(%"PRIu64") Failed "
+                                               "to get enough desc from "
+                                               "vring\n",
+                                               dev->device_fh);
+                                       return pkt_idx;
+                               } else {
+                                       uint16_t wrapped_idx =
+                                               (res_cur_idx) & (vq->size - 1);
+                                       uint32_t idx =
+                                               vq->avail->ring[wrapped_idx];
+                                       uint8_t next_desc;
+
+                                       do {
+                                               next_desc = 0;
+                                               secure_len += vq->desc[idx].len;
+                                               if (vq->desc[idx].flags &
+                                                       VRING_DESC_F_NEXT) {
+                                                       idx = vq->desc[idx].next;
+                                                       next_desc = 1;
+                                               }
+                                       } while (next_desc);
+
+                                       res_cur_idx++;
+                               }
+                       } while (pkt_len > secure_len);
+
+                       /* vq->last_used_idx_res is atomically updated. */
+                       success = rte_atomic16_cmpset(&vq->last_used_idx_res,
+                                                       res_base_idx,
+                                                       res_cur_idx);
+               } while (success == 0);
+
+               id = res_base_idx;
+               need_cnt = res_cur_idx - res_base_idx;
+
+               for (i = 0; i < need_cnt; i++, id++) {
+                       uint16_t wrapped_idx = id & (vq->size - 1);
+                       uint32_t idx = vq->avail->ring[wrapped_idx];
+                       uint8_t next_desc;
+                       do {
+                               next_desc = 0;
+                               vq->buf_vec[vec_idx].buf_addr =
+                                       vq->desc[idx].addr;
+                               vq->buf_vec[vec_idx].buf_len =
+                                       vq->desc[idx].len;
+                               vq->buf_vec[vec_idx].desc_idx = idx;
+                               vec_idx++;
+
+                               if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
+                                       idx = vq->desc[idx].next;
+                                       next_desc = 1;
+                               }
+                       } while (next_desc);
+               }
+
+               res_end_idx = res_cur_idx;
+
+               entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
+                       res_end_idx, pkts[pkt_idx]);
+
+               rte_compiler_barrier();
+
+               /*
+                * Wait until it's our turn to add our buffer
+                * to the used ring.
+                */
+               while (unlikely(vq->last_used_idx != res_base_idx))
+                       rte_pause();
+
+               *(volatile uint16_t *)&vq->used->idx += entry_success;
+               vq->last_used_idx = res_end_idx;
+
+               /* Kick the guest if necessary. */
+               if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
+                       eventfd_write((int)vq->kickfd, 1);
+       }
+
+       return count;
+}
+
+uint32_t
+rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count)
+{
+       if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
+               return virtio_dev_merge_rx(dev, queue_id, pkts, count);
+       else
+               return virtio_dev_rx(dev, queue_id, pkts, count);
+}
+

 uint32_t
 rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count)
 {
-       struct rte_mbuf *mbuf;
+       struct rte_mbuf *m, *prev;
        struct vhost_virtqueue *vq;
        struct vring_desc *desc;
-       uint64_t buff_addr = 0;
+       uint64_t vb_addr = 0;
        uint32_t head[VHOST_MAX_PKT_BURST];
        uint32_t used_idx;
        uint32_t i;
-       uint16_t free_entries, packet_success = 0;
+       uint16_t free_entries, entry_success = 0;
        uint16_t avail_idx;

        if (unlikely(queue_id != VIRTIO_TXQ)) {
@@ -223,7 +542,8 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_me
        if (vq->last_used_idx == avail_idx)
                return 0;

-       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s()\n", dev->device_fh,
+               __func__);

        /* Prefetch available ring to retrieve head indexes. */
        rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
@@ -231,11 +551,9 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_me
        /*get the number of free entries in the ring*/
        free_entries = (avail_idx - vq->last_used_idx);

-       if (free_entries > count)
-               free_entries = count;
+       free_entries = RTE_MIN(free_entries, count);
        /* Limit to MAX_PKT_BURST. */
-       if (free_entries > VHOST_MAX_PKT_BURST)
-               free_entries = VHOST_MAX_PKT_BURST;
+       free_entries = RTE_MIN(free_entries, VHOST_MAX_PKT_BURST);

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
        /* Retrieve all of the head indexes first to avoid caching issues. */
@@ -243,56 +561,166 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_me
                head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

        /* Prefetch descriptor index. */
-       rte_prefetch0(&vq->desc[head[packet_success]]);
+       rte_prefetch0(&vq->desc[head[entry_success]]);
        rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

-       while (packet_success < free_entries) {
-               desc = &vq->desc[head[packet_success]];
+       while (entry_success < free_entries) {
+               uint32_t vb_avail, vb_offset;
+               uint32_t seg_avail, seg_offset;
+               uint32_t cpy_len;
+               uint32_t seg_num = 0;
+               struct rte_mbuf *cur;
+               uint8_t alloc_err = 0;
+
+               desc = &vq->desc[head[entry_success]];

                /* Discard first buffer as it is the virtio header */
                desc = &vq->desc[desc->next];

                /* Buffer address translation. */
-               buff_addr = gpa_to_vva(dev, desc->addr);
+               vb_addr = gpa_to_vva(dev, desc->addr);
                /* Prefetch buffer address. */
-               rte_prefetch0((void*)(uintptr_t)buff_addr);
+               rte_prefetch0((void *)(uintptr_t)vb_addr);

                used_idx = vq->last_used_idx & (vq->size - 1);

-               if (packet_success < (free_entries - 1)) {
+               if (entry_success < (free_entries - 1)) {
                        /* Prefetch descriptor index. */
-                       rte_prefetch0(&vq->desc[head[packet_success+1]]);
+                       rte_prefetch0(&vq->desc[head[entry_success+1]]);
                        rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
                }

                /* Update used index buffer information. */
-               vq->used->ring[used_idx].id = head[packet_success];
+               vq->used->ring[used_idx].id = head[entry_success];
                vq->used->ring[used_idx].len = 0;

-               mbuf = rte_pktmbuf_alloc(mbuf_pool);
-               if (unlikely(mbuf == NULL)) {
-                       RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
-                       return packet_success;
+               vb_offset = 0;
+               vb_avail = desc->len;
+
+               seg_avail = 0;
+               /* Allocate an mbuf and populate the structure. */
+               m = rte_pktmbuf_alloc(mbuf_pool);
+               if (unlikely(m == NULL)) {
+                       RTE_LOG(ERR, VHOST_DATA,
+                               "Failed to allocate memory for mbuf.\n");
+                       return entry_success;
                }
-               mbuf->pkt.data_len = desc->len;
-               mbuf->pkt.pkt_len  = mbuf->pkt.data_len;
+               seg_offset = 0;
+               seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
+               cpy_len = RTE_MIN(vb_avail, seg_avail);
+
+               VHOST_PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
+
+
+               seg_num++;
+               cur = m;
+               prev = m;
+               while (cpy_len != 0) {
+                       rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
+                               (void *)((uintptr_t)(vb_addr + vb_offset)),
+                               cpy_len);
+
+                       seg_offset += cpy_len;
+                       vb_offset += cpy_len;
+                       vb_avail -= cpy_len;
+                       seg_avail -= cpy_len;
+
+                       if (vb_avail != 0) {
+                               /*
+                                * The segment reaches its end, but the
+                                * virtio buffer in the TX vring still has
+                                * more data to be copied.
+                                */
+                               cur->pkt.data_len = seg_offset;
+                               m->pkt.pkt_len += seg_offset;
+                               /* Allocate mbuf and populate the structure. */
+                               cur = rte_pktmbuf_alloc(mbuf_pool);
+                               if (unlikely(cur == NULL)) {
+                                       RTE_LOG(ERR, VHOST_DATA, "Failed to "
+                                               "allocate memory for mbuf.\n");
+                                       rte_pktmbuf_free(m);
+                                       alloc_err = 1;
+                                       break;
+                               }
+
+                               seg_num++;
+                               prev->pkt.next = cur;
+                               prev = cur;
+                               seg_offset = 0;
+                               seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+                       } else {
+                               if (desc->flags & VRING_DESC_F_NEXT) {
+                                       /*
+                                        * There are more virtio buffers in
+                                        * same vring entry need to be copied.
+                                        */
+                                       if (seg_avail == 0) {
+                                               /*
+                                                * The current segment hasn't
+                                                * room to accommodate more
+                                                * data.
+                                                */
+                                               cur->pkt.data_len = seg_offset;
+                                               m->pkt.pkt_len += seg_offset;
+                                               /*
+                                                * Allocate an mbuf and
+                                                * populate the structure.
+                                                */
+                                               cur = rte_pktmbuf_alloc(mbuf_pool);
+                                               if (unlikely(cur == NULL)) {
+                                                       RTE_LOG(ERR,
+                                                               VHOST_DATA,
+                                                               "Failed to "
+                                                               "allocate memory "
+                                                               "for mbuf\n");
+                                                       rte_pktmbuf_free(m);
+                                                       alloc_err = 1;
+                                                       break;
+                                               }
+                                               seg_num++;
+                                               prev->pkt.next = cur;
+                                               prev = cur;
+                                               seg_offset = 0;
+                                               seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+                                       }
+
+                                       desc = &vq->desc[desc->next];
+
+                                       /* Buffer address translation. */
+                                       vb_addr = gpa_to_vva(dev, desc->addr);
+                                       /* Prefetch buffer address. */
+                                       rte_prefetch0((void *)(uintptr_t)vb_addr);
+                                       vb_offset = 0;
+                                       vb_avail = desc->len;
+
+                                       VHOST_PRINT_PACKET(dev, (uintptr_t)vb_addr,
+                                               desc->len, 0);
+                               } else {
+                                       /* The whole packet completes. */
+                                       cur->pkt.data_len = seg_offset;
+                                       m->pkt.pkt_len += seg_offset;
+                                       vb_avail = 0;
+                               }
+                       }

-               rte_memcpy((void *) mbuf->pkt.data,
-                       (const void *) buff_addr, mbuf->pkt.data_len);
+                       cpy_len = RTE_MIN(vb_avail, seg_avail);
+               }

-               pkts[packet_success] = mbuf;
+               if (unlikely(alloc_err == 1))
+                       break;

-               VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
+               m->pkt.nb_segs = seg_num;

+               pkts[entry_success] = m;
                vq->last_used_idx++;
-               packet_success++;
+               entry_success++;
        }

        rte_compiler_barrier();
-       vq->used->idx += packet_success;
+       vq->used->idx += entry_success;
        /* Kick guest if required. */
        if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
                eventfd_write((int)vq->kickfd, 1);
+       return entry_success;

-       return packet_success;
 }
-- 
1.8.1.4
