> -----Original Message-----
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Ouyang Changchun
> Sent: Friday, July 25, 2014 2:03 PM
> To: dev at dpdk.org
> Subject: [dpdk-dev] [PATCH v2] virtio: Support mergeable buffer in virtio PMD
> 
> v2 change:
> - Resolve conflicts with the tip code;
> - And resolve 2 issues:
>    -- fix an mbuf leak when discarding an incomplete packet.
>    -- refine pkt.data to point to the actual start of the payload data.
> 
> v1 change:
> This patch supports the mergeable buffer feature in the DPDK-based virtio
> PMD, which can receive jumbo frames with larger sizes, such as 3K, 4K or
> even 9K.
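
For reference, the diff below relies on the extended receive header that the
virtio spec defines once VIRTIO_NET_F_MRG_RXBUF is negotiated; its extra
num_buffers field tells the driver how many descriptors the device merged
into one packet, and its larger size is why hw->vtnet_hdr_size is switched
in eth_virtio_dev_init():

    struct virtio_net_hdr_mrg_rxbuf {
            struct virtio_net_hdr hdr;   /* the classic virtio net header */
            uint16_t num_buffers;        /* buffers merged into this packet */
    };
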
> 
> Signed-off-by: Changchun Ouyang <changchun.ouyang at intel.com>
> ---
>  lib/librte_pmd_virtio/virtio_ethdev.c |  20 ++--
>  lib/librte_pmd_virtio/virtio_ethdev.h |   3 +
>  lib/librte_pmd_virtio/virtio_rxtx.c   | 206 +++++++++++++++++++++++++++++-----
>  3 files changed, 194 insertions(+), 35 deletions(-)
> 
> diff --git a/lib/librte_pmd_virtio/virtio_ethdev.c b/lib/librte_pmd_virtio/virtio_ethdev.c
> index b9f5529..535d798 100644
> --- a/lib/librte_pmd_virtio/virtio_ethdev.c
> +++ b/lib/librte_pmd_virtio/virtio_ethdev.c
> @@ -337,7 +337,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
>               snprintf(vq_name, sizeof(vq_name), "port%d_tvq%d_hdrzone",
>                       dev->data->port_id, queue_idx);
>               vq->virtio_net_hdr_mz = rte_memzone_reserve_aligned(vq_name,
> -                     vq_size * sizeof(struct virtio_net_hdr),
> +                     vq_size * hw->vtnet_hdr_size,
>                       socket_id, 0, CACHE_LINE_SIZE);
>               if (vq->virtio_net_hdr_mz == NULL) {
>                       rte_free(vq);
> @@ -346,7 +346,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
>               vq->virtio_net_hdr_mem =
>                       vq->virtio_net_hdr_mz->phys_addr;
>               memset(vq->virtio_net_hdr_mz->addr, 0,
> -                     vq_size * sizeof(struct virtio_net_hdr));
> +                     vq_size * hw->vtnet_hdr_size);
>       } else if (queue_type == VTNET_CQ) {
>               /* Allocate a page for control vq command, data and status */
>               snprintf(vq_name, sizeof(vq_name), "port%d_cvq_hdrzone",
> @@ -571,9 +571,6 @@ virtio_negotiate_features(struct virtio_hw *hw)
>       mask |= VIRTIO_NET_F_GUEST_TSO4 | VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN;
>       mask |= VTNET_LRO_FEATURES;
> 
> -     /* rx_mbuf should not be in multiple merged segments */
> -     mask |= VIRTIO_NET_F_MRG_RXBUF;
> -
>       /* not negotiating INDIRECT descriptor table support */
>       mask |= VIRTIO_RING_F_INDIRECT_DESC;
> 
> @@ -746,7 +743,6 @@ eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
>       }
> 
>       eth_dev->dev_ops = &virtio_eth_dev_ops;
> -     eth_dev->rx_pkt_burst = &virtio_recv_pkts;
>       eth_dev->tx_pkt_burst = &virtio_xmit_pkts;
> 
>       if (rte_eal_process_type() == RTE_PROC_SECONDARY)
> @@ -801,10 +797,13 @@ eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
>       virtio_negotiate_features(hw);
> 
>       /* Setting up rx_header size for the device */
> -     if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF))
> +     if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> +             eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts;
>               hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> -     else
> +     } else {
> +             eth_dev->rx_pkt_burst = &virtio_recv_pkts;
>               hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
> +     }
> 
>       /* Allocate memory for storing MAC addresses */
>       eth_dev->data->mac_addrs = rte_zmalloc("virtio", ETHER_ADDR_LEN, 0);
> @@ -1009,7 +1008,7 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
> 
>               while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
>                                       dev->data->rx_queues[i])) != NULL) {
> -                     rte_pktmbuf_free_seg(buf);
> +                     rte_pktmbuf_free(buf);
>                       mbuf_num++;
>               }
> 
> @@ -1028,7 +1027,8 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
>               mbuf_num = 0;
>               while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
>                                       dev->data->tx_queues[i])) != NULL) {
> -                     rte_pktmbuf_free_seg(buf);
> +                     rte_pktmbuf_free(buf);
> +
>                       mbuf_num++;
>               }
> 
> diff --git a/lib/librte_pmd_virtio/virtio_ethdev.h b/lib/librte_pmd_virtio/virtio_ethdev.h
> index 858e644..d2e1eed 100644
> --- a/lib/librte_pmd_virtio/virtio_ethdev.h
> +++ b/lib/librte_pmd_virtio/virtio_ethdev.h
> @@ -104,6 +104,9 @@ int  virtio_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
>  uint16_t virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>               uint16_t nb_pkts);
> 
> +uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> +             uint16_t nb_pkts);
> +
>  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>               uint16_t nb_pkts);
> 
> diff --git a/lib/librte_pmd_virtio/virtio_rxtx.c b/lib/librte_pmd_virtio/virtio_rxtx.c
> index fcd8bd1..3d81b34 100644
> --- a/lib/librte_pmd_virtio/virtio_rxtx.c
> +++ b/lib/librte_pmd_virtio/virtio_rxtx.c
> @@ -146,6 +146,7 @@ static inline int
>  virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
>  {
>       struct vq_desc_extra *dxp;
> +     struct virtio_hw *hw = vq->hw;
>       struct vring_desc *start_dp;
>       uint16_t needed = 1;
>       uint16_t head_idx, idx;
> @@ -165,9 +166,11 @@ virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
>       dxp->ndescs = needed;
> 
>       start_dp = vq->vq_ring.desc;
> -     start_dp[idx].addr  =
> -             (uint64_t) (cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM - sizeof(struct virtio_net_hdr));
> -     start_dp[idx].len   = cookie->buf_len - RTE_PKTMBUF_HEADROOM + sizeof(struct virtio_net_hdr);
> +     start_dp[idx].addr =
> +             (uint64_t)(cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM
> +             - hw->vtnet_hdr_size);
> +     start_dp[idx].len =
> +             cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
>       start_dp[idx].flags =  VRING_DESC_F_WRITE;
>       idx = start_dp[idx].next;
>       vq->vq_desc_head_idx = idx;
> @@ -184,8 +187,10 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
>  {
>       struct vq_desc_extra *dxp;
>       struct vring_desc *start_dp;
> -     uint16_t needed = 2;
> +     uint16_t seg_num = cookie->pkt.nb_segs;
> +     uint16_t needed = 1 + seg_num;
>       uint16_t head_idx, idx;
> +     uint16_t head_size = txvq->hw->vtnet_hdr_size;
> 
>       if (unlikely(txvq->vq_free_cnt == 0))
>               return -ENOSPC;
> @@ -198,19 +203,25 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
>       idx = head_idx;
>       dxp = &txvq->vq_descx[idx];
>       if (dxp->cookie != NULL)
> -             rte_pktmbuf_free_seg(dxp->cookie);
> +             rte_pktmbuf_free(dxp->cookie);
>       dxp->cookie = (void *)cookie;
>       dxp->ndescs = needed;
> 
>       start_dp = txvq->vq_ring.desc;
> -     start_dp[idx].addr  =
> -             txvq->virtio_net_hdr_mem + idx * sizeof(struct virtio_net_hdr);
> -     start_dp[idx].len   = sizeof(struct virtio_net_hdr);
> +     start_dp[idx].addr =
> +             txvq->virtio_net_hdr_mem + idx * head_size;
> +     start_dp[idx].len = (uint32_t)head_size;
>       start_dp[idx].flags = VRING_DESC_F_NEXT;
> -     idx = start_dp[idx].next;
> -     start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
> -     start_dp[idx].len   = cookie->pkt.data_len;
> -     start_dp[idx].flags = 0;
> +
> +     for (; ((seg_num > 0) && (cookie != NULL)); seg_num--) {
> +             idx = start_dp[idx].next;
> +             start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
> +             start_dp[idx].len   = cookie->pkt.data_len;
> +             start_dp[idx].flags = VRING_DESC_F_NEXT;
> +             cookie = cookie->pkt.next;
> +     }
> +
> +     start_dp[idx].flags &= ~VRING_DESC_F_NEXT;
>       idx = start_dp[idx].next;
>       txvq->vq_desc_head_idx = idx;
>       if (txvq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
> @@ -284,7 +295,7 @@ virtio_dev_vring_start(struct virtqueue *vq, int queue_type)
>                       error = virtqueue_enqueue_recv_refill(vq, m);
> 
>                       if (error) {
> -                             rte_pktmbuf_free_seg(m);
> +                             rte_pktmbuf_free(m);
>                               break;
>                       }
>                       nbufs++;
> @@ -423,7 +434,7 @@ virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
>       error = virtqueue_enqueue_recv_refill(vq, m);
>       if (unlikely(error)) {
>               RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
> -             rte_pktmbuf_free_seg(m);
> +             rte_pktmbuf_free(m);
>       }
>  }
> 
> @@ -471,17 +482,158 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> 
>               rxm->pkt.in_port = rxvq->port_id;
>               rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
> +
>               rxm->pkt.nb_segs = 1;
>               rxm->pkt.next = NULL;
> -             rxm->pkt.pkt_len  = (uint32_t)(len[i]
> -                                            - sizeof(struct virtio_net_hdr));
> -             rxm->pkt.data_len = (uint16_t)(len[i]
> -                                            - sizeof(struct virtio_net_hdr));
> +             rxm->pkt.pkt_len = (uint32_t)(len[i] - hw->vtnet_hdr_size);
> +             rxm->pkt.data_len = (uint16_t)(len[i] - hw->vtnet_hdr_size);
> 
>               VIRTIO_DUMP_PACKET(rxm, rxm->pkt.data_len);
> 
>               rx_pkts[nb_rx++] = rxm;
> -             rxvq->bytes += len[i] - sizeof(struct virtio_net_hdr);
> +             rxvq->bytes += rx_pkts[nb_rx - 1]->pkt.pkt_len;
> +     }
> +
> +     rxvq->packets += nb_rx;
> +
> +     /* Allocate new mbuf for the used descriptor */
> +     error = ENOSPC;
> +     while (likely(!virtqueue_full(rxvq))) {
> +             new_mbuf = rte_rxmbuf_alloc(rxvq->mpool);
> +             if (unlikely(new_mbuf == NULL)) {
> +                     struct rte_eth_dev *dev
> +                             = &rte_eth_devices[rxvq->port_id];
> +                     dev->data->rx_mbuf_alloc_failed++;
> +                     break;
> +             }
> +             error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
> +             if (unlikely(error)) {
> +                     rte_pktmbuf_free(new_mbuf);
> +                     break;
> +             }
> +             nb_enqueued++;
> +     }
> +
> +     if (likely(nb_enqueued)) {
> +             if (unlikely(virtqueue_kick_prepare(rxvq))) {
> +                     virtqueue_notify(rxvq);
> +                     PMD_RX_LOG(DEBUG, "Notified\n");
> +             }
> +     }
> +
> +     vq_update_avail_idx(rxvq);
> +
> +     return nb_rx;
> +}
> +
> +uint16_t
> +virtio_recv_mergeable_pkts(void *rx_queue,
> +                     struct rte_mbuf **rx_pkts,
> +                     uint16_t nb_pkts)
> +{
> +     struct virtqueue *rxvq = rx_queue;
> +     struct virtio_hw *hw = rxvq->hw;
> +     struct rte_mbuf *rxm, *new_mbuf;
> +     uint16_t nb_used, num, nb_rx = 0;
> +     uint32_t len[VIRTIO_MBUF_BURST_SZ];
> +     struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
> +     struct rte_mbuf *prev;
> +     int error;
> +     uint32_t i = 0, nb_enqueued = 0;
> +     uint32_t seg_num = 0;
> +     uint16_t extra_idx = 0;
> +
> +     nb_used = VIRTQUEUE_NUSED(rxvq);
> +
> +     rmb();
> +
> +     if (nb_used == 0)
> +             return 0;
> +
> +     while (i < nb_used) {
> +             struct virtio_net_hdr_mrg_rxbuf *header;
> +             char *head_ptr;
> +
> +             if (nb_rx >= nb_pkts)
> +                     break;
> +
> +             num = virtqueue_dequeue_burst_rx(rxvq, rcv_pkts, len, 1);
> +             i += num;
> +
> +             PMD_RX_LOG(DEBUG, "used:%d dequeue:%d\n", nb_used, num);
> +             PMD_RX_LOG(DEBUG, "packet len:%d\n", len[i]);
> +
> +             rxm = rcv_pkts[0];
> +             extra_idx = 0;
> +
> +             if (unlikely(len[0]
> +                          < (uint32_t)hw->vtnet_hdr_size + ETHER_HDR_LEN)) {
> +                     PMD_RX_LOG(ERR, "Packet drop\n");
> +                     nb_enqueued++;
> +                     virtio_discard_rxbuf(rxvq, rxm);
> +                     rxvq->errors++;
> +                     continue;
> +             }
> +
> +             head_ptr = (char *)rxm->pkt.data;
> +             head_ptr -= hw->vtnet_hdr_size;
> +             header = (struct virtio_net_hdr_mrg_rxbuf *)head_ptr;
> +             seg_num = header->num_buffers;
> +
> +             if (seg_num == 0)
> +                     seg_num = 1;
> +
> +             rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
> +             rxm->pkt.nb_segs = seg_num;
> +             rxm->pkt.next = NULL;
> +             rxm->pkt.pkt_len = (uint32_t)(len[0] - hw->vtnet_hdr_size);
> +             rxm->pkt.data_len = (uint16_t)(len[0] - hw->vtnet_hdr_size);
> +
> +             rxm->pkt.in_port = rxvq->port_id;
> +             rx_pkts[nb_rx] = rxm;
> +
> +             prev = rxm;
> +
> +             VIRTIO_DUMP_PACKET(rxm, rxm->pkt.data_len);
Here it might cause a segmentation fault when debug is enabled, presumably
because rxm->pkt.nb_segs is already set to seg_num while pkt.next is still
NULL, so a dump that walks the segment chain can run off the end before the
extra segments are attached below.
> +
> +             /*
> +              * Get extra segments for current uncompleted packet.
> +              */
> +             if (VIRTQUEUE_NUSED(rxvq) >= seg_num - 1) {
> +                     uint32_t rx_num = virtqueue_dequeue_burst_rx(rxvq,
> +                             rcv_pkts, len, seg_num - 1);
> +                     i += rx_num;
> +             } else {
> +                     PMD_RX_LOG(ERR, "No enough segments for packet.\n");
> +                     nb_enqueued++;
> +                     virtio_discard_rxbuf(rxvq, rxm);
> +                     rxvq->errors++;
> +                     break;
Here we should figure out whether the virtio net spec specifies this
behavior. If the backend wants to en-queue a mergeable packet but there
aren't enough available descriptors, may it do so non-atomically, i.e.,
first transfer some of the buffers, update used->idx, and transfer the rest
later?
If that is the case, the handling here will cause the content of the next
segment en-queued later to be treated as a virtio mergeable header.
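
If partial publication is allowed, one defensive option (a rough sketch only,
untested, with hypothetical names) would be to park the incomplete chain in
per-queue state and resume it on the next poll instead of discarding it:

    /* Hypothetical per-queue state for resuming an incomplete mergeable
     * chain across polls; relies on DPDK's struct rte_mbuf. */
    struct mrg_rx_resume_state {
            struct rte_mbuf *head;   /* first segment of the in-flight packet */
            struct rte_mbuf *tail;   /* last segment chained so far */
            uint16_t seg_pending;    /* buffers the backend still owes */
    };

With that, a later used->idx update would extend the pending packet instead
of having its first bytes parsed as a new virtio_net_hdr_mrg_rxbuf.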

> +             }
> +
> +             while (extra_idx < seg_num - 1) {
> +                     rxm = rcv_pkts[extra_idx];
> +
> +                     rxm->pkt.data =
> +                             (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM
> +                             - hw->vtnet_hdr_size;
> +                     rxm->pkt.next = NULL;
> +                     rxm->pkt.pkt_len = (uint32_t)(len[extra_idx]);
> +                     rxm->pkt.data_len = (uint16_t)(len[extra_idx]);
> +
> +                     if (prev)
> +                             prev->pkt.next = rxm;
> +
> +                     prev = rxm;
> +                     rx_pkts[nb_rx]->pkt.pkt_len += rxm->pkt.pkt_len;
> +
> +                     extra_idx++;
> +
> +                     VIRTIO_DUMP_PACKET(rxm, rxm->pkt.data_len);
> +             };
> +
> +             rxvq->bytes += rx_pkts[nb_rx]->pkt.pkt_len;
> +             nb_rx++;
>       }
> 
>       rxvq->packets += nb_rx;
> @@ -498,11 +650,12 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
>               }
>               error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
>               if (unlikely(error)) {
> -                     rte_pktmbuf_free_seg(new_mbuf);
> +                     rte_pktmbuf_free(new_mbuf);
>                       break;
>               }
>               nb_enqueued++;
>       }
> +
>       if (likely(nb_enqueued)) {
>               if (unlikely(virtqueue_kick_prepare(rxvq))) {
>                       virtqueue_notify(rxvq);
> @@ -536,12 +689,15 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
>       num = (uint16_t)(likely(nb_used < VIRTIO_MBUF_BURST_SZ) ? nb_used : VIRTIO_MBUF_BURST_SZ);
> 
>       while (nb_tx < nb_pkts) {
> -             if (virtqueue_full(txvq) && num) {
> -                     virtqueue_dequeue_pkt_tx(txvq);
> -                     num--;
> +             int need = tx_pkts[nb_tx]->pkt.nb_segs - txvq->vq_free_cnt;
> +             if ((need > 0) && (num > 0)) {
> +                     do {
> +                             virtqueue_dequeue_pkt_tx(txvq);
> +                             num--;
> +                     } while (num > 0);
>               }
> 
> -             if (!virtqueue_full(txvq)) {
> +             if (tx_pkts[nb_tx]->pkt.nb_segs <= txvq->vq_free_cnt) {
>                       txm = tx_pkts[nb_tx];
>                       /* Enqueue Packet buffers */
>                       error = virtqueue_enqueue_xmit(txvq, txm);
> @@ -555,7 +711,7 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
>                               break;
>                       }
>                       nb_tx++;
> -                     txvq->bytes += txm->pkt.data_len;
> +                     txvq->bytes += txm->pkt.pkt_len;
>               } else {
>                       PMD_TX_LOG(ERR, "No free tx descriptors to transmit");
>                       break;
> --
> 1.8.4.2
