From: Moti Haimovsky <mo...@mellanox.com>

This patch adds support for transmitting packets spanning multiple
buffers.
It also takes into account the number of Tx queue entries (TXBBs) a
packet occupies when setting the report-completion flag of the chip.

Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 208 ++++++++++++++++++++++++-------------------
 drivers/net/mlx4/mlx4_rxtx.h |   6 +-
 drivers/net/mlx4/mlx4_txq.c  |  12 ++-
 3 files changed, 129 insertions(+), 97 deletions(-)
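
A quick reviewer's sketch of the TXBB accounting involved (stand-in
names only -- TXBB_SIZE and wqe_txbbs() are illustrative, not the
driver's MLX4_TXBB_SIZE/MLX4_SIZE_TO_TXBBS definitions): a WQE is one
16-byte control segment plus one 16-byte data segment per mbuf
segment, rounded up to whole 64-byte basic blocks.

#include <stdint.h>

#define TXBB_SIZE 64 /* Tx basic block size. */

struct ctrl_seg { uint32_t dw[4]; }; /* 16-byte control segment. */
struct data_seg { uint32_t dw[4]; }; /* 16-byte scatter entry. */

/* TXBBs occupied by a WQE carrying nb_segs data segments. */
static inline int
wqe_txbbs(int nb_segs)
{
	int size = sizeof(struct ctrl_seg) +
		   nb_segs * sizeof(struct data_seg);

	return (size + TXBB_SIZE - 1) / TXBB_SIZE;
}

With elts_comp_cd decremented by this count for every posted packet, a
completion is requested roughly once per elts_comp_cd_init TXBBs
consumed rather than once per fixed number of packets, which is what
the srcrb_flags handling below implements.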

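Likewise, a simplified sketch of the deferred byte_count writes done
via the bounce buffer (stand-in struct names; the barriers are only
noted in comments): a byte_count landing on the first DWORD of a TXBB
is written last, so the HCA prefetcher can never see a valid
(!= 0xffffffff) count in front of stale descriptor data.

#include <stdint.h>

#define TXBB_SIZE 64

struct dseg {                /* stand-in for struct mlx4_wqe_data_seg */
	uint32_t byte_count; /* first DWORD of the segment */
	uint32_t lkey;
	uint64_t addr;
};

struct pv {                  /* pointer/value pair (bounce buffer) */
	struct dseg *dseg;
	uint32_t val;
};

static void
write_byte_counts(struct dseg **seg, const uint32_t *count, int n,
		  struct pv *pv)
{
	int i, deferred = 0;

	for (i = 0; i < n; i++) {
		if ((uintptr_t)seg[i] & (TXBB_SIZE - 1)) {
			/* Mid-TXBB: cannot validate the TXBB early. */
			/* rte_io_wmb() here in the real driver. */
			seg[i]->byte_count = count[i];
		} else {
			/* First DWORD of a TXBB: defer the write. */
			pv[deferred].dseg = seg[i];
			pv[deferred++].val = count[i];
		}
	}
	/* rte_io_wmb() in the real driver, then release the TXBBs. */
	while (--deferred >= 0)
		pv[deferred].dseg->byte_count = pv[deferred].val;
}

The loop in mlx4_post_send() below interleaves this rule with building
the segments; the sketch only isolates the write ordering.
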
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index e45bb3b..4200716 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -63,6 +63,16 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+/*
+ * Pointer-value pair structure,
+ * used in mlx4_post_send for saving the first DWORD (32-bit)
+ * of a TXBB.
+ */
+struct pv {
+       struct mlx4_wqe_data_seg *dseg;
+       uint32_t val;
+};
+
 /**
  * Stamp a WQE so it won't be reused by the HW.
  * Routine is used when freeing WQE used by the chip or when failing
@@ -296,34 +306,38 @@
  *
  * @param txq
  *   The Tx queue to post to.
- * @param wr
- *   The work request to handle.
- * @param bad_wr
- *   The wr in case that posting had failed.
+ * @param pkt
+ *   The packet to transmit.
  *
  * @return
  *   0 - success, negative errno value otherwise and rte_errno is set.
  */
 static inline int
 mlx4_post_send(struct txq *txq,
-              struct rte_mbuf *pkt,
-              uint32_t send_flags)
+              struct rte_mbuf *pkt)
 {
        struct mlx4_wqe_ctrl_seg *ctrl;
        struct mlx4_wqe_data_seg *dseg;
        struct mlx4_sq *sq = &txq->msq;
+       struct rte_mbuf *buf;
        uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
        uint32_t lkey;
        uintptr_t addr;
+       uint32_t srcrb_flags;
+       uint32_t owner_opcode = MLX4_OPCODE_SEND;
+       uint32_t byte_count;
        int wqe_real_size;
        int nr_txbbs;
        int rc;
+       struct pv *pv = (struct pv *)txq->bounce_buf;
+       int pv_counter = 0;
 
        /* Calculate the needed work queue entry size for this packet. */
        wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
                        pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
        nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-       /* Check that there is room for this WQE in the send queue and
+       /*
+        * Check that there is room for this WQE in the send queue and
         * that the WQE size is legal.
         */
        if (likely(((sq->head - sq->tail) + nr_txbbs +
@@ -332,76 +346,108 @@
                rc = ENOSPC;
                goto err;
        }
-       /* Get the control and single-data entries of the WQE */
+       /* Get the control and data entries of the WQE. */
        ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
        dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) +
                sizeof(struct mlx4_wqe_ctrl_seg));
-       /*
-        * Fill the data segment with buffer information.
-        */
-       addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-       rte_prefetch0((volatile void *)addr);
-       dseg->addr = rte_cpu_to_be_64(addr);
-       /* Memory region key for this memory pool. */
-       lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(pkt));
-       if (unlikely(lkey == (uint32_t)-1)) {
-               /* MR does not exist. */
-               DEBUG("%p: unable to get MP <-> MR"
-                     " association", (void *)txq);
-               /*
-                * Restamp entry in case of failure.
-                * Make sure that size is written correctly.
-                * Note that we give ownership to the SW, not the HW.
+       /* Fill the data segments with buffer information. */
+       for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
+               addr = rte_pktmbuf_mtod(buf, uintptr_t);
+               rte_prefetch0((volatile void *)addr);
+               /* Handle WQE wraparound. */
+               if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
+                       dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+               dseg->addr = rte_cpu_to_be_64(addr);
+               /* Memory region key for this memory pool. */
+               lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+               if (unlikely(lkey == (uint32_t)-1)) {
+                       /* MR does not exist. */
+                       DEBUG("%p: unable to get MP <-> MR"
+                             " association", (void *)txq);
+                       /*
+                        * Restamp entry in case of failure.
+                        * Make sure that size is written correctly.
+                        * Note that we give ownership to the SW, not the HW.
+                        */
+                       ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+                       mlx4_txq_stamp_freed_wqe(sq, head_idx,
+                                    (sq->head & sq->txbb_cnt) ? 0 : 1);
+                       rc = EFAULT;
+                       goto err;
+               }
+               dseg->lkey = rte_cpu_to_be_32(lkey);
+               if (likely(buf->data_len))
+                       byte_count = rte_cpu_to_be_32(buf->data_len);
+               else
+                       /*
+                        * Zero length segment is treated as inline segment
+                        * with zero data.
+                        */
+                       byte_count = RTE_BE32(0x80000000);
+               /* If the data segment is not at the beginning of a
+                * Tx basic block (TXBB), write the byte count now,
+                * else postpone the write to just before updating the
+                * control segment.
                 */
-               ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-               mlx4_txq_stamp_freed_wqe(sq, head_idx,
-                                        (sq->head & sq->txbb_cnt) ? 0 : 1);
-               rc = EFAULT;
-               goto err;
+               if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+                       /*
+                        * Need a barrier here before writing the byte_count
+                        * fields to make sure that all the data is visible
+                        * before the byte_count field is set.
+                        * Otherwise, if the segment begins a new cacheline,
+                        * the HCA prefetcher could grab the 64-byte chunk and
+                        * get a valid (!= 0xffffffff) byte count but stale
+                        * data, and end up sending the wrong data.
+                        */
+                       rte_io_wmb();
+                       dseg->byte_count = byte_count;
+               } else {
+                       /*
+                        * This data segment starts at the beginning of a new
+                        * TXBB, so we need to postpone its byte_count writing
+                        * for later.
+                        */
+                       pv[pv_counter].dseg = dseg;
+                       pv[pv_counter++].val = byte_count;
+               }
        }
-       dseg->lkey = rte_cpu_to_be_32(lkey);
-       /*
-        * Need a barrier here before writing the byte_count field to
-        * make sure that all the data is visible before the
-        * byte_count field is set.  Otherwise, if the segment begins
-        * a new cacheline, the HCA prefetcher could grab the 64-byte
-        * chunk and get a valid (!= * 0xffffffff) byte count but
-        * stale data, and end up sending the wrong data.
-        */
-       rte_io_wmb();
-       if (likely(pkt->data_len))
-               dseg->byte_count = rte_cpu_to_be_32(pkt->data_len);
-       else
-               /*
-                * Zero length segment is treated as inline segment
-                * with zero data.
-                */
-               dseg->byte_count = RTE_BE32(0x80000000);
-       /*
-        * Fill the control parameters for this packet.
-        * For raw Ethernet, the SOLICIT flag is used to indicate that no icrc
-        * should be calculated
-        */
-       ctrl->srcrb_flags =
-               rte_cpu_to_be_32(MLX4_WQE_CTRL_SOLICIT |
-                                (send_flags & MLX4_WQE_CTRL_CQ_UPDATE));
+       /* Write the first DWORD of each TXBB saved earlier. */
+       if (pv_counter) {
+               /* Need a barrier here before writing the byte_count. */
+               rte_io_wmb();
+               for (--pv_counter; pv_counter >= 0; pv_counter--)
+                       pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+       }
+       /* Fill the control parameters for this packet. */
        ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
        /*
         * The caller should prepare "imm" in advance in order to support
         * VF to VF communication (when the device is a virtual-function
         * device (VF)).
         */
        ctrl->imm = 0;
        /*
+        * For raw Ethernet, the SOLICIT flag is used to indicate that no icrc
+        * should be calculated.
+        */
+       txq->elts_comp_cd -= nr_txbbs;
+       if (unlikely(txq->elts_comp_cd <= 0)) {
+               txq->elts_comp_cd = txq->elts_comp_cd_init;
+               srcrb_flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+                                      MLX4_WQE_CTRL_CQ_UPDATE);
+       } else {
+               srcrb_flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+       }
+       ctrl->srcrb_flags = srcrb_flags;
+       /*
         * Make sure descriptor is fully written before
         * setting ownership bit (because HW can start
         * executing as soon as we do).
         */
-       rte_wmb();
-       ctrl->owner_opcode =
-               rte_cpu_to_be_32(MLX4_OPCODE_SEND |
-                                ((sq->head & sq->txbb_cnt) ?
-                                 MLX4_BIT_WQE_OWN : 0));
+       rte_wmb();
+       ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+                                             ((sq->head & sq->txbb_cnt) ?
+                                              MLX4_BIT_WQE_OWN : 0));
        sq->head += nr_txbbs;
        return 0;
 err:
@@ -428,14 +474,13 @@
        struct txq *txq = (struct txq *)dpdk_txq;
        unsigned int elts_head = txq->elts_head;
        const unsigned int elts_n = txq->elts_n;
-       unsigned int elts_comp_cd = txq->elts_comp_cd;
        unsigned int elts_comp = 0;
        unsigned int bytes_sent = 0;
        unsigned int i;
        unsigned int max;
        int err;
 
-       assert(elts_comp_cd != 0);
+       assert(txq->elts_comp_cd != 0);
        mlx4_txq_complete(txq);
        max = (elts_n - (elts_head - txq->elts_tail));
        if (max > elts_n)
@@ -454,8 +499,6 @@
                        (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
                struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
                struct txq_elt *elt = &(*txq->elts)[elts_head];
-               unsigned int segs = buf->nb_segs;
-               uint32_t send_flags = 0;
 
                /* Clean up old buffer. */
                if (likely(elt->buf != NULL)) {
@@ -473,34 +516,16 @@
                                tmp = next;
                        } while (tmp != NULL);
                }
-               /* Request Tx completion. */
-               if (unlikely(--elts_comp_cd == 0)) {
-                       elts_comp_cd = txq->elts_comp_cd_init;
-                       ++elts_comp;
-                       send_flags |= MLX4_WQE_CTRL_CQ_UPDATE;
-               }
-               if (likely(segs == 1)) {
-                       /* Update element. */
-                       elt->buf = buf;
-                       RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-                       /* post the pkt for sending */
-                       err = mlx4_post_send(txq, buf, send_flags);
-                       if (unlikely(err)) {
-                               if (unlikely(send_flags &
-                                            MLX4_WQE_CTRL_CQ_UPDATE)) {
-                                       elts_comp_cd = 1;
-                                       --elts_comp;
-                               }
-                               elt->buf = NULL;
-                               goto stop;
-                       }
-                       elt->buf = buf;
-                       bytes_sent += buf->pkt_len;
-               } else {
-                       err = -EINVAL;
-                       rte_errno = -err;
+               RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+               /* Post the packet for sending. */
+               err = mlx4_post_send(txq, buf);
+               if (unlikely(err)) {
+                       elt->buf = NULL;
                        goto stop;
                }
+               elt->buf = buf;
+               bytes_sent += buf->pkt_len;
+               ++elts_comp;
                elts_head = elts_head_next;
        }
 stop:
@@ -516,7 +541,6 @@
        rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
        txq->elts_head = elts_head;
        txq->elts_comp += elts_comp;
-       txq->elts_comp_cd = elts_comp_cd;
        return i;
 }
 
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index df83552..1b90533 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -103,13 +103,15 @@ struct txq {
        struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
        unsigned int elts_head; /**< Current index in (*elts)[]. */
        unsigned int elts_tail; /**< First element awaiting completion. */
-       unsigned int elts_comp; /**< Number of completion requests. */
-       unsigned int elts_comp_cd; /**< Countdown for next completion. */
+       unsigned int elts_comp; /**< Number of pkts waiting for completion. */
+       int elts_comp_cd; /**< Countdown for next completion. */
        unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
        unsigned int elts_n; /**< (*elts)[] length. */
        struct txq_elt (*elts)[]; /**< Tx elements. */
        struct mlx4_txq_stats stats; /**< Tx queue counters. */
        uint32_t max_inline; /**< Max inline send size. */
+       char *bounce_buf;
+       /**< Memory used for storing the first DWORD of data TXBBs. */
        struct {
                const struct rte_mempool *mp; /**< Cached memory pool. */
                struct ibv_mr *mr; /**< Memory region (for mp). */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 492779f..9333311 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -83,8 +83,14 @@
                rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->ctrl.socket);
        int ret = 0;
 
-       if (elts == NULL) {
-               ERROR("%p: can't allocate packets array", (void *)txq);
+       /* Allocate bounce buffer memory. */
+       txq->bounce_buf = (char *)rte_zmalloc_socket("TXQ",
+                                                    MLX4_MAX_WQE_SIZE,
+                                                    RTE_CACHE_LINE_MIN_SIZE,
+                                                    txq->ctrl.socket);
+
+       if ((elts == NULL) || (txq->bounce_buf == NULL)) {
+               ERROR("%p: can't allocate TXQ memory", (void *)txq);
                ret = ENOMEM;
                goto error;
        }
@@ -110,6 +116,7 @@
        assert(ret == 0);
        return 0;
 error:
+       rte_free(txq->bounce_buf);
        rte_free(elts);
        DEBUG("%p: failed, freed everything", (void *)txq);
        assert(ret > 0);
@@ -303,7 +310,6 @@ struct txq_mp2mr_mbuf_check_data {
        struct mlx4dv_obj mlxdv;
        struct mlx4dv_qp dv_qp;
        struct mlx4dv_cq dv_cq;
-
        struct txq tmpl = {
                .ctrl = {
                        .priv = priv,
-- 
1.8.3.1
