From: Moti Haimovsky <mo...@mellanox.com>

This patch adds support for transmitting packets spanning multiple buffers. It also takes into account the number of entries a packet occupies in the Tx queue when setting the chip's report-completion flag.
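To illustrate the second point, here is a minimal standalone sketch of the completion-request countdown described above; the struct and function names are illustrative stand-ins, not the driver's API:

    #include <stdbool.h>

    /* Illustrative stand-in for the driver's Tx queue counters. */
    struct sketch_txq {
            int elts_comp_cd;               /* TXBBs left until next completion */
            unsigned int elts_comp_cd_init; /* countdown reload value */
    };

    /*
     * Decrement the countdown by the number of Tx basic blocks (TXBBs)
     * this WQE occupies; request a completion (MLX4_WQE_CTRL_CQ_UPDATE)
     * only when the countdown expires.
     */
    static bool
    sketch_request_completion(struct sketch_txq *txq, int nr_txbbs)
    {
            txq->elts_comp_cd -= nr_txbbs;
            if (txq->elts_comp_cd <= 0) {
                    txq->elts_comp_cd = txq->elts_comp_cd_init;
                    return true;
            }
            return false;
    }

Because a multi-segment packet can occupy several TXBBs, decrementing by nr_txbbs keeps the completion rate proportional to queue usage rather than to the packet count.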
Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 208 ++++++++++++++++++++++++-------------------
 drivers/net/mlx4/mlx4_rxtx.h |   6 +-
 drivers/net/mlx4/mlx4_txq.c  |  12 ++-
 3 files changed, 129 insertions(+), 97 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index e45bb3b..4200716 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -63,6 +63,16 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+/*
+ * Pointer-value pair structure
+ * used in mlx4_post_send for saving the first DWORD (32 bits)
+ * of a TXBB.
+ */
+struct pv {
+	struct mlx4_wqe_data_seg *dseg;
+	uint32_t val;
+};
+
 /**
  * Stamp a WQE so it won't be reused by the HW.
  * Routine is used when freeing WQE used by the chip or when failing
@@ -296,34 +306,38 @@
  *
  * @param txq
  *   The Tx queue to post to.
- * @param wr
- *   The work request to handle.
- * @param bad_wr
- *   The wr in case that posting had failed.
+ * @param pkt
+ *   The packet to transmit.
  *
  * @return
  *   0 - success, negative errno value otherwise and rte_errno is set.
  */
 static inline int
 mlx4_post_send(struct txq *txq,
-	       struct rte_mbuf *pkt,
-	       uint32_t send_flags)
+	       struct rte_mbuf *pkt)
 {
 	struct mlx4_wqe_ctrl_seg *ctrl;
 	struct mlx4_wqe_data_seg *dseg;
 	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *buf;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 	uint32_t lkey;
 	uintptr_t addr;
+	uint32_t srcrb_flags;
+	uint32_t owner_opcode = MLX4_OPCODE_SEND;
+	uint32_t byte_count;
 	int wqe_real_size;
 	int nr_txbbs;
 	int rc;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	int pv_counter = 0;
 
 	/* Calculate the needed work queue entry size for this packet. */
 	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
 			pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
 	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
-	/* Check that there is room for this WQE in the send queue and
+	/*
+	 * Check that there is room for this WQE in the send queue and
 	 * that the WQE size is legal.
 	 */
 	if (likely(((sq->head - sq->tail) + nr_txbbs +
@@ -332,76 +346,108 @@
 		rc = ENOSPC;
 		goto err;
 	}
-	/* Get the control and single-data entries of the WQE */
+	/* Get the control and data entries of the WQE. */
 	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
 	dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) +
 		sizeof(struct mlx4_wqe_ctrl_seg));
-	/*
-	 * Fill the data segment with buffer information.
-	 */
-	addr = rte_pktmbuf_mtod(pkt, uintptr_t);
-	rte_prefetch0((volatile void *)addr);
-	dseg->addr = rte_cpu_to_be_64(addr);
-	/* Memory region key for this memory pool. */
-	lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(pkt));
-	if (unlikely(lkey == (uint32_t)-1)) {
-		/* MR does not exist. */
-		DEBUG("%p: unable to get MP <-> MR"
-		      " association", (void *)txq);
-		/*
-		 * Restamp entry in case of failure.
-		 * Make sure that size is written correctly.
-		 * Note that we give ownership to the SW, not the HW.
+	/* Fill the data segments with buffer information. */
+	for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(buf, uintptr_t);
+		rte_prefetch0((volatile void *)addr);
+		/* Handle WQE wraparound. */
+		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
+			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		dseg->addr = rte_cpu_to_be_64(addr);
+		/* Memory region key for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+		if (unlikely(lkey == (uint32_t)-1)) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR"
+			      " association", (void *)txq);
+			/*
+			 * Restamp entry in case of failure.
+			 * Make sure that size is written correctly.
+			 * Note that we give ownership to the SW, not the HW.
+			 */
+			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+			mlx4_txq_stamp_freed_wqe(sq, head_idx,
+				     (sq->head & sq->txbb_cnt) ? 0 : 1);
+			rc = EFAULT;
+			goto err;
+		}
+		dseg->lkey = rte_cpu_to_be_32(lkey);
+		if (likely(buf->data_len))
+			byte_count = rte_cpu_to_be_32(buf->data_len);
+		else
+			/*
+			 * A zero-length segment is treated as an inline
+			 * segment with zero data.
+			 */
+			byte_count = RTE_BE32(0x80000000);
+		/* If the data segment is not at the beginning of a
+		 * Tx basic block (TXBB) then write the byte count,
+		 * else postpone the writing until just before updating
+		 * the control segment. */
-	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-	mlx4_txq_stamp_freed_wqe(sq, head_idx,
-		     (sq->head & sq->txbb_cnt) ? 0 : 1);
-	rc = EFAULT;
-	goto err;
+		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+			/*
+			 * Need a barrier here before writing the byte_count
+			 * fields to make sure that all the data is visible
+			 * before the byte_count field is set.
+			 * Otherwise, if the segment begins a new cacheline,
+			 * the HCA prefetcher could grab the 64-byte chunk
+			 * and get a valid (!= 0xffffffff) byte count but
+			 * stale data, and end up sending the wrong data.
+			 */
+			rte_io_wmb();
+			dseg->byte_count = byte_count;
+		} else {
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[pv_counter].dseg = dseg;
+			pv[pv_counter++].val = byte_count;
+		}
 	}
-	dseg->lkey = rte_cpu_to_be_32(lkey);
-	/*
-	 * Need a barrier here before writing the byte_count field to
-	 * make sure that all the data is visible before the
-	 * byte_count field is set. Otherwise, if the segment begins
-	 * a new cacheline, the HCA prefetcher could grab the 64-byte
-	 * chunk and get a valid (!= * 0xffffffff) byte count but
-	 * stale data, and end up sending the wrong data.
-	 */
-	rte_io_wmb();
-	if (likely(pkt->data_len))
-		dseg->byte_count = rte_cpu_to_be_32(pkt->data_len);
-	else
-		/*
-		 * Zero length segment is treated as inline segment
-		 * with zero data.
-		 */
-		dseg->byte_count = RTE_BE32(0x80000000);
-	/*
-	 * Fill the control parameters for this packet.
-	 * For raw Ethernet, the SOLICIT flag is used to indicate that no icrc
-	 * should be calculated
-	 */
-	ctrl->srcrb_flags =
-		rte_cpu_to_be_32(MLX4_WQE_CTRL_SOLICIT |
-				 (send_flags & MLX4_WQE_CTRL_CQ_UPDATE));
+	/* Write the first DWORD of each TXBB saved earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte counts. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	/* Fill the control parameters for this packet. */
 	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 	/*
 	 * The caller should prepare "imm" in advance in order to support
 	 * VF to VF communication (when the device is a virtual-function
 	 * device (VF)).
-	 */
+	 */
 	ctrl->imm = 0;
 	/*
+	 * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC
+	 * should be calculated.
+	 */
+	txq->elts_comp_cd -= nr_txbbs;
+	if (unlikely(txq->elts_comp_cd <= 0)) {
+		txq->elts_comp_cd = txq->elts_comp_cd_init;
+		srcrb_flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+				       MLX4_WQE_CTRL_CQ_UPDATE);
+	} else {
+		srcrb_flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+	}
+	ctrl->srcrb_flags = srcrb_flags;
+	/*
 	 * Make sure descriptor is fully written before
 	 * setting ownership bit (because HW can start
 	 * executing as soon as we do).
 	 */
-	rte_wmb();
-	ctrl->owner_opcode =
-		rte_cpu_to_be_32(MLX4_OPCODE_SEND |
-				 ((sq->head & sq->txbb_cnt) ?
-				  MLX4_BIT_WQE_OWN : 0));
+	rte_wmb();
+	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
					       MLX4_BIT_WQE_OWN : 0));
 	sq->head += nr_txbbs;
 	return 0;
 err:
@@ -428,14 +474,13 @@
 	struct txq *txq = (struct txq *)dpdk_txq;
 	unsigned int elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp_cd = txq->elts_comp_cd;
 	unsigned int elts_comp = 0;
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
 	int err;
 
-	assert(elts_comp_cd != 0);
+	assert(txq->elts_comp_cd != 0);
 	mlx4_txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
@@ -454,8 +499,6 @@
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
-		unsigned int segs = buf->nb_segs;
-		uint32_t send_flags = 0;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -473,34 +516,16 @@
 				tmp = next;
 			} while (tmp != NULL);
 		}
-		/* Request Tx completion. */
-		if (unlikely(--elts_comp_cd == 0)) {
-			elts_comp_cd = txq->elts_comp_cd_init;
-			++elts_comp;
-			send_flags |= MLX4_WQE_CTRL_CQ_UPDATE;
-		}
-		if (likely(segs == 1)) {
-			/* Update element. */
-			elt->buf = buf;
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-			/* post the pkt for sending */
-			err = mlx4_post_send(txq, buf, send_flags);
-			if (unlikely(err)) {
-				if (unlikely(send_flags &
-					     MLX4_WQE_CTRL_CQ_UPDATE)) {
-					elts_comp_cd = 1;
-					--elts_comp;
-				}
-				elt->buf = NULL;
-				goto stop;
-			}
-			elt->buf = buf;
-			bytes_sent += buf->pkt_len;
-		} else {
-			err = -EINVAL;
-			rte_errno = -err;
+		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+		/* Post the packet for sending. */
+		err = mlx4_post_send(txq, buf);
+		if (unlikely(err)) {
+			elt->buf = NULL;
 			goto stop;
 		}
+		elt->buf = buf;
+		bytes_sent += buf->pkt_len;
+		++elts_comp;
 		elts_head = elts_head_next;
 	}
 stop:
@@ -516,7 +541,6 @@
 	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
 	txq->elts_head = elts_head;
 	txq->elts_comp += elts_comp;
-	txq->elts_comp_cd = elts_comp_cd;
 	return i;
 }
 
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index df83552..1b90533 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -103,13 +103,15 @@ struct txq {
 	struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
 	unsigned int elts_head; /**< Current index in (*elts)[]. */
 	unsigned int elts_tail; /**< First element awaiting completion. */
-	unsigned int elts_comp; /**< Number of completion requests. */
-	unsigned int elts_comp_cd; /**< Countdown for next completion. */
+	unsigned int elts_comp; /**< Number of packets awaiting completion. */
+	int elts_comp_cd; /**< Countdown for next completion. */
 	unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
 	unsigned int elts_n; /**< (*elts)[] length. */
 	struct txq_elt (*elts)[]; /**< Tx elements. */
 	struct mlx4_txq_stats stats; /**< Tx queue counters. */
 	uint32_t max_inline; /**< Max inline send size. */
+	char *bounce_buf;
+	/**< Memory used for storing the first DWORD of data TXBBs. */
 	struct {
 		const struct rte_mempool *mp; /**< Cached memory pool. */
 		struct ibv_mr *mr; /**< Memory region (for mp). */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 492779f..9333311 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -83,8 +83,14 @@
 		rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->ctrl.socket);
 	int ret = 0;
 
-	if (elts == NULL) {
-		ERROR("%p: can't allocate packets array", (void *)txq);
+	/* Allocate bounce buffer memory. */
+	txq->bounce_buf = (char *)rte_zmalloc_socket("TXQ",
+						     MLX4_MAX_WQE_SIZE,
+						     RTE_CACHE_LINE_MIN_SIZE,
+						     txq->ctrl.socket);
+
+	if ((elts == NULL) || (txq->bounce_buf == NULL)) {
+		ERROR("%p: can't allocate TXQ memory", (void *)txq);
 		ret = ENOMEM;
 		goto error;
 	}
@@ -110,6 +116,7 @@
 	assert(ret == 0);
 	return 0;
 error:
+	rte_free(txq->bounce_buf);
 	rte_free(elts);
 	DEBUG("%p: failed, freed everything", (void *)txq);
 	assert(ret > 0);
@@ -303,7 +310,6 @@ struct txq_mp2mr_mbuf_check_data {
 	struct mlx4dv_obj mlxdv;
 	struct mlx4dv_qp dv_qp;
 	struct mlx4dv_cq dv_cq;
-
 	struct txq tmpl = {
 		.ctrl = {
 			.priv = priv,
-- 
1.8.3.1
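For reference, the deferred byte_count write that the bounce buffer enables can be sketched in isolation as follows; the types and the fence intrinsic are simplified stand-ins for the driver's structures and rte_io_wmb(), not its actual definitions:

    #include <stdint.h>

    /* Illustrative stand-ins for the driver's WQE structures. */
    struct sketch_dseg {
            uint32_t byte_count; /* first DWORD of a TXBB when the dseg leads it */
    };

    struct sketch_pv {
            struct sketch_dseg *dseg;
            uint32_t val;
    };

    /*
     * Flush the byte counts saved aside for data segments that begin a
     * TXBB. The release fence orders all earlier stores first, so the
     * HCA never observes a valid byte count in front of stale data.
     */
    static void
    sketch_flush_deferred(struct sketch_pv *pv, int pv_counter)
    {
            __atomic_thread_fence(__ATOMIC_RELEASE);
            while (--pv_counter >= 0)
                    pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
    }

Writing the saved words only after the fence mirrors the patch's two-phase scheme: fill every descriptor first, then publish the words the HCA polls.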