From: Moti Haimovsky <mo...@mellanox.com>

This patch adds support for accessing the hardware directly when
handling Rx packets, eliminating the need to use Verbs in the Rx data
path.
Rx scatter support: calculate the number of scatters on the fly
according to the maximum expected packet size.

Signed-off-by: Vasily Philipov <vasi...@mellanox.com>
Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
Signed-off-by: Ophir Munk <ophi...@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarg...@6wind.com>
---
 drivers/net/mlx4/mlx4_rxq.c  | 151 +++++++++++++++++--------
 drivers/net/mlx4/mlx4_rxtx.c | 226 +++++++++++++++++++++-----------------
 drivers/net/mlx4/mlx4_rxtx.h |  19 ++--
 3 files changed, 241 insertions(+), 155 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index e7bde2e..fb6c080 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -51,6 +51,7 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_byteorder.h>
 #include <rte_common.h>
 #include <rte_errno.h>
 #include <rte_ethdev.h>
@@ -312,45 +313,46 @@ void mlx4_rss_detach(struct mlx4_rss *rss)
 static int
 mlx4_rxq_alloc_elts(struct rxq *rxq)
 {
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+	const uint32_t elts_n = 1 << rxq->elts_n;
+	const uint32_t sges_n = 1 << rxq->sges_n;
+	struct rte_mbuf *(*elts)[elts_n] = rxq->elts;
 	unsigned int i;
 
-	/* For each WR (packet). */
+	assert(rte_is_power_of_2(elts_n));
 	for (i = 0; i != RTE_DIM(*elts); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct ibv_recv_wr *wr = &elt->wr;
-		struct ibv_sge *sge = &(*elts)[i].sge;
+		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[i];
 		struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
 
 		if (buf == NULL) {
 			while (i--) {
-				rte_pktmbuf_free_seg((*elts)[i].buf);
-				(*elts)[i].buf = NULL;
+				rte_pktmbuf_free_seg((*elts)[i]);
+				(*elts)[i] = NULL;
 			}
 			rte_errno = ENOMEM;
 			return -rte_errno;
 		}
-		elt->buf = buf;
-		wr->next = &(*elts)[(i + 1)].wr;
-		wr->sg_list = sge;
-		wr->num_sge = 1;
 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
 		assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
 		/* Buffer is supposed to be empty. */
 		assert(rte_pktmbuf_data_len(buf) == 0);
 		assert(rte_pktmbuf_pkt_len(buf) == 0);
-		/* sge->addr must be able to store a pointer. */
-		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-		/* SGE keeps its headroom. */
-		sge->addr = (uintptr_t)
-			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
-		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
-		sge->lkey = rxq->mr->lkey;
-		/* Redundant check for tailroom. */
-		assert(sge->length == rte_pktmbuf_tailroom(buf));
+		/* Only the first segment keeps headroom. */
+		if (i % sges_n)
+			buf->data_off = 0;
+		buf->port = rxq->port_id;
+		buf->data_len = rte_pktmbuf_tailroom(buf);
+		buf->pkt_len = rte_pktmbuf_tailroom(buf);
+		buf->nb_segs = 1;
+		*scat = (struct mlx4_wqe_data_seg){
+			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+								  uintptr_t)),
+			.byte_count = rte_cpu_to_be_32(buf->data_len),
+			.lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+		};
+		(*elts)[i] = buf;
 	}
-	/* The last WR pointer must be NULL. */
-	(*elts)[(i - 1)].wr.next = NULL;
+	DEBUG("%p: allocated and configured %u segments (max %u packets)",
+	      (void *)rxq, elts_n, elts_n / sges_n);
 	return 0;
 }
 
@@ -364,14 +366,14 @@ static void
 mlx4_rxq_free_elts(struct rxq *rxq)
 {
 	unsigned int i;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+	struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts;
 
-	DEBUG("%p: freeing WRs", (void *)rxq);
+	DEBUG("%p: freeing Rx queue elements", (void *)rxq);
 	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-		if (!(*elts)[i].buf)
+		if (!(*elts)[i])
 			continue;
-		rte_pktmbuf_free_seg((*elts)[i].buf);
-		(*elts)[i].buf = NULL;
+		rte_pktmbuf_free_seg((*elts)[i]);
+		(*elts)[i] = NULL;
 	}
 }
 
@@ -400,8 +402,11 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		    struct rte_mempool *mp)
 {
 	struct priv *priv = dev->data->dev_private;
+	struct mlx4dv_obj mlxdv;
+	struct mlx4dv_rwq dv_rwq;
+	struct mlx4dv_cq dv_cq;
 	uint32_t mb_len = rte_pktmbuf_data_room_size(mp);
-	struct rxq_elt (*elts)[desc];
+	struct rte_mbuf *(*elts)[rte_align32pow2(desc)];
 	struct rte_flow_error error;
 	struct rxq *rxq;
 	struct rte_malloc_vec vec[] = {
@@ -439,6 +444,12 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		ERROR("%p: invalid number of Rx descriptors", (void *)dev);
 		return -rte_errno;
 	}
+	if (desc != RTE_DIM(*elts)) {
+		desc = RTE_DIM(*elts);
+		WARN("%p: increased number of descriptors in Rx queue %u"
+		     " to the next power of two (%u)",
+		     (void *)dev, idx, desc);
+	}
 	/* Allocate and initialize Rx queue. */
 	rte_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket);
 	if (!rxq) {
@@ -450,8 +461,8 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		.priv = priv,
 		.mp = mp,
 		.port_id = dev->data->port_id,
-		.elts_n = desc,
-		.elts_head = 0,
+		.sges_n = 0,
+		.elts_n = rte_log2_u32(desc),
 		.elts = elts,
 		.stats.idx = idx,
 		.socket = socket,
@@ -462,9 +473,29 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
 		;
 	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
-		WARN("%p: scattered mode has been requested but is"
-		     " not supported, this may lead to packet loss",
-		     (void *)dev);
+		uint32_t size =
+			RTE_PKTMBUF_HEADROOM +
+			dev->data->dev_conf.rxmode.max_rx_pkt_len;
+		uint32_t sges_n;
+
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len));
+		rxq->sges_n = sges_n;
+		/* Make sure sges_n did not overflow. */
+		size = mb_len * (1 << rxq->sges_n);
+		size -= RTE_PKTMBUF_HEADROOM;
+		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+			rte_errno = EOVERFLOW;
+			ERROR("%p: too many SGEs (%u) needed to handle"
+			      " requested maximum packet size %u",
+			      (void *)dev,
+			      1 << sges_n,
+			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
+			goto error;
+		}
 	} else {
 		WARN("%p: the requested maximum Rx packet size (%u) is"
 		     " larger than a single mbuf (%u) and scattered"
@@ -473,6 +504,17 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
 		     mb_len - RTE_PKTMBUF_HEADROOM);
 	}
+	DEBUG("%p: maximum number of segments per packet: %u",
+	      (void *)dev, 1 << rxq->sges_n);
+	if (desc % (1 << rxq->sges_n)) {
+		rte_errno = EINVAL;
+		ERROR("%p: number of Rx queue descriptors (%u) is not a"
+		      " multiple of maximum segments per packet (%u)",
+		      (void *)dev,
+		      desc,
+		      1 << rxq->sges_n);
+		goto error;
+	}
 	/* Use the entire Rx mempool as the memory region. */
 	rxq->mr = mlx4_mp2mr(priv->pd, mp);
 	if (!rxq->mr) {
@@ -497,7 +539,8 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			goto error;
 		}
 	}
-	rxq->cq = ibv_create_cq(priv->ctx, desc, NULL, rxq->channel, 0);
+	rxq->cq = ibv_create_cq(priv->ctx, desc >> rxq->sges_n, NULL,
+				rxq->channel, 0);
 	if (!rxq->cq) {
 		rte_errno = ENOMEM;
 		ERROR("%p: CQ creation failure: %s",
@@ -508,8 +551,8 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		(priv->ctx,
 		 &(struct ibv_wq_init_attr){
 			.wq_type = IBV_WQT_RQ,
-			.max_wr = RTE_MIN(priv->device_attr.max_qp_wr, desc),
-			.max_sge = 1,
+			.max_wr = desc >> rxq->sges_n,
+			.max_sge = 1 << rxq->sges_n,
 			.pd = priv->pd,
 			.cq = rxq->cq,
 		 });
@@ -531,27 +574,43 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
-	ret = mlx4_rxq_alloc_elts(rxq);
+	/* Retrieve device queue information. */
+	mlxdv.cq.in = rxq->cq;
+	mlxdv.cq.out = &dv_cq;
+	mlxdv.rwq.in = rxq->wq;
+	mlxdv.rwq.out = &dv_rwq;
+	ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ);
 	if (ret) {
-		ERROR("%p: RXQ allocation failed: %s",
-		      (void *)dev, strerror(rte_errno));
+		rte_errno = EINVAL;
+		ERROR("%p: failed to obtain device information", (void *)dev);
 		goto error;
 	}
-	ret = ibv_post_wq_recv(rxq->wq, &(*rxq->elts)[0].wr,
-			       &(struct ibv_recv_wr *){ NULL });
+	rxq->wqes =
+		(volatile struct mlx4_wqe_data_seg (*)[])
+		((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset);
+	rxq->rq_db = dv_rwq.rdb;
+	rxq->rq_ci = 0;
+	rxq->mcq.buf = dv_cq.buf.buf;
+	rxq->mcq.cqe_cnt = dv_cq.cqe_cnt;
+	rxq->mcq.set_ci_db = dv_cq.set_ci_db;
+	rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0;
+	ret = mlx4_rxq_alloc_elts(rxq);
 	if (ret) {
-		rte_errno = ret;
-		ERROR("%p: ibv_post_recv() failed: %s",
-		      (void *)dev,
-		      strerror(rte_errno));
+		ERROR("%p: RXQ allocation failed: %s",
+		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
 	DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq);
 	dev->data->rx_queues[idx] = rxq;
 	/* Enable associated flows. */
 	ret = mlx4_flow_sync(priv, &error);
-	if (!ret)
+	if (!ret) {
+		/* Update doorbell counter. */
+		rxq->rq_ci = desc >> rxq->sges_n;
+		rte_wmb();
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
 		return 0;
+	}
 	ERROR("cannot re-attach flow rules to queue %u"
 	      " (code %d, \"%s\"), flow error type %d, cause %p, message: %s",
 	      idx, -ret, strerror(-ret), error.type, error.cause,
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 38b87a0..cc0baaa 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -538,9 +538,44 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 }
 
 /**
- * DPDK callback for Rx.
+ * Poll one CQE from CQ.
  *
- * The following function doesn't manage scattered packets.
+ * @param rxq
+ *   Pointer to the receive queue structure.
+ * @param[out] out
+ *   Just polled CQE.
+ *
+ * @return
+ *   Number of bytes of the CQE, 0 in case there is no completion.
+ */
+static unsigned int
+mlx4_cq_poll_one(struct rxq *rxq, struct mlx4_cqe **out)
+{
+	int ret = 0;
+	struct mlx4_cqe *cqe = NULL;
+	struct mlx4_cq *cq = &rxq->mcq;
+
+	cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+	    !!(cq->cons_index & cq->cqe_cnt))
+		goto out;
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rte_rmb();
+	assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+	assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+	       MLX4_CQE_OPCODE_ERROR);
+	ret = rte_be_to_cpu_32(cqe->byte_cnt);
+	++cq->cons_index;
+out:
+	*out = cqe;
+	return ret;
+}
+
+/**
+ * DPDK callback for Rx with scattered packets support.
  *
  * @param dpdk_rxq
  *   Generic pointer to Rx queue structure.
@@ -555,112 +590,107 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 uint16_t
 mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
-	struct ibv_recv_wr *wr_head = NULL;
-	struct ibv_recv_wr **wr_next = &wr_head;
-	struct ibv_recv_wr *wr_bad = NULL;
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
+	struct rxq *rxq = dpdk_rxq;
+	const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
+	const uint16_t sges_n = rxq->sges_n;
+	struct rte_mbuf *pkt = NULL;
+	struct rte_mbuf *seg = NULL;
+	unsigned int i = 0;
+	uint32_t rq_ci = rxq->rq_ci << sges_n;
+	int len = 0;
 
-	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(ret == 0))
-		return 0;
-	if (unlikely(ret < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, ret);
-		return 0;
-	}
-	assert(ret <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; i != (unsigned int)ret; ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint32_t len = wc->byte_len;
-		struct rte_mbuf *seg = elt->buf;
-		struct rte_mbuf *rep;
+	while (pkts_n) {
+		struct mlx4_cqe *cqe;
+		uint32_t idx = rq_ci & wr_cnt;
+		struct rte_mbuf *rep = (*rxq->elts)[idx];
+		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
 
-		/* Sanity checks. */
-		assert(wr->sg_list == &elt->sge);
-		assert(wr->num_sge == 1);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		/* Link completed WRs together for repost. */
-		*wr_next = wr;
-		wr_next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p: bad work completion status (%d): %s",
-			      (void *)rxq, wc->status,
-			      ibv_wc_status_str(wc->status));
-			/* Increment dropped packets counter. */
-			++rxq->stats.idropped;
-			goto repost;
-		}
+		/* Update the 'next' pointer of the previous segment. */
+		if (pkt)
+			seg->next = rep;
+		seg = rep;
+		rte_prefetch0(seg);
+		rte_prefetch0(scat);
 		rep = rte_mbuf_raw_alloc(rxq->mp);
 		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR.
-			 */
-			DEBUG("rxq=%p: can't allocate a new mbuf",
-			      (void *)rxq);
-			/* Increase out of memory counters. */
 			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			goto repost;
+			if (!pkt) {
+				/*
+				 * No buffers before we even started,
+				 * bail out silently.
+				 */
+				break;
+			}
+			while (pkt != seg) {
+				assert(pkt != (*rxq->elts)[idx]);
+				rep = pkt->next;
+				pkt->next = NULL;
+				pkt->nb_segs = 1;
+				rte_mbuf_raw_free(pkt);
+				pkt = rep;
+			}
+			break;
+		}
+		if (!pkt) {
+			/* Looking for the new packet. */
+			len = mlx4_cq_poll_one(rxq, &cqe);
+			if (!len) {
+				rte_mbuf_raw_free(rep);
+				break;
+			}
+			if (unlikely(len < 0)) {
+				/* Rx error, packet is likely too large. */
+				rte_mbuf_raw_free(rep);
+				++rxq->stats.idropped;
+				goto skip;
+			}
+			pkt = seg;
+			pkt->packet_type = 0;
+			pkt->ol_flags = 0;
+			pkt->pkt_len = len;
+		}
+		rep->nb_segs = 1;
+		rep->port = rxq->port_id;
+		rep->data_len = seg->data_len;
+		rep->data_off = seg->data_off;
+		(*rxq->elts)[idx] = rep;
+		/*
+		 * Fill NIC descriptor with the new buffer. The lkey and size
+		 * of the buffers are already known, only the buffer address
+		 * changes.
+		 */
+		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+		if (len > seg->data_len) {
+			len -= seg->data_len;
+			++pkt->nb_segs;
+			++rq_ci;
+			continue;
 		}
-		/* Reconfigure sge to use rep instead of seg. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		assert(elt->sge.lkey == rxq->mr->lkey);
-		elt->buf = rep;
-		/* Update seg information. */
-		seg->data_off = RTE_PKTMBUF_HEADROOM;
-		seg->nb_segs = 1;
-		seg->port = rxq->port_id;
-		seg->next = NULL;
-		seg->pkt_len = len;
+		/* The last segment. */
 		seg->data_len = len;
-		seg->packet_type = 0;
-		seg->ol_flags = 0;
+		/* Increment bytes counter. */
+		rxq->stats.ibytes += pkt->pkt_len;
 		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += len;
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
+		*(pkts++) = pkt;
+		pkt = NULL;
+		--pkts_n;
+		++i;
+skip:
+		/* Align consumer index to the next stride. */
+		rq_ci >>= sges_n;
+		++rq_ci;
+		rq_ci <<= sges_n;
 	}
-	if (unlikely(i == 0))
+	if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
 		return 0;
-	/* Repost WRs. */
-	*wr_next = NULL;
-	assert(wr_head);
-	ret = ibv_post_wq_recv(rxq->wq, wr_head, &wr_bad);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-	return pkts_ret;
+	/* Update the consumer index. */
+	rxq->rq_ci = rq_ci >> sges_n;
+	rte_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	*rxq->mcq.set_ci_db = rte_cpu_to_be_32(rxq->mcq.cons_index & 0xffffff);
+	/* Increment packets counter. */
+	rxq->stats.ipackets += i;
+	return i;
 }
 
 /**
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index ff27126..fa5738f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -63,13 +63,6 @@ struct mlx4_rxq_stats {
 	uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
 };
 
-/** Rx element. */
-struct rxq_elt {
-	struct ibv_recv_wr wr; /**< Work request. */
-	struct ibv_sge sge; /**< Scatter/gather element. */
-	struct rte_mbuf *buf; /**< Buffer. */
-};
-
 /** Rx queue descriptor. */
 struct rxq {
 	struct priv *priv; /**< Back pointer to private data. */
@@ -78,10 +71,14 @@ struct rxq {
 	struct ibv_cq *cq; /**< Completion queue. */
 	struct ibv_wq *wq; /**< Work queue. */
 	struct ibv_comp_channel *channel; /**< Rx completion channel. */
-	unsigned int port_id; /**< Port ID for incoming packets. */
-	unsigned int elts_n; /**< (*elts)[] length. */
-	unsigned int elts_head; /**< Current index in (*elts)[]. */
-	struct rxq_elt (*elts)[]; /**< Rx elements. */
+	uint16_t rq_ci; /**< Saved RQ consumer index. */
+	uint16_t port_id; /**< Port ID for incoming packets. */
+	uint16_t sges_n; /**< Number of segments per packet (log2 value). */
+	uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+	struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+	volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
+	volatile uint32_t *rq_db; /**< RQ doorbell record. */
+	struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
 	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 	uint8_t data[]; /**< Remaining queue resources. */
-- 
2.1.4
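
For readers who want to check the scatter math outside the driver, here is a
minimal standalone sketch (not part of the patch) of how the per-packet SGE
count is derived from the maximum packet size and the mbuf data room,
mirroring the logic added to mlx4_rx_queue_setup() above. The 128-byte
headroom and the 9000/2048-byte sizes are assumptions picked for the example;
the driver itself uses RTE_PKTMBUF_HEADROOM and rte_log2_u32() and stores the
log2 value in rxq->sges_n.

/*
 * Illustrative sketch only -- not part of the patch. Shows how many
 * scatter/gather entries (SGEs) one received packet needs when the
 * maximum packet size exceeds a single mbuf.
 */
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_HEADROOM 128u /* default RTE_PKTMBUF_HEADROOM, assumed */

/* ceil(log2(v)) for v > 0, what rte_log2_u32() computes in the patch. */
static uint32_t
ceil_log2(uint32_t v)
{
	uint32_t n = 0;

	while ((1u << n) < v)
		n++;
	return n;
}

/* Log2 of the number of SGEs needed for one packet. */
static uint32_t
sges_n_for(uint32_t max_rx_pkt_len, uint32_t mb_len)
{
	/* Headroom is only reserved in the first segment. */
	uint32_t size = EXAMPLE_HEADROOM + max_rx_pkt_len;

	/* Number of mbufs needed, rounded up to the next power of two. */
	return ceil_log2((size / mb_len) + !!(size % mb_len));
}

int
main(void)
{
	uint32_t mb_len = 2048;         /* assumed mbuf data room size */
	uint32_t max_rx_pkt_len = 9000; /* assumed jumbo frame size */
	uint32_t sges_n = sges_n_for(max_rx_pkt_len, mb_len);

	/* 9128 bytes need 5 mbufs of 2048 bytes, rounded up to 8 (2^3). */
	printf("sges_n=%u -> %u SGEs per packet\n", sges_n, 1u << sges_n);
	return 0;
}

With these example numbers a packet needs 5 mbufs, rounded up to 8 SGEs
(sges_n = 3), which is also why the setup code rounds the descriptor count up
to a power of two and rejects counts that are not a multiple of 1 << sges_n.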