Bypass Verbs to improve RX performance. Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com> Signed-off-by: Yaacov Hazan <yaacovh at mellanox.com> Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com> Signed-off-by: Vasily Philipov <vasilyf at mellanox.com> --- drivers/net/mlx5/mlx5_ethdev.c | 4 +- drivers/net/mlx5/mlx5_fdir.c | 2 +- drivers/net/mlx5/mlx5_rxq.c | 303 ++++++++++++++++++++--------------------- drivers/net/mlx5/mlx5_rxtx.c | 289 ++++++++++++++++++++------------------- drivers/net/mlx5/mlx5_rxtx.h | 38 +++--- drivers/net/mlx5/mlx5_vlan.c | 3 +- 6 files changed, 325 insertions(+), 314 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 759434e..16b05d3 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -1263,7 +1263,9 @@ mlx5_secondary_data_setup(struct priv *priv) } /* RX queues. */ for (i = 0; i != nb_rx_queues; ++i) { - struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; + struct rxq_ctrl *primary_rxq = + container_of((*sd->primary_priv->rxqs)[i], + struct rxq_ctrl, rxq); if (primary_rxq == NULL) continue; diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c index 1850218..73eb00e 100644 --- a/drivers/net/mlx5/mlx5_fdir.c +++ b/drivers/net/mlx5/mlx5_fdir.c @@ -431,7 +431,7 @@ priv_get_fdir_queue(struct priv *priv, uint16_t idx) ind_init_attr = (struct ibv_exp_rwq_ind_table_init_attr){ .pd = priv->pd, .log_ind_tbl_size = 0, - .ind_tbl = &((*priv->rxqs)[idx]->wq), + .ind_tbl = &rxq_ctrl->wq, .comp_mask = 0, }; diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 7db4ce7..a8f68a3 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -43,6 +43,8 @@ #pragma GCC diagnostic ignored "-pedantic" #endif #include <infiniband/verbs.h> +#include <infiniband/arch.h> +#include <infiniband/mlx5_hw.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -373,8 +375,13 @@ priv_create_hash_rxqs(struct priv *priv) DEBUG("indirection table extended to assume %u WQs", priv->reta_idx_n); } - for (i = 0; (i != priv->reta_idx_n); ++i) - wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq; + for (i = 0; (i != priv->reta_idx_n); ++i) { + struct rxq_ctrl *rxq_ctrl; + + rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]], + struct rxq_ctrl, rxq); + wqs[i] = rxq_ctrl->wq; + } /* Get number of hash RX queues to configure. 
*/ for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i) hash_rxqs_n += ind_table_init[i].hash_types_n; @@ -638,21 +645,13 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n, struct rte_mbuf **pool) { unsigned int i; - struct rxq_elt (*elts)[elts_n] = - rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, - rxq_ctrl->socket); int ret = 0; - if (elts == NULL) { - ERROR("%p: can't allocate packets array", (void *)rxq_ctrl); - ret = ENOMEM; - goto error; - } /* For each WR (packet). */ for (i = 0; (i != elts_n); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct ibv_sge *sge = &(*elts)[i].sge; struct rte_mbuf *buf; + volatile struct mlx5_wqe_data_seg *scat = + &(*rxq_ctrl->rxq.wqes)[i]; if (pool != NULL) { buf = *(pool++); @@ -666,40 +665,36 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n, ret = ENOMEM; goto error; } - elt->buf = buf; /* Headroom is reserved by rte_pktmbuf_alloc(). */ assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); /* Buffer is supposed to be empty. */ assert(rte_pktmbuf_data_len(buf) == 0); assert(rte_pktmbuf_pkt_len(buf) == 0); - /* sge->addr must be able to store a pointer. */ - assert(sizeof(sge->addr) >= sizeof(uintptr_t)); - /* SGE keeps its headroom. */ - sge->addr = (uintptr_t) - ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM); - sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM); - sge->lkey = rxq_ctrl->mr->lkey; - /* Redundant check for tailroom. */ - assert(sge->length == rte_pktmbuf_tailroom(buf)); + assert(!buf->next); + PORT(buf) = rxq_ctrl->rxq.port_id; + DATA_LEN(buf) = rte_pktmbuf_tailroom(buf); + PKT_LEN(buf) = DATA_LEN(buf); + NB_SEGS(buf) = 1; + /* scat->addr must be able to store a pointer. 
*/ + assert(sizeof(scat->addr) >= sizeof(uintptr_t)); + *scat = (struct mlx5_wqe_data_seg){ + .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)), + .byte_count = htonl(DATA_LEN(buf)), + .lkey = htonl(rxq_ctrl->mr->lkey), + }; + (*rxq_ctrl->rxq.elts)[i] = buf; } DEBUG("%p: allocated and configured %u single-segment WRs", (void *)rxq_ctrl, elts_n); - rxq_ctrl->rxq.elts_n = elts_n; - rxq_ctrl->rxq.elts_head = 0; - rxq_ctrl->rxq.elts = elts; assert(ret == 0); return 0; error: - if (elts != NULL) { - assert(pool == NULL); - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf = elt->buf; - - if (buf != NULL) - rte_pktmbuf_free_seg(buf); - } - rte_free(elts); + assert(pool == NULL); + elts_n = i; + for (i = 0; (i != elts_n); ++i) { + if ((*rxq_ctrl->rxq.elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); + (*rxq_ctrl->rxq.elts)[i] = NULL; } DEBUG("%p: failed, freed everything", (void *)rxq_ctrl); assert(ret > 0); @@ -716,22 +711,16 @@ static void rxq_free_elts(struct rxq_ctrl *rxq_ctrl) { unsigned int i; - unsigned int elts_n = rxq_ctrl->rxq.elts_n; - struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts; DEBUG("%p: freeing WRs", (void *)rxq_ctrl); - rxq_ctrl->rxq.elts_n = 0; - rxq_ctrl->rxq.elts = NULL; - if (elts == NULL) + if (rxq_ctrl->rxq.elts == NULL) return; - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf = elt->buf; - if (buf != NULL) - rte_pktmbuf_free_seg(buf); + for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) { + if ((*rxq_ctrl->rxq.elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); + (*rxq_ctrl->rxq.elts)[i] = NULL; } - rte_free(elts); } /** @@ -749,42 +738,40 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl) DEBUG("cleaning up %p", (void *)rxq_ctrl); rxq_free_elts(rxq_ctrl); - rxq_ctrl->rxq.poll = NULL; - rxq_ctrl->rxq.recv = NULL; if (rxq_ctrl->if_wq != NULL) { - assert(rxq_ctrl->rxq.priv != NULL); - assert(rxq_ctrl->rxq.priv->ctx != 
NULL); - assert(rxq_ctrl->rxq.wq != NULL); + assert(rxq_ctrl->priv != NULL); + assert(rxq_ctrl->priv->ctx != NULL); + assert(rxq_ctrl->wq != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx, + claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx, rxq_ctrl->if_wq, &params)); } if (rxq_ctrl->if_cq != NULL) { - assert(rxq_ctrl->rxq.priv != NULL); - assert(rxq_ctrl->rxq.priv->ctx != NULL); - assert(rxq_ctrl->rxq.cq != NULL); + assert(rxq_ctrl->priv != NULL); + assert(rxq_ctrl->priv->ctx != NULL); + assert(rxq_ctrl->cq != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx, + claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx, rxq_ctrl->if_cq, &params)); } - if (rxq_ctrl->rxq.wq != NULL) - claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq)); - if (rxq_ctrl->rxq.cq != NULL) - claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq)); + if (rxq_ctrl->wq != NULL) + claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq)); + if (rxq_ctrl->cq != NULL) + claim_zero(ibv_destroy_cq(rxq_ctrl->cq)); if (rxq_ctrl->rd != NULL) { struct ibv_exp_destroy_res_domain_attr attr = { .comp_mask = 0, }; - assert(rxq_ctrl->rxq.priv != NULL); - assert(rxq_ctrl->rxq.priv->ctx != NULL); - claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx, + assert(rxq_ctrl->priv != NULL); + assert(rxq_ctrl->priv->ctx != NULL); + claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx, rxq_ctrl->rd, &attr)); } @@ -811,14 +798,13 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl) int rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl) { - struct priv *priv = rxq_ctrl->rxq.priv; + struct priv *priv = rxq_ctrl->priv; struct rxq_ctrl tmpl = *rxq_ctrl; unsigned int mbuf_n; unsigned int desc_n; struct rte_mbuf **pool; unsigned int i, k; struct ibv_exp_wq_attr mod; - struct rxq_elt (*elts)[tmpl.rxq.elts_n]; int err; DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl); 
@@ -840,7 +826,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl) .attr_mask = IBV_EXP_WQ_ATTR_STATE, .wq_state = IBV_EXP_WQS_RESET, }; - err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod); + err = ibv_exp_modify_wq(tmpl.wq, &mod); if (err) { ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err)); assert(err > 0); @@ -854,60 +840,33 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl) } /* Snatch mbufs from original queue. */ k = 0; - elts = rxq_ctrl->rxq.elts; - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf = elt->buf; - - pool[k++] = buf; - } + for (i = 0; (i != desc_n); ++i) + pool[k++] = (*rxq_ctrl->rxq.elts)[i]; assert(k == mbuf_n); - tmpl.rxq.elts_n = 0; - tmpl.rxq.elts = NULL; - assert((void *)&tmpl.rxq.elts == NULL); - err = rxq_alloc_elts(&tmpl, desc_n, pool); - if (err) { - ERROR("%p: cannot reallocate WRs, aborting", (void *)dev); - rte_free(pool); - assert(err > 0); - return err; - } - assert(tmpl.rxq.elts_n == desc_n); rte_free(pool); - /* Clean up original data. */ - rxq_ctrl->rxq.elts_n = 0; - rte_free(rxq_ctrl->rxq.elts); - rxq_ctrl->rxq.elts = NULL; /* Change queue state to ready. */ mod = (struct ibv_exp_wq_attr){ .attr_mask = IBV_EXP_WQ_ATTR_STATE, .wq_state = IBV_EXP_WQS_RDY, }; - err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod); + err = ibv_exp_modify_wq(tmpl.wq, &mod); if (err) { ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s", (void *)dev, strerror(err)); goto error; } /* Post SGEs. */ - assert(tmpl.if_wq != NULL); - elts = tmpl.rxq.elts; - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - err = tmpl.if_wq->recv_burst( - tmpl.rxq.wq, - &(*elts)[i].sge, - 1); - if (err) - break; - } + err = rxq_alloc_elts(&tmpl, desc_n, pool); if (err) { - ERROR("%p: failed to post SGEs with error %d", - (void *)dev, err); - /* Set err because it does not contain a valid errno value. 
*/ - err = EIO; - goto error; + ERROR("%p: cannot reallocate WRs, aborting", (void *)dev); + rte_free(pool); + assert(err > 0); + return err; } - tmpl.rxq.recv = tmpl.if_wq->recv_burst; + /* Update doorbell counter. */ + rxq_ctrl->rxq.rq_ci = desc_n; + rte_wmb(); + *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci); error: *rxq_ctrl = tmpl; assert(err >= 0); @@ -915,6 +874,45 @@ error: } /** + * Initialize RX queue. + * + * @param tmpl + * Pointer to RX queue control template. + * @param rxq_ctrl + * Pointer to RX queue control. + * + * @return + * 0 on success, errno value on failure. + */ +static inline int +rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl) +{ + struct ibv_cq *ibcq = tmpl->cq; + struct mlx5_cq *cq = to_mxxx(cq, cq); + struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq); + + if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) { + ERROR("Wrong MLX5_CQE_SIZE environment variable value: " + "it should be set to %u", RTE_CACHE_LINE_SIZE); + return EINVAL; + } + tmpl->rxq.rq_db = rwq->rq.db; + tmpl->rxq.cq_ci = 0; + tmpl->rxq.rq_ci = 0; + tmpl->rxq.cq_db = cq->dbrec; + tmpl->rxq.wqes = + (volatile struct mlx5_wqe_data_seg (*)[]) + (uintptr_t)rwq->rq.buff; + tmpl->rxq.cqes = + (volatile struct mlx5_cqe (*)[]) + (uintptr_t)cq->active_buf->buf; + tmpl->rxq.elts = + (struct rte_mbuf *(*)[tmpl->rxq.elts_n]) + ((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl)); + return 0; +} + +/** * Configure a RX queue. * * @param dev @@ -934,15 +932,16 @@ error: * 0 on success, errno value on failure. 
*/ int -rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, - unsigned int socket, const struct rte_eth_rxconf *conf, - struct rte_mempool *mp) +rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, struct rte_mempool *mp) { struct priv *priv = dev->data->dev_private; struct rxq_ctrl tmpl = { + .priv = priv, .socket = socket, .rxq = { - .priv = priv, + .elts_n = desc, .mp = mp, }, }; @@ -952,17 +951,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, struct ibv_exp_cq_init_attr cq; struct ibv_exp_res_domain_init_attr rd; struct ibv_exp_wq_init_attr wq; + struct ibv_exp_cq_attr cq_attr; } attr; enum ibv_exp_query_intf_status status; unsigned int mb_len = rte_pktmbuf_data_room_size(mp); - struct rxq_elt (*elts)[desc]; int ret = 0; - unsigned int i; - unsigned int cq_size = desc; (void)conf; /* Thresholds configuration (ignored). */ if (desc == 0) { - ERROR("%p: invalid number of RX descriptors", (void *)dev); + ERROR("%p: invalid number of RX descriptors (must be a" + " multiple of 2)", (void *)dev); return EINVAL; } /* Toggle RX checksum offload if hardware supports it. */ @@ -996,9 +994,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, .res_domain = tmpl.rd, }; - tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0, - &attr.cq); - if (tmpl.rxq.cq == NULL) { + tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0, + &attr.cq); + if (tmpl.cq == NULL) { ret = ENOMEM; ERROR("%p: CQ creation failure: %s", (void *)dev, strerror(ret)); @@ -1015,13 +1013,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, .wq_context = NULL, /* Could be useful in the future. */ .wq_type = IBV_EXP_WQT_RQ, /* Max number of outstanding WRs. */ - .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ? 
+ .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ? priv->device_attr.max_qp_wr : - (int)cq_size), + (int)desc), /* Max number of scatter/gather elements in a WR. */ .max_recv_sge = 1, .pd = priv->pd, - .cq = tmpl.rxq.cq, + .cq = tmpl.cq, .comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN | IBV_EXP_CREATE_WQ_VLAN_OFFLOADS | @@ -1064,19 +1062,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, " up to date", (void *)dev); - tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq); - if (tmpl.rxq.wq == NULL) { + tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq); + if (tmpl.wq == NULL) { ret = (errno ? errno : EINVAL); ERROR("%p: WQ creation failure: %s", (void *)dev, strerror(ret)); goto error; } - ret = rxq_alloc_elts(&tmpl, desc, NULL); - if (ret) { - ERROR("%p: RXQ allocation failed: %s", - (void *)dev, strerror(ret)); - goto error; - } /* Save port ID. */ tmpl.rxq.port_id = dev->data->port_id; DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id); @@ -1084,7 +1076,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, .intf_scope = IBV_EXP_INTF_GLOBAL, .intf_version = 1, .intf = IBV_EXP_INTF_CQ, - .obj = tmpl.rxq.cq, + .obj = tmpl.cq, }; tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); if (tmpl.if_cq == NULL) { @@ -1095,7 +1087,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, attr.params = (struct ibv_exp_query_intf_params){ .intf_scope = IBV_EXP_INTF_GLOBAL, .intf = IBV_EXP_INTF_WQ, - .obj = tmpl.rxq.wq, + .obj = tmpl.wq, }; tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); if (tmpl.if_wq == NULL) { @@ -1108,38 +1100,34 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, .attr_mask = IBV_EXP_WQ_ATTR_STATE, .wq_state = IBV_EXP_WQS_RDY, }; - ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod); + ret = ibv_exp_modify_wq(tmpl.wq, &mod); if (ret) { ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s", (void *)dev, 
strerror(ret)); goto error; } - /* Post SGEs. */ - elts = tmpl.rxq.elts; - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - ret = tmpl.if_wq->recv_burst( - tmpl.rxq.wq, - &(*elts)[i].sge, - 1); - if (ret) - break; + ret = rxq_setup(&tmpl, rxq_ctrl); + if (ret) { + ERROR("%p: cannot initialize RX queue structure: %s", + (void *)dev, strerror(ret)); + goto error; } + ret = rxq_alloc_elts(&tmpl, desc, NULL); if (ret) { - ERROR("%p: failed to post SGEs with error %d", - (void *)dev, ret); - /* Set ret because it does not contain a valid errno value. */ - ret = EIO; + ERROR("%p: RXQ allocation failed: %s", + (void *)dev, strerror(ret)); goto error; } /* Clean up rxq in case we're reinitializing it. */ DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl); rxq_cleanup(rxq_ctrl); *rxq_ctrl = tmpl; + /* Update doorbell counter. */ + rxq_ctrl->rxq.rq_ci = desc; + rte_wmb(); + *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci); DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl); assert(ret == 0); - /* Assign function in queue. 
*/ - rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan; - rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst; return 0; error: rxq_cleanup(&tmpl); @@ -1173,14 +1161,19 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, { struct priv *priv = dev->data->dev_private; struct rxq *rxq = (*priv->rxqs)[idx]; - struct rxq_ctrl *rxq_ctrl; + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); int ret; if (mlx5_is_secondary()) return -E_RTE_SECONDARY; priv_lock(priv); - rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + WARN("%p: increased number of descriptors in RX queue %u" + " to the next power of two (%d)", + (void *)dev, idx, desc); + } DEBUG("%p: configuring queue %u for %u descriptors", (void *)dev, idx, desc); if (idx >= priv->rxqs_n) { @@ -1199,8 +1192,9 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, (*priv->rxqs)[idx] = NULL; rxq_cleanup(rxq_ctrl); } else { - rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0, - socket); + rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) + + desc * sizeof(struct rte_mbuf *), + 0, socket); if (rxq_ctrl == NULL) { ERROR("%p: unable to allocate queue index %u", (void *)dev, idx); @@ -1208,7 +1202,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, return -ENOMEM; } } - ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp); + ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp); if (ret) rte_free(rxq_ctrl); else { @@ -1243,12 +1237,12 @@ mlx5_rx_queue_release(void *dpdk_rxq) if (rxq == NULL) return; rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); - priv = rxq->priv; + priv = rxq_ctrl->priv; priv_lock(priv); for (i = 0; (i != priv->rxqs_n); ++i) if ((*priv->rxqs)[i] == rxq) { DEBUG("%p: removing RX queue %p from list", - (void *)priv->dev, (void *)rxq); + (void *)priv->dev, (void *)rxq_ctrl); (*priv->rxqs)[i] = NULL; break; } @@ 
-1278,7 +1272,8 @@ mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct rxq *rxq = dpdk_rxq; - struct priv *priv = mlx5_secondary_data_setup(rxq->priv); + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv); struct priv *primary_priv; unsigned int index; diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 6a0d707..27d8852 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -42,6 +42,8 @@ #pragma GCC diagnostic ignored "-pedantic" #endif #include <infiniband/verbs.h> +#include <infiniband/mlx5_hw.h> +#include <infiniband/arch.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -55,7 +57,7 @@ #include <rte_prefetch.h> #include <rte_common.h> #include <rte_branch_prediction.h> -#include <rte_memory.h> +#include <rte_ether.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -65,6 +67,47 @@ #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" +#include "mlx5_prm.h" + +static inline volatile struct mlx5_cqe64 * +get_cqe64(volatile struct mlx5_cqe cqes[], + unsigned int cqes_n, uint16_t *ci) + __attribute__((always_inline)); + +static inline int +rx_poll_len(struct rxq *rxq) __attribute__((always_inline)); + +static volatile struct mlx5_cqe64 * +get_cqe64(volatile struct mlx5_cqe cqes[], + unsigned int cqes_n, uint16_t *ci) +{ + volatile struct mlx5_cqe64 *cqe; + uint16_t idx = *ci; + uint8_t op_own; + + cqe = &cqes[idx & (cqes_n - 1)].cqe64; + op_own = cqe->op_own; + if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) { + return NULL; + } else if (unlikely(op_own & 0x80)) { + switch (op_own >> 4) { + case MLX5_CQE_INVALID: + return NULL; /* No CQE */ + case MLX5_CQE_REQ_ERR: + return cqe; + case MLX5_CQE_RESP_ERR: + ++(*ci); + return NULL; + default: + return NULL; + } + } + if (cqe) { + *ci = idx + 1; + return cqe; + } + 
return NULL; +} /** * Manage TX completions. @@ -390,8 +433,8 @@ stop: /** * Translate RX completion flags to packet type. * - * @param flags - * RX completion flags returned by poll_length_flags(). + * @param[in] cqe + * Pointer to CQE. * * @note: fix mlx5_dev_supported_ptypes_get() if any change here. * @@ -399,11 +442,13 @@ stop: * Packet type for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_pkt_type(uint32_t flags) +rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe) { uint32_t pkt_type; + uint8_t flags = cqe->l4_hdr_type_etc; + uint8_t info = cqe->rsvd0[0]; - if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) + if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET) pkt_type = TRANSPOSE(flags, IBV_EXP_CQ_RX_OUTER_IPV4_PACKET, @@ -420,11 +465,11 @@ rxq_cq_to_pkt_type(uint32_t flags) else pkt_type = TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV4_PACKET, - RTE_PTYPE_L3_IPV4) | + MLX5_CQE_L3_HDR_TYPE_IPV6, + RTE_PTYPE_L3_IPV6) | TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV6_PACKET, - RTE_PTYPE_L3_IPV6); + MLX5_CQE_L3_HDR_TYPE_IPV4, + RTE_PTYPE_L3_IPV4); return pkt_type; } @@ -433,50 +478,69 @@ rxq_cq_to_pkt_type(uint32_t flags) * * @param[in] rxq * Pointer to RX queue structure. - * @param flags - * RX completion flags returned by poll_length_flags(). + * @param[in] cqe + * Pointer to CQE. * * @return * Offload flags (ol_flags) for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) +rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe) { uint32_t ol_flags = 0; + uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK; + uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK; + uint8_t info = cqe->rsvd0[0]; - if (rxq->csum) { - /* Set IP checksum flag only for IPv4/IPv6 packets. */ - if (flags & - (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET)) - ol_flags |= - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_IP_CSUM_OK, - PKT_RX_IP_CKSUM_BAD); - /* Set L4 checksum flag only for TCP/UDP packets. 
*/ - if (flags & - (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET)) - ol_flags |= - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK, - PKT_RX_L4_CKSUM_BAD); - } + if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) || + (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6)) + ol_flags |= + (!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) * + PKT_RX_IP_CKSUM_BAD); + if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) || + (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) || + (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) || + (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP)) + ol_flags |= + (!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) * + PKT_RX_L4_CKSUM_BAD); /* * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional * (its value is 0). */ - if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) + if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) ol_flags |= - TRANSPOSE(~flags, + TRANSPOSE(~cqe->l4_hdr_type_etc, IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK, PKT_RX_IP_CKSUM_BAD) | - TRANSPOSE(~flags, + TRANSPOSE(~cqe->l4_hdr_type_etc, IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK, PKT_RX_L4_CKSUM_BAD); return ol_flags; } /** + * Get size of the next packet. + * + * @param rxq + * RX queue to fetch packet from. + * + * @return + * Packet size in bytes. + */ +static inline int __attribute__((always_inline)) +rx_poll_len(struct rxq *rxq) +{ + volatile struct mlx5_cqe64 *cqe; + + cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci); + if (cqe) + return ntohl(cqe->byte_cnt); + return 0; +} + +/** * DPDK callback for RX. 
* * @param dpdk_rxq @@ -492,133 +556,82 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - struct ibv_sge sges[pkts_n]; - unsigned int i; + struct rxq *rxq = dpdk_rxq; unsigned int pkts_ret = 0; - int ret; + unsigned int i; + unsigned int rq_ci = rxq->rq_ci; + const unsigned int elts_n = rxq->elts_n; + const unsigned int wqe_cnt = elts_n - 1; for (i = 0; (i != pkts_n); ++i) { - struct rxq_elt *elt = &(*elts)[elts_head]; - unsigned int len; - struct rte_mbuf *seg = elt->buf; + unsigned int idx = rq_ci & wqe_cnt; struct rte_mbuf *rep; - uint32_t flags; - uint16_t vlan_tci; - - /* Sanity checks. */ - assert(seg != NULL); - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); - /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. - */ - rte_mbuf_prefetch_part1(seg); - rte_mbuf_prefetch_part2(seg); - ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci); - if (unlikely(ret < 0)) { - struct ibv_wc wc; - int wcs_n; - - DEBUG("rxq=%p, poll_length() failed (ret=%d)", - (void *)rxq, ret); - /* ibv_poll_cq() must be used in case of failure. */ - wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); - if (unlikely(wcs_n == 0)) - break; - if (unlikely(wcs_n < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", - (void *)rxq, wcs_n); - break; - } - assert(wcs_n == 1); - if (unlikely(wc.status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" - " completion status (%d): %s", - (void *)rxq, wc.wr_id, wc.status, - ibv_wc_status_str(wc.status)); -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment dropped packets counter. */ - ++rxq->stats.idropped; -#endif - /* Add SGE to array for repost. 
*/ - sges[i] = elt->sge; - goto repost; - } - ret = wc.byte_len; - } - if (ret == 0) - break; - assert(ret >= (rxq->crc_present << 2)); - len = ret - (rxq->crc_present << 2); + struct rte_mbuf *pkt; + unsigned int len; + volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; + volatile struct mlx5_cqe64 *cqe = + &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64; + + pkt = (*rxq->elts)[idx]; + rte_prefetch0(cqe); rep = rte_mbuf_raw_alloc(rxq->mp); if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p: can't allocate a new mbuf", - (void *)rxq); - /* Increment out of memory counters. */ ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; + break; } - - /* Reconfigure sge to use rep instead of seg. */ - elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; - elt->buf = rep; - - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - - /* Update seg information. */ - SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM); - NB_SEGS(seg) = 1; - PORT(seg) = rxq->port_id; - NEXT(seg) = NULL; - PKT_LEN(seg) = len; - DATA_LEN(seg) = len; - if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { - seg->packet_type = rxq_cq_to_pkt_type(flags); - seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); - if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { - seg->ol_flags |= PKT_RX_VLAN_PKT; - seg->vlan_tci = vlan_tci; + SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM); + NB_SEGS(rep) = 1; + PORT(rep) = rxq->port_id; + NEXT(rep) = NULL; + len = rx_poll_len(rxq); + if (unlikely(len == 0)) { + rte_mbuf_refcnt_set(rep, 0); + __rte_mbuf_raw_free(rep); + break; + } + /* Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. */ + wqe->addr = htonll((uintptr_t)rep->buf_addr + + RTE_PKTMBUF_HEADROOM); + (*rxq->elts)[idx] = rep; + /* Update pkt information. 
*/ + if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip | + rxq->crc_present) { + if (rxq->csum) { + pkt->packet_type = rxq_cq_to_pkt_type(cqe); + pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe); + } + if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) { + pkt->ol_flags |= PKT_RX_VLAN_PKT; + pkt->vlan_tci = ntohs(cqe->vlan_info); } + if (rxq->crc_present) + len -= ETHER_CRC_LEN; } - /* Return packet. */ - *(pkts++) = seg; - ++pkts_ret; + PKT_LEN(pkt) = len; + DATA_LEN(pkt) = len; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment bytes counter. */ rxq->stats.ibytes += len; #endif -repost: - if (++elts_head >= elts_n) - elts_head = 0; - continue; + /* Return packet. */ + *(pkts++) = pkt; + ++pkts_ret; + ++rq_ci; } - if (unlikely(i == 0)) + if (unlikely((i == 0) && (rq_ci == rxq->rq_ci))) return 0; /* Repost WRs. */ #ifdef DEBUG_RECV DEBUG("%p: reposting %u WRs", (void *)rxq, i); #endif - ret = rxq->recv(rxq->wq, sges, i); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: recv_burst(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - rxq->elts_head = elts_head; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci; + rte_wmb(); + *rxq->cq_db = htonl(rxq->cq_ci); + rte_wmb(); + *rxq->rq_db = htonl(rxq->rq_ci); #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment packets counter. */ rxq->stats.ipackets += pkts_ret; diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index 570345b..1827123 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -43,6 +43,7 @@ #pragma GCC diagnostic ignored "-pedantic" #endif #include <infiniband/verbs.h> +#include <infiniband/mlx5_hw.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -61,6 +62,7 @@ #include "mlx5.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" +#include "mlx5_prm.h" struct mlx5_rxq_stats { unsigned int idx; /**< Mapping index. 
*/ @@ -81,12 +83,6 @@ struct mlx5_txq_stats { uint64_t odropped; /**< Total of packets not sent when TX ring full. */ }; -/* RX element. */ -struct rxq_elt { - struct ibv_sge sge; /* Scatter/Gather Element. */ - struct rte_mbuf *buf; /* SGE buffer. */ -}; - /* Flow director queue structure. */ struct fdir_queue { struct ibv_qp *qp; /* Associated RX QP. */ @@ -97,25 +93,28 @@ struct priv; /* RX queue descriptor. */ struct rxq { - struct priv *priv; /* Back pointer to private data. */ - struct rte_mempool *mp; /* Memory Pool for allocations. */ - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_exp_wq *wq; /* Work Queue. */ - int32_t (*poll)(); /* Verbs poll function. */ - int32_t (*recv)(); /* Verbs receive function. */ - unsigned int port_id; /* Port ID for incoming packets. */ - unsigned int elts_n; /* (*elts)[] length. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ unsigned int csum:1; /* Enable checksum offloading. */ unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ unsigned int vlan_strip:1; /* Enable VLAN stripping. */ unsigned int crc_present:1; /* CRC must be subtracted. */ - struct rxq_elt (*elts)[]; /* RX elements. */ - struct mlx5_rxq_stats stats; /* RX queue counters. */ + uint16_t rq_ci; + uint16_t cq_ci; + uint16_t elts_n; + uint16_t port_id; + volatile struct mlx5_wqe_data_seg(*wqes)[]; + volatile struct mlx5_cqe(*cqes)[]; + volatile uint32_t *rq_db; + volatile uint32_t *cq_db; + struct rte_mbuf *(*elts)[]; + struct rte_mempool *mp; + struct mlx5_rxq_stats stats; } __rte_cache_aligned; /* RX queue control descriptor. */ struct rxq_ctrl { + struct priv *priv; /* Back pointer to private data. */ + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_exp_wq *wq; /* Work Queue. */ struct ibv_exp_res_domain *rd; /* Resource Domain. */ struct fdir_queue fdir_queue; /* Flow director queue. */ struct ibv_mr *mr; /* Memory Region (for mp). 
*/ @@ -284,8 +283,9 @@ int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type); int priv_rehash_flows(struct priv *); void rxq_cleanup(struct rxq_ctrl *); int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *); -int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int, - const struct rte_eth_rxconf *, struct rte_mempool *); +int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, + unsigned int, const struct rte_eth_rxconf *, + struct rte_mempool *); int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, const struct rte_eth_rxconf *, struct rte_mempool *); void mlx5_rx_queue_release(void *); diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c index 3b9b771..4719e69 100644 --- a/drivers/net/mlx5/mlx5_vlan.c +++ b/drivers/net/mlx5/mlx5_vlan.c @@ -144,6 +144,7 @@ static void priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) { struct rxq *rxq = (*priv->rxqs)[idx]; + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); struct ibv_exp_wq_attr mod; uint16_t vlan_offloads = (on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) | @@ -157,7 +158,7 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) .vlan_offloads = vlan_offloads, }; - err = ibv_exp_modify_wq(rxq->wq, &mod); + err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod); if (err) { ERROR("%p: failed to modified stripping mode: %s", (void *)priv, strerror(err)); -- 2.1.4