Access the HW directly on the RX fast path, without going through Verbs calls.

The number of scatter elements (SGEs) is now calculated on the fly, according
to the maximum expected packet size.

Signed-off-by: Vasily Philipov <vasi...@mellanox.com>
---
The series depends on:

http://dpdk.org/dev/patchwork/patch/27313/
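
For reviewers, here is a rough sketch (not part of the patch) of how the
per-WR SGE count follows from the maximum expected packet size and the mbuf
data room. The function names and the plain ceiling division below are made
up for illustration only; the actual expression lives in mlx4_rx_queue_setup()
in the diff, and log2above() is the helper added to mlx4_utils.h.

/* Same helper as the one added to mlx4_utils.h. */
static unsigned int
log2above(unsigned int v)
{
	unsigned int l;
	unsigned int r;

	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
		r |= (v & 1);
	return l + r;
}

/*
 * Illustration only: number of SGEs (rounded up to the next power of two)
 * needed to hold a packet of max_rx_pkt_len bytes when only the first
 * segment keeps `headroom` bytes of headroom.
 */
static unsigned int
rx_sges_for(unsigned int max_rx_pkt_len, unsigned int mb_len,
	    unsigned int headroom)
{
	unsigned int first = mb_len - headroom; /* room in the first mbuf */
	unsigned int rest;
	unsigned int sges_n;

	if (max_rx_pkt_len <= first)
		return 1;
	rest = max_rx_pkt_len - first;
	/* First segment plus enough full mbufs for the remainder. */
	sges_n = 1 + (rest + mb_len - 1) / mb_len;
	return 1u << log2above(sges_n);
}

E.g. with a 2048-byte data room, 128 bytes of headroom and a 9000-byte
maximum packet, this yields 5 segments, rounded up to 8 SGEs per WR.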
---
 drivers/net/mlx4/mlx4.h       |   3 +
 drivers/net/mlx4/mlx4_prm.h   | 405 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_rxq.c   | 205 ++++++++++-----------
 drivers/net/mlx4/mlx4_rxtx.c  | 266 ++++++++++++++++-----------
 drivers/net/mlx4/mlx4_rxtx.h  |  18 +-
 drivers/net/mlx4/mlx4_utils.h |  20 +++
 6 files changed, 688 insertions(+), 229 deletions(-)
 create mode 100644 drivers/net/mlx4/mlx4_prm.h

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 1cd4db3..4b7f98b 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -42,6 +42,7 @@
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
 #include <infiniband/verbs.h>
+#include "mlx4_prm.h"
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
@@ -57,6 +58,8 @@
 /* Maximum size for inline data. */
 #define MLX4_PMD_MAX_INLINE 0
 
+#include <rte_ethdev.h>
+
 /*
  * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
  * from which buffers are to be transmitted will have to be mapped by this
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
new file mode 100644
index 0000000..03c1192
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -0,0 +1,405 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX4_PRM_H_
+#define RTE_PMD_MLX4_PRM_H_
+
+#include <arpa/inet.h>
+
+#include <infiniband/arch.h>
+#include <infiniband/driver.h>
+#include <infiniband/verbs.h>
+
+#define MLX4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#if MLX4_GCC_VERSION >= 403
+#      define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64)))
+#      define __MLX4_ALGN_DATA__ __attribute__((aligned(64)))
+#else
+#      define __MLX4_ALGN_FUNC__
+#      define __MLX4_ALGN_DATA__
+#endif
+
+/* Maximum number of physical ports. */
+#define MLX4_PMD_MAX_PHYS_PORTS 2
+
+/* Generic macro to convert MLX4 to IBV flags. */
+#define MLX4_TRANSPOSE(val, from, to) \
+       (((from) >= (to)) ? \
+        (((val) & (from)) / ((from) / (to))) : \
+        (((val) & (from)) * ((to) / (from))))
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+enum {
+       MLX4_INVALID_LKEY = 0x100,
+};
+
+enum {
+       MLX4_MAX_BFS_IN_PAGE = 8,
+       MLX4_BFS_STRIDE      = 512,
+};
+
+enum {
+       MLX4_CQE_L2_TUNNEL_IPV4    = 1U << 25,
+       MLX4_CQE_L2_TUNNEL_L4_CSUM = 1U << 26,
+       MLX4_CQE_L2_TUNNEL         = 1U << 27,
+       MLX4_CQE_VLAN_PRESENT_MASK = 1U << 29,
+       MLX4_CQE_L2_TUNNEL_IPOK    = 1U << 31,
+       MLX4_CQE_QPN_MASK          = 0xffffff,
+};
+
+enum {
+       MLX4_QP_TABLE_BITS = 8,
+       MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS,
+       MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
+};
+
+enum {
+       MLX4_XSRQ_TABLE_BITS = 8,
+       MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+       MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
+};
+
+struct mlx4_wqe_data_seg {
+       uint32_t byte_count;
+       uint32_t lkey;
+       uint64_t addr;
+};
+
+struct mlx4_xsrq_table {
+       struct {
+               struct mlx4_srq **table;
+               int             refcnt;
+       } xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+       pthread_mutex_t         mutex;
+       int                     num_xsrq;
+       int                     shift;
+       int                     mask;
+};
+
+enum qp_cap_cache {
+       MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1,
+       MLX4_RX_VXLAN                          = 1 << 2
+};
+
+enum mlx4_db_type {
+       MLX4_DB_TYPE_CQ,
+       MLX4_DB_TYPE_RQ,
+       MLX4_NUM_DB_TYPE,
+};
+
+enum mlx4_lock_type {
+       MLX4_SPIN_LOCK = 0,
+       MLX4_MUTEX     = 1,
+};
+
+enum mlx4_lock_state {
+       MLX4_USE_LOCK,
+       MLX4_LOCKED,
+       MLX4_UNLOCKED
+};
+
+struct mlx4_spinlock {
+       pthread_spinlock_t   lock;
+       enum mlx4_lock_state state;
+};
+
+struct mlx4_lock {
+       pthread_mutex_t      mutex;
+       pthread_spinlock_t   slock;
+       enum mlx4_lock_state state;
+       enum mlx4_lock_type  type;
+};
+
+/* struct for BF dedicated for one QP */
+struct mlx4_dedic_bf {
+       void *address;
+};
+
+/* struct for the common BF which may be shared by many QPs */
+struct mlx4_cmn_bf {
+       void             *address;
+       /*
+        * Protect usage of BF address field including data written
+        * to the BF and the BF buffer toggling.
+        */
+       struct mlx4_lock lock;
+};
+
+union mlx4_bf {
+       struct mlx4_dedic_bf dedic;
+       struct mlx4_cmn_bf cmn;
+};
+
+struct mlx4_bfs_data {
+       struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1];
+       struct mlx4_cmn_bf   cmn_bf;
+       uint8_t              dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1];
+       uint8_t              dedic_bf_free;
+       /*
+        * protect dedicated BFs managing
+        * including dedic_bf_used and
+        * dedic_bf_free fields
+        */
+       struct mlx4_spinlock dedic_bf_lock;
+       void                 *page;
+       uint16_t             buf_size;
+       uint8_t              num_dedic_bfs;
+};
+
+struct mlx4_db_page;
+
+struct mlx4_context {
+       union {
+               struct ibv_context      ibv_ctx;
+       };
+       /* protects send_db_list and send_db_num_uars */
+       struct mlx4_spinlock            send_db_lock;
+       struct list_head                send_db_list;
+       unsigned int                    send_db_num_uars;
+       void                            *uar;
+       struct mlx4_spinlock            uar_lock;
+       struct mlx4_bfs_data            bfs;
+       int                             bf_regs_per_page;
+       int                             max_ctx_res_domain;
+       struct {
+               struct mlx4_qp          **table;
+               int                     refcnt;
+       } qp_table[MLX4_QP_TABLE_SIZE];
+       pthread_mutex_t                 qp_table_mutex;
+       int                             num_qps;
+       int                             qp_table_shift;
+       int                             qp_table_mask;
+       int                             max_qp_wr;
+       int                             max_sge;
+       int                             max_cqe;
+       uint64_t                        exp_device_cap_flags;
+       struct {
+               int                     offset;
+               int                     mult;
+               int                     shift;
+               uint64_t                mask;
+       } core_clk;
+       void                            *hca_core_clock;
+       struct mlx4_xsrq_table          xsrq_table;
+       struct mlx4_db_page             *db_list[MLX4_NUM_DB_TYPE];
+       pthread_mutex_t                 db_list_mutex;
+       int                             cqe_size;
+       int                             prefer_bf;
+       struct mlx4_spinlock            hugetlb_lock;
+       struct list_head                hugetlb_list;
+       int                             stall_enable;
+       pthread_mutex_t                 task_mutex;
+       struct {
+               uint8_t                 valid;
+               uint8_t                 link_layer;
+               enum ibv_port_cap_flags caps;
+       } port_query_cache[MLX4_PMD_MAX_PHYS_PORTS];
+       pthread_mutex_t                 env_mtx;
+       int                             env_initialized;
+};
+
+struct mlx4_buf {
+       void   *buf;
+       void   *hmem;
+       size_t length;
+       int    base;
+};
+
+struct mlx4_pd {
+       struct ibv_pd ibv_pd;
+       uint32_t      pdn;
+};
+
+struct mlx4_cq {
+       struct ibv_cq    ibv_cq __MLX4_ALGN_DATA__;
+       uint32_t         pattern;
+       struct mlx4_buf  buf;
+       struct mlx4_buf  resize_buf;
+       struct mlx4_lock lock;
+       uint32_t         cqn;
+       uint32_t         cons_index;
+       uint32_t         wait_index;
+       uint32_t         wait_count;
+       uint32_t         *set_ci_db;
+       uint32_t         *arm_db;
+       int              arm_sn;
+       int              stall_next_poll;
+       int              stall_enable;
+       int              cqe_size;
+       int              creation_flags;
+       struct mlx4_qp   *last_qp;
+       uint32_t         model_flags; /* use mlx4_cq_model_flags */
+};
+
+struct mlx4_wq {
+       uint64_t         *wrid;
+       struct mlx4_lock lock;
+       int              wqe_cnt;
+       unsigned         max_post;
+       char             *buf;
+       unsigned         head;
+       unsigned         tail;
+       int              max_gs;
+       int              wqe_shift;
+       unsigned         head_en_index;
+       unsigned         head_en_count;
+};
+
+struct mlx4_inlr_rbuff {
+       void *rbuff;
+       int rlen;
+};
+
+struct mlx4_inlr_sg_list {
+       struct mlx4_inlr_rbuff *sg_list;
+       int list_len;
+};
+
+struct mlx4_inlr_buff {
+       struct mlx4_inlr_sg_list *buff;
+       int len;
+};
+
+struct mlx4_qp {
+       struct verbs_qp       verbs_qp;
+       uint32_t              pattern;
+       int                   buf_size;
+       uint32_t              model_flags; /* use mlx4_qp_model_flags */
+       /* hot post send data */
+       struct mlx4_wq        sq __MLX4_ALGN_DATA__;
+       int                   (*post_send_one)(struct ibv_send_wr *wr,
+                                              struct mlx4_qp *qp,
+                                              void *wqe, int *total_size,
+                                              int *inl, unsigned int ind);
+       union mlx4_bf         *bf;
+       uint32_t              *sdb; /* send DB */
+       struct mlx4_buf       buf;
+       unsigned              last_db_head;
+       uint32_t              doorbell_qpn;
+       uint32_t              create_flags;
+       uint16_t              max_inline_data;
+       uint16_t              bf_buf_size;
+       uint16_t              sq_spare_wqes;
+       uint8_t               srcrb_flags_tbl[16];
+       uint8_t               db_method;
+       uint8_t               qp_type;
+       /* RAW_PACKET hot data */
+       uint8_t               link_layer;
+       uint8_t               is_masked_atomic;
+       /* post receive hot data */
+       struct mlx4_wq        rq __MLX4_ALGN_DATA__;
+       uint32_t              *db;
+       uint32_t              max_inlr_sg;
+       int32_t               cached_rx_csum_flags;
+       int32_t               transposed_rx_csum_flags;
+       struct mlx4_inlr_buff inlr_buff;
+       uint8_t               qp_cap_cache;
+};
+
+struct mlx4_cqe {
+       uint32_t        vlan_my_qpn;
+       uint32_t        immed_rss_invalid;
+       uint32_t        g_mlpath_rqpn;
+       union {
+               struct {
+                       union {
+                               struct {
+                                       uint16_t  sl_vid;
+                                       uint16_t  rlid;
+                               };
+                               uint32_t  timestamp_16_47;
+                       };
+                       uint16_t  status;
+                       uint8_t   reserved2;
+                       uint8_t   badfcs_enc;
+               };
+               struct {
+                       uint16_t reserved4;
+                       uint8_t  smac[6];
+               };
+       };
+       uint32_t        byte_cnt;
+       uint16_t        wqe_index;
+       uint16_t        checksum;
+       uint8_t         reserved5[1];
+       uint16_t        timestamp_0_15;
+       uint8_t         owner_sr_opcode;
+} __attribute__((packed));
+
+enum {
+       MLX4_CQE_OWNER_MASK       = 0x80,
+       MLX4_CQE_IS_SEND_MASK     = 0x40,
+       MLX4_CQE_INL_SCATTER_MASK = 0x20,
+       MLX4_CQE_OPCODE_MASK      = 0x1f
+};
+
+enum {
+       MLX4_CQE_OPCODE_ERROR  = 0x1e,
+       MLX4_CQE_OPCODE_RESIZE = 0x16,
+};
+
+enum {
+       MLX4_CQE_STATUS_L4_CSUM = 1 << 2,
+       MLX4_CQE_STATUS_IPV4    = 1 << 6,
+       MLX4_CQE_STATUS_IPV4F   = 1 << 7,
+       MLX4_CQE_STATUS_IPV6    = 1 << 8,
+       MLX4_CQE_STATUS_IPV4OPT = 1 << 9,
+       MLX4_CQE_STATUS_TCP     = 1 << 10,
+       MLX4_CQE_STATUS_UDP     = 1 << 11,
+       MLX4_CQE_STATUS_IPOK    = 1 << 12
+};
+
+#define to_mxxx(xxx, type) \
+       ((struct mlx4_##type *) \
+        ((uint8_t *)ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx)))
+
+static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx)
+{
+       return to_mxxx(ctx, context);
+}
+
+static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq)
+{
+       return to_mxxx(cq, cq);
+}
+
+static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
+{
+       return container_of(container_of(ibqp, struct verbs_qp, qp),
+                           struct mlx4_qp, verbs_qp);
+}
+
+#endif /* RTE_PMD_MLX4_PRM_H_ */
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 1456b5f..bbe9c89 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -78,103 +78,73 @@
  */
 static int
 mlx4_rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n,
-                   struct rte_mbuf **pool)
+                   struct rte_mbuf *(*pool)[])
 {
-       unsigned int i;
-       struct rxq_elt (*elts)[elts_n] =
-               rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
-                                 rxq->socket);
+       unsigned int i = 0;
+       const unsigned int sge_n = 1 << rxq->sge_n;
+       struct rte_mbuf *(*elts)[elts_n] =
+               rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, rxq->socket);
 
        if (elts == NULL) {
                rte_errno = ENOMEM;
                ERROR("%p: can't allocate packets array", (void *)rxq);
                goto error;
        }
-       /* For each WR (packet). */
-       for (i = 0; (i != elts_n); ++i) {
-               struct rxq_elt *elt = &(*elts)[i];
-               struct ibv_recv_wr *wr = &elt->wr;
-               struct ibv_sge *sge = &(*elts)[i].sge;
+       rxq->elts = elts;
+       for (; i != elts_n; ++i) {
                struct rte_mbuf *buf;
+               volatile struct mlx4_wqe_data_seg *scat =
+                       &(*rxq->hw.wqes)[i];
 
                if (pool != NULL) {
-                       buf = *(pool++);
+                       buf = (*pool)[i];
                        assert(buf != NULL);
                        rte_pktmbuf_reset(buf);
-               } else {
+                       rte_pktmbuf_refcnt_update(buf, 1);
+               } else
                        buf = rte_pktmbuf_alloc(rxq->mp);
-               }
                if (buf == NULL) {
                        rte_errno = ENOMEM;
                        assert(pool == NULL);
                        ERROR("%p: empty mbuf pool", (void *)rxq);
                        goto error;
                }
-               /*
-                * Configure WR. Work request ID contains its own index in
-                * the elts array and the offset between SGE buffer header and
-                * its data.
-                */
-               WR_ID(wr->wr_id).id = i;
-               WR_ID(wr->wr_id).offset =
-                       (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
-                        (uintptr_t)buf);
-               wr->next = &(*elts)[(i + 1)].wr;
-               wr->sg_list = sge;
-               wr->num_sge = 1;
                /* Headroom is reserved by rte_pktmbuf_alloc(). */
                assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
                /* Buffer is supposed to be empty. */
                assert(rte_pktmbuf_data_len(buf) == 0);
                assert(rte_pktmbuf_pkt_len(buf) == 0);
-               /* sge->addr must be able to store a pointer. */
-               assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-               /* SGE keeps its headroom. */
-               sge->addr = (uintptr_t)
-                       ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
-               sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
-               sge->lkey = rxq->mr->lkey;
-               /* Redundant check for tailroom. */
-               assert(sge->length == rte_pktmbuf_tailroom(buf));
-               /*
-                * Make sure elts index and SGE mbuf pointer can be deduced
-                * from WR ID.
-                */
-               if ((WR_ID(wr->wr_id).id != i) ||
-                   ((void *)((uintptr_t)sge->addr -
-                       WR_ID(wr->wr_id).offset) != buf)) {
-                       rte_errno = EOVERFLOW;
-                       ERROR("%p: cannot store index and offset in WR ID",
-                             (void *)rxq);
-                       sge->addr = 0;
-                       rte_pktmbuf_free(buf);
-                       goto error;
-               }
+               assert(!buf->next);
+               /* Only the first segment keeps headroom. */
+               if (i % sge_n)
+                       buf->data_off = 0;
+               buf->port = rxq->port_id;
+               buf->data_len = rte_pktmbuf_tailroom(buf);
+               buf->pkt_len = rte_pktmbuf_tailroom(buf);
+               buf->nb_segs = 1;
+               /* scat->addr must be able to store a pointer. */
+               assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+               *scat = (struct mlx4_wqe_data_seg){
+                       .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+                       .byte_count = htonl(buf->data_len),
+                       .lkey = htonl(rxq->mr->lkey),
+               };
+               (*rxq->elts)[i] = buf;
        }
-       /* The last WR pointer must be NULL. */
-       (*elts)[(i - 1)].wr.next = NULL;
-       DEBUG("%p: allocated and configured %u single-segment WRs",
-             (void *)rxq, elts_n);
-       rxq->elts_n = elts_n;
-       rxq->elts_head = 0;
-       rxq->elts = elts;
+       DEBUG("%p: allocated and configured %u segments (max %u packets)",
+             (void *)rxq, elts_n, elts_n >> rxq->sge_n);
+       rxq->elts_n = log2above(elts_n);
        return 0;
 error:
-       if (elts != NULL) {
-               assert(pool == NULL);
-               for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-                       struct rxq_elt *elt = &(*elts)[i];
-                       struct rte_mbuf *buf;
-
-                       if (elt->sge.addr == 0)
-                               continue;
-                       assert(WR_ID(elt->wr.wr_id).id == i);
-                       buf = (void *)((uintptr_t)elt->sge.addr -
-                               WR_ID(elt->wr.wr_id).offset);
-                       rte_pktmbuf_free_seg(buf);
-               }
-               rte_free(elts);
+       assert(pool == NULL);
+       elts_n = i;
+       for (i = 0; i != elts_n; ++i) {
+               if ((*rxq->elts)[i] != NULL)
+                       rte_pktmbuf_free_seg((*rxq->elts)[i]);
+               (*rxq->elts)[i] = NULL;
        }
+       rte_free(rxq->elts);
+       rxq->elts = NULL;
        DEBUG("%p: failed, freed everything", (void *)rxq);
        assert(rte_errno > 0);
        return -rte_errno;
@@ -190,26 +160,17 @@
 mlx4_rxq_free_elts(struct rxq *rxq)
 {
        unsigned int i;
-       unsigned int elts_n = rxq->elts_n;
-       struct rxq_elt (*elts)[elts_n] = rxq->elts;
 
        DEBUG("%p: freeing WRs", (void *)rxq);
-       rxq->elts_n = 0;
-       rxq->elts = NULL;
-       if (elts == NULL)
+       if (rxq->elts == NULL)
                return;
-       for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-               struct rxq_elt *elt = &(*elts)[i];
-               struct rte_mbuf *buf;
 
-               if (elt->sge.addr == 0)
-                       continue;
-               assert(WR_ID(elt->wr.wr_id).id == i);
-               buf = (void *)((uintptr_t)elt->sge.addr -
-                       WR_ID(elt->wr.wr_id).offset);
-               rte_pktmbuf_free_seg(buf);
+       for (i = 0; i != (1u << rxq->elts_n); ++i) {
+               if ((*rxq->elts)[i] != NULL)
+                       rte_pktmbuf_free_seg((*rxq->elts)[i]);
+               (*rxq->elts)[i] = NULL;
        }
-       rte_free(elts);
+       rte_free(rxq->elts);
 }
 
 /**
@@ -251,7 +212,8 @@
  *   QP pointer or NULL in case of error and rte_errno is set.
  */
 static struct ibv_qp *
-mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
+mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq,
+                 uint16_t desc, unsigned int sge_n)
 {
        struct ibv_qp *qp;
        struct ibv_qp_init_attr attr = {
@@ -265,7 +227,7 @@
                                        priv->device_attr.max_qp_wr :
                                        desc),
                        /* Max number of scatter/gather elements in a WR. */
-                       .max_recv_sge = 1,
+                       .max_recv_sge = sge_n,
                },
                .qp_type = IBV_QPT_RAW_PACKET,
        };
@@ -307,26 +269,34 @@
                .socket = socket
        };
        struct ibv_qp_attr mod;
-       struct ibv_recv_wr *bad_wr;
        unsigned int mb_len;
        int ret;
 
        (void)conf; /* Thresholds configuration (ignored). */
        mb_len = rte_pktmbuf_data_room_size(mp);
-       if (desc == 0) {
-               rte_errno = EINVAL;
-               ERROR("%p: invalid number of RX descriptors", (void *)dev);
-               goto error;
-       }
        /* Enable scattered packets support for this queue if necessary. */
        assert(mb_len >= RTE_PKTMBUF_HEADROOM);
        if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
            (mb_len - RTE_PKTMBUF_HEADROOM)) {
-               ;
+               tmpl.sge_n = 0;
        } else if (dev->data->dev_conf.rxmode.enable_scatter) {
-               WARN("%p: scattered mode has been requested but is"
-                    " not supported, this may lead to packet loss",
-                    (void *)dev);
+               unsigned int sges_n;
+               unsigned int rx_pkt_len =
+                               dev->data->dev_conf.rxmode.jumbo_frame ?
+                               dev->data->dev_conf.rxmode.max_rx_pkt_len :
+                               ETHER_MTU;
+
+               if (rx_pkt_len < ETHER_MTU)
+                       rx_pkt_len = ETHER_MTU;
+               /* Only the first mbuf keeps the headroom. */
+               rx_pkt_len = rx_pkt_len - mb_len + RTE_PKTMBUF_HEADROOM;
+               /*
+                * Determine the number of SGEs needed for a full packet
+                * and round it to the next power of two.
+                */
+               sges_n = (rx_pkt_len / mb_len) + !!(rx_pkt_len / mb_len) + 1;
+               tmpl.sge_n = log2above(sges_n);
+               desc >>= tmpl.sge_n;
        } else {
                WARN("%p: the requested maximum Rx packet size (%u) is"
                     " larger than a single mbuf (%u) and scattered"
@@ -335,6 +305,8 @@
                     dev->data->dev_conf.rxmode.max_rx_pkt_len,
                     mb_len - RTE_PKTMBUF_HEADROOM);
        }
+       DEBUG("%p: number of sges %u (%u WRs)",
+             (void *)dev, 1 << tmpl.sge_n, desc);
        /* Use the entire RX mempool as the memory region. */
        tmpl.mr = mlx4_mp2mr(priv->pd, mp);
        if (tmpl.mr == NULL) {
@@ -370,7 +342,7 @@
              priv->device_attr.max_qp_wr);
        DEBUG("priv->device_attr.max_sge is %d",
              priv->device_attr.max_sge);
-       tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc);
+       tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc, 1 << tmpl.sge_n);
        if (tmpl.qp == NULL) {
                ERROR("%p: QP creation failure: %s",
                      (void *)dev, strerror(rte_errno));
@@ -389,21 +361,6 @@
                      (void *)dev, strerror(rte_errno));
                goto error;
        }
-       ret = mlx4_rxq_alloc_elts(&tmpl, desc, NULL);
-       if (ret) {
-               ERROR("%p: RXQ allocation failed: %s",
-                     (void *)dev, strerror(rte_errno));
-               goto error;
-       }
-       ret = ibv_post_recv(tmpl.qp, &(*tmpl.elts)[0].wr, &bad_wr);
-       if (ret) {
-               rte_errno = ret;
-               ERROR("%p: ibv_post_recv() failed for WR %p: %s",
-                     (void *)dev,
-                     (void *)bad_wr,
-                     strerror(rte_errno));
-               goto error;
-       }
        mod = (struct ibv_qp_attr){
                .qp_state = IBV_QPS_RTR
        };
@@ -414,14 +371,32 @@
                      (void *)dev, strerror(rte_errno));
                goto error;
        }
+       /* Initialize HW-dependent fields. */
+       tmpl.hw.wqes =
+               (volatile struct mlx4_wqe_data_seg (*)[])
+               (uintptr_t)to_mqp(tmpl.qp)->rq.buf;
+       tmpl.hw.rq_db =
+               (volatile uint32_t *)
+               (uintptr_t)to_mqp(tmpl.qp)->db;
+       tmpl.hw.rq_ci = 0;
        /* Save port ID. */
        tmpl.port_id = dev->data->port_id;
        DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+       ret = mlx4_rxq_alloc_elts(&tmpl, desc << tmpl.sge_n, NULL);
+       if (ret) {
+               ERROR("%p: RXQ allocation failed: %s",
+                     (void *)dev, strerror(rte_errno));
+               goto error;
+       }
        /* Clean up rxq in case we're reinitializing it. */
        DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
        mlx4_rxq_cleanup(rxq);
        *rxq = tmpl;
        DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+       /* Update doorbell counter. */
+       rxq->hw.rq_ci = desc;
+       rte_wmb();
+       *rxq->hw.rq_db = htonl(rxq->hw.rq_ci);
        return 0;
 error:
        ret = rte_errno;
@@ -459,6 +434,12 @@
        struct rxq *rxq = (*priv->rxqs)[idx];
        int ret;
 
+       if (!rte_is_power_of_2(desc)) {
+               desc = 1 << log2above(desc);
+               WARN("%p: increased number of descriptors in RX queue %u"
+                    " to the next power of two (%d)",
+                    (void *)dev, idx, desc);
+       }
        DEBUG("%p: configuring queue %u for %u descriptors",
              (void *)dev, idx, desc);
        if (idx >= priv->rxqs_n) {
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 944cf48..f11c84c 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -348,9 +348,73 @@
 }
 
 /**
- * DPDK callback for Rx.
+ * Get next cqe from HW.
  *
- * The following function doesn't manage scattered packets.
+ * @param cq
+ *   Pointer to CQ structure.
+ *
+ * @return
+ *   Pointer to the CQ element, or NULL if there is none.
+ */
+static inline struct mlx4_cqe *
+mlx4_cq_get_next_cqe(struct mlx4_cq *cq)
+{
+       int cqe_off;
+       struct mlx4_cqe *cqe;
+       const int cqe_size = cq->cqe_size;
+
+       /* CQE offset is 32 bytes when cqe_size is 64. */
+       cqe_off = (cqe_size & 64) >> 1;
+       cqe = (struct mlx4_cqe *)
+               ((uint8_t *)cq->buf.buf +
+               (cq->cons_index & cq->ibv_cq.cqe) * cqe_size +
+               cqe_off);
+       /* Return NULL if HW hasn't produced cqe */
+       if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+           !!(cq->cons_index & (cq->ibv_cq.cqe + 1)))
+               return NULL;
+       return cqe;
+}
+
+/**
+ * Poll one CQE from CQ.
+ *
+ * @param cq
+ *   Pointer to ibv CQ structure.
+ * @param[out] out
+ *   The CQE that was just polled, if any.
+ *
+ * @return
+ *   byte_cnt of the cqe, 0 in case there is no completion,
+ *   negative on failure.
+ */
+static int
+mlx4_cq_poll_one(struct rxq *rxq,
+                struct mlx4_cqe **out)
+{
+       int ret = 0;
+       struct mlx4_cqe *cqe;
+       struct mlx4_cq *cq = to_mcq(rxq->cq);
+
+       cqe = mlx4_cq_get_next_cqe(cq);
+       if (cqe) {
+               /*
+                * Make sure we read CQ entry contents after we've checked the
+                * ownership bit.
+                */
+               rte_rmb();
+               assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+               assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+                      MLX4_CQE_OPCODE_ERROR);
+               ret = ntohl(cqe->byte_cnt);
+               ++cq->cons_index;
+       }
+       *out = cqe;
+       return ret;
+}
+
+/**
+ * DPDK callback for RX with scattered packets support.
  *
  * @param dpdk_rxq
  *   Generic pointer to Rx queue structure.
@@ -365,121 +429,109 @@
 uint16_t
 mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-       struct rxq *rxq = (struct rxq *)dpdk_rxq;
-       struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-       const unsigned int elts_n = rxq->elts_n;
-       unsigned int elts_head = rxq->elts_head;
-       struct ibv_wc wcs[pkts_n];
-       struct ibv_recv_wr *wr_head = NULL;
-       struct ibv_recv_wr **wr_next = &wr_head;
-       struct ibv_recv_wr *wr_bad = NULL;
-       unsigned int i;
-       unsigned int pkts_ret = 0;
-       int ret;
+       struct rxq *rxq = dpdk_rxq;
+       const unsigned int wr_cnt = (1 << rxq->elts_n) - 1;
+       const unsigned int sge_n = rxq->sge_n;
+       struct rte_mbuf *pkt = NULL;
+       struct rte_mbuf *seg = NULL;
+       unsigned int i = 0;
+       unsigned int rq_ci = (rxq->hw.rq_ci << sge_n);
+       int len = 0;
 
-       ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-       if (unlikely(ret == 0))
-               return 0;
-       if (unlikely(ret < 0)) {
-               DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-                     (void *)rxq, ret);
-               return 0;
-       }
-       assert(ret <= (int)pkts_n);
-       /* For each work completion. */
-       for (i = 0; i != (unsigned int)ret; ++i) {
-               struct ibv_wc *wc = &wcs[i];
-               struct rxq_elt *elt = &(*elts)[elts_head];
-               struct ibv_recv_wr *wr = &elt->wr;
-               uint64_t wr_id = wr->wr_id;
-               uint32_t len = wc->byte_len;
-               struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
-                       WR_ID(wr_id).offset);
-               struct rte_mbuf *rep;
+       while (pkts_n) {
+               struct mlx4_cqe *cqe;
+               unsigned int idx = rq_ci & wr_cnt;
+               struct rte_mbuf *rep = (*rxq->elts)[idx];
+               volatile struct mlx4_wqe_data_seg *scat =
+                                       &(*rxq->hw.wqes)[idx];
 
-               /* Sanity checks. */
-               assert(WR_ID(wr_id).id < rxq->elts_n);
-               assert(wr_id == wc->wr_id);
-               assert(wr->sg_list == &elt->sge);
-               assert(wr->num_sge == 1);
-               assert(elts_head < rxq->elts_n);
-               assert(rxq->elts_head < rxq->elts_n);
-               /*
-                * Fetch initial bytes of packet descriptor into a
-                * cacheline while allocating rep.
-                */
-               rte_mbuf_prefetch_part1(seg);
-               rte_mbuf_prefetch_part2(seg);
-               /* Link completed WRs together for repost. */
-               *wr_next = wr;
-               wr_next = &wr->next;
-               if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-                       /* Whatever, just repost the offending WR. */
-                       DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work completion"
-                             " status (%d): %s",
-                             (void *)rxq, wr_id, wc->status,
-                             ibv_wc_status_str(wc->status));
-                       /* Increment dropped packets counter. */
-                       ++rxq->stats.idropped;
-                       goto repost;
-               }
+               /* Update the 'next' pointer of the previous segment */
+               if (pkt)
+                       seg->next = rep;
+               seg = rep;
+               rte_prefetch0(seg);
+               rte_prefetch0(scat);
                rep = rte_mbuf_raw_alloc(rxq->mp);
                if (unlikely(rep == NULL)) {
-                       /*
-                        * Unable to allocate a replacement mbuf,
-                        * repost WR.
-                        */
-                       DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
-                             " can't allocate a new mbuf",
-                             (void *)rxq, WR_ID(wr_id).id);
-                       /* Increase out of memory counters. */
                        ++rxq->stats.rx_nombuf;
-                       ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-                       goto repost;
+                       if (!pkt) {
+                               /*
+                                * no buffers before we even started,
+                                * bail out silently.
+                                */
+                               break;
+                       }
+                       while (pkt != seg) {
+                               assert(pkt != (*rxq->elts)[idx]);
+                               rep = pkt->next;
+                               pkt->next = NULL;
+                               pkt->nb_segs = 1;
+                               rte_mbuf_raw_free(pkt);
+                               pkt = rep;
+                       }
+                       break;
                }
-               /* Reconfigure sge to use rep instead of seg. */
-               elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-               assert(elt->sge.lkey == rxq->mr->lkey);
-               WR_ID(wr->wr_id).offset =
-                       (((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
-                        (uintptr_t)rep);
-               assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
-               /* Update seg information. */
-               seg->data_off = RTE_PKTMBUF_HEADROOM;
-               seg->nb_segs = 1;
-               seg->port = rxq->port_id;
-               seg->next = NULL;
-               seg->pkt_len = len;
+               if (!pkt) {
+                       /* Look for a new packet. */
+                       len = mlx4_cq_poll_one(rxq, &cqe);
+                       if (!len) {
+                               rte_mbuf_raw_free(rep);
+                               break;
+                       }
+                       if (unlikely(len < 0)) {
+                               /* RX error, packet is likely too large. */
+                               rte_mbuf_raw_free(rep);
+                               ++rxq->stats.idropped;
+                               goto skip;
+                       }
+                       pkt = seg;
+                       pkt->packet_type = 0;
+                       pkt->ol_flags = 0;
+                       pkt->pkt_len = len;
+               }
+               rep->nb_segs = 1;
+               rep->port = rxq->port_id;
+               rep->data_len = seg->data_len;
+               rep->data_off = seg->data_off;
+               (*rxq->elts)[idx] = rep;
+               /*
+                * Fill NIC descriptor with the new buffer. The lkey and size
+                * of the buffers are already known, only the buffer address
+                * changes.
+                */
+               scat->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+               if (len > seg->data_len) {
+                       len -= seg->data_len;
+                       ++pkt->nb_segs;
+                       ++rq_ci;
+                       continue;
+               }
+               /* The last segment. */
                seg->data_len = len;
-               seg->packet_type = 0;
-               seg->ol_flags = 0;
+               /* Increment bytes counter. */
+               rxq->stats.ibytes += pkt->pkt_len;
                /* Return packet. */
-               *(pkts++) = seg;
-               ++pkts_ret;
-               /* Increase bytes counter. */
-               rxq->stats.ibytes += len;
-repost:
-               if (++elts_head >= elts_n)
-                       elts_head = 0;
-               continue;
+               *(pkts++) = pkt;
+               pkt = NULL;
+               --pkts_n;
+               ++i;
+skip:
+               /* Align consumer index to the next stride. */
+               rq_ci >>= sge_n;
+               ++rq_ci;
+               rq_ci <<= sge_n;
        }
-       if (unlikely(i == 0))
+       if (unlikely((i == 0) && ((rq_ci >> sge_n) == rxq->hw.rq_ci)))
                return 0;
-       /* Repost WRs. */
-       *wr_next = NULL;
-       assert(wr_head);
-       ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
-       if (unlikely(ret)) {
-               /* Inability to repost WRs is fatal. */
-               DEBUG("%p: recv_burst(): failed (ret=%d)",
-                     (void *)rxq->priv,
-                     ret);
-               abort();
-       }
-       rxq->elts_head = elts_head;
-       /* Increase packets counter. */
-       rxq->stats.ipackets += pkts_ret;
-       return pkts_ret;
+       /* Update the consumer index. */
+       rxq->hw.rq_ci = rq_ci >> sge_n;
+       rte_wmb();
+       *rxq->hw.rq_db = htonl(rxq->hw.rq_ci);
+       *to_mcq(rxq->cq)->set_ci_db =
+               htonl(to_mcq(rxq->cq)->cons_index & 0xffffff);
+       /* Increment packets counter. */
+       rxq->stats.ipackets += i;
+       return i;
 }
 
 /**
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index a3d972b..077fdd8 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -70,13 +70,6 @@ struct mlx4_rxq_stats {
        uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
 };
 
-/** Rx element. */
-struct rxq_elt {
-       struct ibv_recv_wr wr; /**< Work request. */
-       struct ibv_sge sge; /**< Scatter/gather element. */
-       /* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */
-};
-
 /** Rx queue descriptor. */
 struct rxq {
        struct priv *priv; /**< Back pointer to private data. */
@@ -86,9 +79,14 @@ struct rxq {
        struct ibv_qp *qp; /**< Queue pair. */
        struct ibv_comp_channel *channel; /**< Rx completion channel. */
        unsigned int port_id; /**< Port ID for incoming packets. */
-       unsigned int elts_n; /**< (*elts)[] length. */
-       unsigned int elts_head; /**< Current index in (*elts)[]. */
-       struct rxq_elt (*elts)[]; /**< Rx elements. */
+       unsigned int elts_n; /**< Log 2 of the number of mbufs. */
+       struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+       struct {
+               volatile struct mlx4_wqe_data_seg(*wqes)[];
+               volatile uint32_t *rq_db;
+               uint16_t rq_ci;
+       } hw;
+       unsigned int sge_n; /**< Log 2 of the number of SGEs. */
        struct mlx4_rxq_stats stats; /**< Rx queue counters. */
        unsigned int socket; /**< CPU socket ID for allocations. */
 };
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index e74b61b..a37a3e5 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -102,4 +102,24 @@
 
 int mlx4_fd_set_non_blocking(int fd);
 
+/**
+ * Return nearest power of two above input value.
+ *
+ * @param v
+ *   Input value.
+ *
+ * @return
+ *   Nearest power of two above input value.
+ */
+static inline unsigned int
+log2above(unsigned int v)
+{
+       unsigned int l;
+       unsigned int r;
+
+       for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+               r |= (v & 1);
+       return l + r;
+}
+
 #endif /* MLX4_UTILS_H_ */
-- 
1.8.3.1