Mini (compressed) CQEs are returned by the NIC when PCI back pressure is
detected. In that case, the first CQE64 contains common packet information
and is followed by a number of CQE8 entries providing the rest, then by a
matching number of empty CQE64 entries to be used by software for
decompression.

Before decompression:

      0           1          2           6         7         8
  +-------+  +---------+ +-------+   +-------+ +-------+ +-------+
  | CQE64 |  |  CQE64  | | CQE64 |   | CQE64 | | CQE64 | | CQE64 |
  |-------|  |---------| |-------|   |-------| |-------| |-------|
  | ..... |  | cqe8[0] | |       | . |       | |       | | ..... |
  | ..... |  | cqe8[1] | |       | . |       | |       | | ..... |
  | ..... |  | ....... | |       | . |       | |       | | ..... |
  | ..... |  | cqe8[7] | |       |   |       | |       | | ..... |
  +-------+  +---------+ +-------+   +-------+ +-------+ +-------+

After decompression:

      0          1     ...     8
  +-------+  +-------+     +-------+
  | CQE64 |  | CQE64 |     | CQE64 |
  |-------|  |-------|     |-------|
  | ..... |  | ..... |  .  | ..... |
  | ..... |  | ..... |  .  | ..... |
  | ..... |  | ..... |  .  | ..... |
  | ..... |  | ..... |     | ..... |
  +-------+  +-------+     +-------+

This patch does not perform the entire decompression step, as that would be
really expensive. Instead, the first CQE64 is consumed and an internal
context is maintained to interpret the following CQE8 entries directly.

Intermediate empty CQE64 entries are handed back to HW without further
processing.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
Signed-off-by: Olga Shern <olgas at mellanox.com>
Signed-off-by: Vasily Philipov <vasilyf at mellanox.com>
---
 doc/guides/nics/mlx5.rst     |   6 +
 drivers/net/mlx5/mlx5.c      |  25 ++++-
 drivers/net/mlx5/mlx5.h      |   1 +
 drivers/net/mlx5/mlx5_rxq.c  |   9 +-
 drivers/net/mlx5/mlx5_rxtx.c | 260 ++++++++++++++++++++++++++++++++-----------
 drivers/net/mlx5/mlx5_rxtx.h |  11 ++
 drivers/net/mlx5/mlx5_txq.c  |   5 +
 7 files changed, 248 insertions(+), 69 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 3a07928..756153b 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -148,6 +148,12 @@ Run-time configuration

 - **ethtool** operations on related kernel interfaces also affect the PMD.

+- ``rxq_cqe_comp_en`` parameter [int]
+
+  A nonzero value enables the compression of CQE on RX side. This feature
+  saves PCI bandwidth and improves performance at the cost of a slightly
+  higher CPU usage.  Enabled by default.
+
 Prerequisites
 -------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 98884f7..ec4e0b6 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,6 +69,9 @@
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"

+/* Device parameter to enable RX completion queue compression. */
+#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
+
 /**
  * Retrieve integer value from environment variable.
  *
@@ -256,12 +259,21 @@ static int
 mlx5_args_check(const char *key, const char *val, void *opaque)
 {
        struct priv *priv = opaque;
+       unsigned long tmp;

-       /* No parameters are expected at the moment. */
-       (void)priv;
-       (void)val;
-       WARN("%s: unknown parameter", key);
-       return -EINVAL;
+       errno = 0;
+       tmp = strtoul(val, NULL, 0);
+       if (errno) {
+               WARN("%s: \"%s\" is not a valid integer", key, val);
+               return errno;
+       }
+       if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0)
+               priv->cqe_comp = !!tmp;
+       else {
+               WARN("%s: unknown parameter", key);
+               return -EINVAL;
+       }
+       return 0;
 }

 /**
@@ -279,7 +291,7 @@ static int
 mlx5_args(struct priv *priv, struct rte_devargs *devargs)
 {
        static const char *params[] = {
-               NULL,
+               MLX5_RXQ_CQE_COMP_EN,
        };
        struct rte_kvargs *kvlist;
        int ret = 0;
@@ -474,6 +486,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
                priv->port = port;
                priv->pd = pd;
                priv->mtu = ETHER_MTU;
+               priv->cqe_comp = 1; /* Enable compression by default. */
                err = mlx5_args(priv, pci_dev->devargs);
                if (err) {
                        ERROR("failed to process device arguments: %s",
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3dca03d..8f5a6df 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -111,6 +111,7 @@ struct priv {
        unsigned int hw_padding:1; /* End alignment padding is supported. */
        unsigned int sriov:1; /* This is a VF or PF with VF devices. */
        unsigned int mps:1; /* Whether multi-packet send is supported. */
+       unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
        unsigned int pending_alarm:1; /* An alarm is pending. */
        /* RX/TX queues. */
        unsigned int rxqs_n; /* RX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index a8f68a3..6881cdd 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -897,6 +897,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
                return EINVAL;
        }
        tmpl->rxq.rq_db = rwq->rq.db;
+       tmpl->rxq.cqe_n = ibcq->cqe + 1;
        tmpl->rxq.cq_ci = 0;
        tmpl->rxq.rq_ci = 0;
        tmpl->rxq.cq_db = cq->dbrec;
@@ -955,6 +956,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
        } attr;
        enum ibv_exp_query_intf_status status;
        unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+       unsigned int cqe_n = desc - 1;
        int ret = 0;

        (void)conf; /* Thresholds configuration (ignored). */
@@ -994,7 +996,12 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
                .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
                .res_domain = tmpl.rd,
        };
-       tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+       if (priv->cqe_comp) {
+               attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
+               attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
+               cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
+       }
+       tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
                                    &attr.cq);
        if (tmpl.cq == NULL) {
                ret = ENOMEM;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 95bf981..30d413c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -69,44 +69,85 @@
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"

-static inline volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
-         unsigned int cqes_n, uint16_t *ci)
-         __attribute__((always_inline));
+#ifndef NDEBUG
+
+/**
+ * Verify or set magic value in CQE.
+ *
+ * @param cqe
+ *   Pointer to CQE.
+ *
+ * @return
+ *   0 the first time.
+ */
+static inline int
+check_cqe64_seen(volatile struct mlx5_cqe64 *cqe)
+{
+       static const uint8_t magic[] = "seen";
+       volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40;
+       int ret = 1;
+       unsigned int i;
+
+       for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
+               if (!ret || !(ret = ((*buf)[i] == magic[i])))
+                       (*buf)[i] = magic[i];
+       return ret;
+}
+
+#endif /* NDEBUG */

 static inline int
-rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+           unsigned int cqes_n, const uint16_t ci)
+           __attribute__((always_inline));

-static volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
-         unsigned int cqes_n, uint16_t *ci)
+/**
+ * Check whether CQE is valid.
+ *
+ * @param cqe
+ *   Pointer to CQE.
+ * @param cqes_n
+ *   Size of completion queue.
+ * @param ci
+ *   Consumer index.
+ *
+ * @return
+ *   0 on success, 1 on failure.
+ */
+static inline int
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+               unsigned int cqes_n, const uint16_t ci)
 {
-       volatile struct mlx5_cqe64 *cqe;
-       uint16_t idx = *ci;
-       uint8_t op_own;
-
-       cqe = &cqes[idx & (cqes_n - 1)].cqe64;
-       op_own = cqe->op_own;
-       if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
-               return NULL;
-       } else if (unlikely(op_own & 0x80)) {
-               switch (op_own >> 4) {
-               case MLX5_CQE_INVALID:
-                       return NULL; /* No CQE */
-               case MLX5_CQE_REQ_ERR:
-                       return cqe;
-               case MLX5_CQE_RESP_ERR:
-                       ++(*ci);
-                       return NULL;
-               default:
-                       return NULL;
-               }
-       }
-       if (cqe) {
-               *ci = idx + 1;
-               return cqe;
+       uint16_t idx = ci & cqes_n;
+       uint8_t op_own = cqe->op_own;
+       uint8_t op_owner = MLX5_CQE_OWNER(op_own);
+       uint8_t op_code = MLX5_CQE_OPCODE(op_own);
+
+       if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
+               return 1; /* No CQE. */
+#ifndef NDEBUG
+       if ((op_code == MLX5_CQE_RESP_ERR) ||
+           (op_code == MLX5_CQE_REQ_ERR)) {
+               volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
+               uint8_t syndrome = err_cqe->syndrome;
+
+               if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
+                   (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
+                       return 0;
+               if (!check_cqe64_seen(cqe))
+                       ERROR("unexpected CQE error %u (0x%02x)"
+                             " syndrome 0x%02x",
+                             op_code, op_code, syndrome);
+               return 1;
+       } else if ((op_code != MLX5_CQE_RESP_SEND) &&
+                  (op_code != MLX5_CQE_REQ)) {
+               if (!check_cqe64_seen(cqe))
+                       ERROR("unexpected CQE opcode %u (0x%02x)",
+                             op_code, op_code);
+               return 1;
        }
-       return NULL;
+#endif /* NDEBUG */
+       return 0;
 }

 /**
@@ -125,20 +166,34 @@ txq_complete(struct txq *txq)
 {
        const unsigned int elts_n = txq->elts_n;
        const unsigned int cqe_n = txq->cqe_n;
+       const unsigned int cqe_cnt = cqe_n - 1;
        uint16_t elts_free = txq->elts_tail;
        uint16_t elts_tail;
        uint16_t cq_ci = txq->cq_ci;
        unsigned int wqe_ci = (unsigned int)-1;
-       int ret = 0;

-       while (ret == 0) {
-               volatile struct mlx5_cqe64 *cqe;
+       do {
+               unsigned int idx = cq_ci & cqe_cnt;
+               volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx].cqe64;

-               cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
-               if (cqe == NULL)
+               if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
                        break;
+#ifndef NDEBUG
+               if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
+                       if (!check_cqe64_seen(cqe))
+                               ERROR("unexpected compressed CQE, TX stopped");
+                       return;
+               }
+               if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
+                   (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
+                       if (!check_cqe64_seen(cqe))
+                               ERROR("unexpected error CQE, TX stopped");
+                       return;
+               }
+#endif /* NDEBUG */
                wqe_ci = ntohs(cqe->wqe_counter);
-       }
+               ++cq_ci;
+       } while (1);
        if (unlikely(wqe_ci == (unsigned int)-1))
                return;
        /* Free buffers. */
@@ -507,6 +562,97 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
 }

 /**
+ * Get size of the next packet for a given CQE. For compressed CQEs, the
+ * consumer index is updated only once all packets of the current one have
+ * been processed.
+ *
+ * @param rxq
+ *   Pointer to RX queue.
+ * @param cqe
+ *   CQE to process.
+ *
+ * @return
+ *   Packet size in bytes (0 if there is none), -1 in case of completion
+ *   with error.
+ */
+static inline int
+mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe,
+                uint16_t cqe_cnt)
+{
+       struct rxq_zip *zip = &rxq->zip;
+       uint16_t cqe_n = cqe_cnt + 1;
+       int len = 0;
+
+       /* Process compressed data in the CQE and mini arrays. */
+       if (zip->ai) {
+               volatile struct mlx5_mini_cqe8 (*mc)[8] =
+                       (volatile struct mlx5_mini_cqe8 (*)[8])
+                       (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].cqe64);
+
+               len = ntohl((*mc)[zip->ai & 7].byte_cnt);
+               if ((++zip->ai & 7) == 0) {
+                       /* Increment consumer index to skip the number of
+                        * CQEs consumed. Hardware leaves holes in the CQ
+                        * ring for software use. */
+                       zip->ca = zip->na;
+                       zip->na += 8;
+               }
+               if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+                       uint16_t idx = rxq->cq_ci;
+                       uint16_t end = zip->cq_ci;
+
+                       while (idx != end) {
+                               (*rxq->cqes)[idx & cqe_cnt].cqe64.op_own =
+                                       MLX5_CQE_INVALIDATE;
+                               ++idx;
+                       }
+                       rxq->cq_ci = zip->cq_ci;
+                       zip->ai = 0;
+               }
+       /* No compressed data, get next CQE and verify if it is compressed. */
+       } else {
+               int ret;
+               int8_t op_own;
+
+               ret = check_cqe64(cqe, cqe_n, rxq->cq_ci);
+               if (unlikely(ret == 1))
+                       return 0;
+               ++rxq->cq_ci;
+               op_own = cqe->op_own;
+               if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+                       volatile struct mlx5_mini_cqe8 (*mc)[8] =
+                               (volatile struct mlx5_mini_cqe8 (*)[8])
+                               (uintptr_t)&(*rxq->cqes)[rxq->cq_ci &
+                                                        cqe_cnt].cqe64;
+
+                       /* Fix endianness. */
+                       zip->cqe_cnt = ntohl(cqe->byte_cnt);
+                       /*
+                        * Current mini array position is the one returned by
+                        * check_cqe64().
+                        *
+                        * If completion comprises several mini arrays, as a
+                        * special case the second one is located 7 CQEs after
+                        * the initial CQE instead of 8 for subsequent ones.
+                        */
+                       zip->ca = rxq->cq_ci & cqe_cnt;
+                       zip->na = zip->ca + 7;
+                       /* Compute the next non compressed CQE. */
+                       --rxq->cq_ci;
+                       zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+                       /* Get packet size to return. */
+                       len = ntohl((*mc)[0].byte_cnt);
+                       zip->ai = 1;
+               } else
+                       len = ntohl(cqe->byte_cnt);
+               /* Error while receiving packet. */
+               if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
+                       return -1;
+       }
+       return len;
+}
+
+/**
  * Translate RX completion flags to offload flags.
  *
  * @param[in] rxq
@@ -554,26 +700,6 @@ rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
 }

 /**
- * Get size of the next packet.
- *
- * @param rxq
- *   RX queue to fetch packet from.
- *
- * @return
- *   Packet size in bytes.
- */
-static inline int __attribute__((always_inline))
-rx_poll_len(struct rxq *rxq)
-{
-       volatile struct mlx5_cqe64 *cqe;
-
-       cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
-       if (cqe)
-               return ntohl(cqe->byte_cnt);
-       return 0;
-}
-
-/**
  * DPDK callback for RX.
  *
  * @param dpdk_rxq
@@ -595,15 +721,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
        unsigned int rq_ci = rxq->rq_ci;
        const unsigned int elts_n = rxq->elts_n;
        const unsigned int wqe_cnt = elts_n - 1;
+       const unsigned int cqe_cnt = rxq->cqe_n - 1;

        for (i = 0; (i != pkts_n); ++i) {
                unsigned int idx = rq_ci & wqe_cnt;
+               int len;
                struct rte_mbuf *rep;
                struct rte_mbuf *pkt;
-               unsigned int len;
                volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
                volatile struct mlx5_cqe64 *cqe =
-                       &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
+                       &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;

                pkt = (*rxq->elts)[idx];
                rte_prefetch0(cqe);
@@ -616,12 +743,20 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                NB_SEGS(rep) = 1;
                PORT(rep) = rxq->port_id;
                NEXT(rep) = NULL;
-               len = rx_poll_len(rxq);
+               len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
                if (unlikely(len == 0)) {
                        rte_mbuf_refcnt_set(rep, 0);
                        __rte_mbuf_raw_free(rep);
                        break;
                }
+               if (unlikely(len == -1)) {
+                       /* RX error, packet is likely too large. */
+                       rte_mbuf_refcnt_set(rep, 0);
+                       __rte_mbuf_raw_free(rep);
+                       ++rxq->stats.idropped;
+                       --i;
+                       goto skip;
+               }
                /* Fill NIC descriptor with the new buffer.  The lkey and size
                 * of the buffers are already known, only the buffer address
                 * changes. */
@@ -651,6 +786,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                /* Return packet. */
                *(pkts++) = pkt;
                ++pkts_ret;
+       skip:
                ++rq_ci;
        }
        if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 6b3bb2d..77b0fde 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -91,6 +91,15 @@ struct fdir_queue {

 struct priv;

+/* Compressed CQE context. */
+struct rxq_zip {
+       uint16_t ai; /* Array index. */
+       uint16_t ca; /* Current array index. */
+       uint16_t na; /* Next array index. */
+       uint16_t cq_ci; /* The next CQE. */
+       uint32_t cqe_cnt; /* Number of CQEs. */
+};
+
 /* RX queue descriptor. */
 struct rxq {
        unsigned int csum:1; /* Enable checksum offloading. */
@@ -100,9 +109,11 @@ struct rxq {
        uint16_t rq_ci;
        uint16_t cq_ci;
        uint16_t elts_n;
+       uint16_t cqe_n; /* Number of CQ elements. */
        uint16_t port_id;
        volatile struct mlx5_wqe_data_seg(*wqes)[];
        volatile struct mlx5_cqe(*cqes)[];
+       struct rxq_zip zip; /* Compressed context. */
        volatile uint32_t *rq_db;
        volatile uint32_t *cq_db;
        struct rte_mbuf *(*elts)[];
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbf9c04..ddcd6b6 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -268,6 +268,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
        enum ibv_exp_query_intf_status status;
        int ret = 0;

+       if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+               ret = ENOTSUP;
+               ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
+               goto error;
+       }
        (void)conf; /* Thresholds configuration (ignored). */
        tmpl.txq.elts_n = desc;
        /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
-- 
2.1.4

Reply via email to