Same technique as in some Intel drivers, for arches where PAGE_SIZE = 4096.

In most cases, pages are reused because they were consumed
before we could loop around the RX ring.
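
The reuse test itself is cheap. As a rough sketch (mirroring the checks
added in mlx4_en_complete_rx_desc() below, with a hypothetical helper
name), a half-page frag is flipped to the other half of its page and the
page kept mapped only when the driver holds the sole reference and the
page is local and not from the pfmemalloc reserves:

/* Hypothetical helper, not part of this patch: conditions under which
 * a PAGE_SIZE/2 stride page can be reused for the next frame.
 */
static bool mlx4_page_is_reusable(struct page *page)
{
	return page_count(page) == 1 &&
	       !page_is_pfmemalloc(page) &&
	       page_to_nid(page) == numa_mem_id();
}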

This brings back performance, and even improves it:
a single TCP flow reaches 30 Gbit on my hosts.

v2: added a full memset() in mlx4_en_free_frag(), as Tariq found it is needed
when switching to a large MTU, since priv->log_rx_info can change dynamically.
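
For context, the frag entries of a descriptor are located by shifting
into the flat rx_info buffer (as mlx4_en_prepare_rx_desc() does below);
a minimal sketch, using a hypothetical helper name:

/* Hypothetical helper, for illustration only: each descriptor owns a
 * (1 << log_rx_info)-byte slot in the flat rx_info buffer.  If
 * log_rx_info changes (e.g. after an MTU change), a slot can land on
 * bytes written under the old layout, so every field must have been
 * cleared on free; hence the full memset() in mlx4_en_free_frag().
 */
static struct mlx4_en_rx_alloc *
mlx4_rx_frags(const struct mlx4_en_rx_ring *ring, u32 index, u8 log_rx_info)
{
	return ring->rx_info + (index << log_rx_info);
}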

Signed-off-by: Eric Dumazet <eduma...@google.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 258 +++++++++------------------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |   1 -
 2 files changed, 82 insertions(+), 177 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 069ea09185fb0669d5c1f9b1b88f517b2d69c5ed..5edd0cf2999cbde37b3404aafd700584696af940 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -50,10 +50,9 @@
 
 #include "mlx4_en.h"
 
-static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
-                           struct mlx4_en_rx_alloc *page_alloc,
-                           const struct mlx4_en_frag_info *frag_info,
-                           gfp_t gfp)
+static int mlx4_alloc_page(struct mlx4_en_priv *priv,
+                          struct mlx4_en_rx_alloc *frag,
+                          gfp_t gfp)
 {
        struct page *page;
        dma_addr_t dma;
@@ -63,145 +62,46 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
                return -ENOMEM;
        dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir);
        if (unlikely(dma_mapping_error(priv->ddev, dma))) {
-               put_page(page);
+               __free_page(page);
                return -ENOMEM;
        }
-       page_alloc->page = page;
-       page_alloc->dma = dma;
-       page_alloc->page_offset = 0;
-       /* Not doing get_page() for each frag is a big win
-        * on asymetric workloads. Note we can not use atomic_set().
-        */
-       page_ref_add(page, PAGE_SIZE / frag_info->frag_stride - 1);
+       frag->page = page;
+       frag->dma = dma;
+       frag->page_offset = priv->rx_headroom;
        return 0;
 }
 
 static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
                               struct mlx4_en_rx_desc *rx_desc,
                               struct mlx4_en_rx_alloc *frags,
-                              struct mlx4_en_rx_alloc *ring_alloc,
                               gfp_t gfp)
 {
-       struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
-       const struct mlx4_en_frag_info *frag_info;
-       struct page *page;
        int i;
 
-       for (i = 0; i < priv->num_frags; i++) {
-               frag_info = &priv->frag_info[i];
-               page_alloc[i] = ring_alloc[i];
-               page_alloc[i].page_offset += frag_info->frag_stride;
-
-               if (page_alloc[i].page_offset + frag_info->frag_stride <=
-                   PAGE_SIZE)
-                       continue;
-
-               if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
-                                             frag_info, gfp)))
-                       goto out;
-       }
-
-       for (i = 0; i < priv->num_frags; i++) {
-               frags[i] = ring_alloc[i];
-               frags[i].page_offset += priv->rx_headroom;
-               rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
-                                                   frags[i].page_offset);
-               ring_alloc[i] = page_alloc[i];
-       }
-
-       return 0;
-
-out:
-       while (i--) {
-               if (page_alloc[i].page != ring_alloc[i].page) {
-                       dma_unmap_page(priv->ddev, page_alloc[i].dma,
-                                      PAGE_SIZE, priv->dma_dir);
-                       page = page_alloc[i].page;
-                       /* Revert changes done by mlx4_alloc_pages */
-                       page_ref_sub(page, PAGE_SIZE /
-                                          priv->frag_info[i].frag_stride - 1);
-                       put_page(page);
-               }
-       }
-       return -ENOMEM;
-}
-
-static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
-                             struct mlx4_en_rx_alloc *frags,
-                             int i)
-{
-       const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-       u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
-
-
-       if (next_frag_end > PAGE_SIZE)
-               dma_unmap_page(priv->ddev, frags[i].dma, PAGE_SIZE,
-                              priv->dma_dir);
-
-       if (frags[i].page)
-               put_page(frags[i].page);
-}
-
-static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
-                                 struct mlx4_en_rx_ring *ring)
-{
-       int i;
-       struct mlx4_en_rx_alloc *page_alloc;
-
-       for (i = 0; i < priv->num_frags; i++) {
-               const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-
-               if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
-                                    frag_info, GFP_KERNEL | __GFP_COLD))
-                       goto out;
-
-               en_dbg(DRV, priv, "  frag %d allocator: - frags:%d\n",
-                      i, page_ref_count(ring->page_alloc[i].page));
+       for (i = 0; i < priv->num_frags; i++, frags++) {
+               if (!frags->page && mlx4_alloc_page(priv, frags, gfp))
+                       return -ENOMEM;
+               rx_desc->data[i].addr = cpu_to_be64(frags->dma +
+                                                   frags->page_offset);
        }
        return 0;
-
-out:
-       while (i--) {
-               struct page *page;
-
-               page_alloc = &ring->page_alloc[i];
-               dma_unmap_page(priv->ddev, page_alloc->dma,
-                              PAGE_SIZE, priv->dma_dir);
-               page = page_alloc->page;
-               /* Revert changes done by mlx4_alloc_pages */
-               page_ref_sub(page, PAGE_SIZE /
-                                  priv->frag_info[i].frag_stride - 1);
-               put_page(page);
-               page_alloc->page = NULL;
-       }
-       return -ENOMEM;
 }
 
-static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
-                                     struct mlx4_en_rx_ring *ring)
+static void mlx4_en_free_frag(const struct mlx4_en_priv *priv,
+                             struct mlx4_en_rx_alloc *frag)
 {
-       struct mlx4_en_rx_alloc *page_alloc;
-       int i;
-
-       for (i = 0; i < priv->num_frags; i++) {
-               const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-
-               page_alloc = &ring->page_alloc[i];
-               en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
-                      i, page_count(page_alloc->page));
-
-               dma_unmap_page(priv->ddev, page_alloc->dma,
+       if (frag->page) {
+               dma_unmap_page(priv->ddev, frag->dma,
                               PAGE_SIZE, priv->dma_dir);
-               while (page_alloc->page_offset + frag_info->frag_stride <
-                      PAGE_SIZE) {
-                       put_page(page_alloc->page);
-                       page_alloc->page_offset += frag_info->frag_stride;
-               }
-               page_alloc->page = NULL;
+               __free_page(frag->page);
        }
+       /* We need to clear all fields, otherwise a change of priv->log_rx_info
+        * could lead to seeing garbage later in frag->page.
+        */
+       memset(frag, 0, sizeof(*frag));
 }
 
-static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
+static void mlx4_en_init_rx_desc(const struct mlx4_en_priv *priv,
                                 struct mlx4_en_rx_ring *ring, int index)
 {
        struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
@@ -235,19 +135,22 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
                                        (index << priv->log_rx_info);
 
        if (ring->page_cache.index > 0) {
-               ring->page_cache.index--;
-               frags[0].page = ring->page_cache.buf[ring->page_cache.index].page;
-               frags[0].dma  = ring->page_cache.buf[ring->page_cache.index].dma;
-               frags[0].page_offset = XDP_PACKET_HEADROOM;
-               rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
-                                                   frags[0].page_offset);
+               /* XDP uses a single page per frame */
+               if (!frags->page) {
+                       ring->page_cache.index--;
+                       frags->page = ring->page_cache.buf[ring->page_cache.index].page;
+                       frags->dma  = ring->page_cache.buf[ring->page_cache.index].dma;
+               }
+               frags->page_offset = XDP_PACKET_HEADROOM;
+               rx_desc->data[0].addr = cpu_to_be64(frags->dma +
+                                                   XDP_PACKET_HEADROOM);
                return 0;
        }
 
-       return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
+       return mlx4_en_alloc_frags(priv, rx_desc, frags, gfp);
 }
 
-static inline bool mlx4_en_is_ring_empty(struct mlx4_en_rx_ring *ring)
+static bool mlx4_en_is_ring_empty(const struct mlx4_en_rx_ring *ring)
 {
        return ring->prod == ring->cons;
 }
@@ -257,7 +160,8 @@ static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
        *ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
 }
 
-static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
+/* slow path */
+static void mlx4_en_free_rx_desc(const struct mlx4_en_priv *priv,
                                 struct mlx4_en_rx_ring *ring,
                                 int index)
 {
@@ -267,7 +171,7 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
        frags = ring->rx_info + (index << priv->log_rx_info);
        for (nr = 0; nr < priv->num_frags; nr++) {
                en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
-               mlx4_en_free_frag(priv, frags, nr);
+               mlx4_en_free_frag(priv, frags + nr);
        }
 }
 
@@ -323,12 +227,12 @@ static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
               ring->cons, ring->prod);
 
        /* Unmap and free Rx buffers */
-       while (!mlx4_en_is_ring_empty(ring)) {
-               index = ring->cons & ring->size_mask;
+       for (index = 0; index < ring->size; index++) {
                en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
                mlx4_en_free_rx_desc(priv, ring, index);
-               ++ring->cons;
        }
+       ring->cons = 0;
+       ring->prod = 0;
 }
 
 void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
@@ -380,9 +284,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 
        tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
                                        sizeof(struct mlx4_en_rx_alloc));
-       ring->rx_info = vmalloc_node(tmp, node);
+       ring->rx_info = vzalloc_node(tmp, node);
        if (!ring->rx_info) {
-               ring->rx_info = vmalloc(tmp);
+               ring->rx_info = vzalloc(tmp);
                if (!ring->rx_info) {
                        err = -ENOMEM;
                        goto err_ring;
@@ -452,16 +356,6 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
                /* Initialize all descriptors */
                for (i = 0; i < ring->size; i++)
                        mlx4_en_init_rx_desc(priv, ring, i);
-
-               /* Initialize page allocators */
-               err = mlx4_en_init_allocator(priv, ring);
-               if (err) {
-                       en_err(priv, "Failed initializing ring allocator\n");
-                       if (ring->stride <= TXBB_SIZE)
-                               ring->buf -= TXBB_SIZE;
-                       ring_ind--;
-                       goto err_allocator;
-               }
        }
        err = mlx4_en_fill_rx_buffers(priv);
        if (err)
@@ -481,11 +375,9 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
                mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);
 
        ring_ind = priv->rx_ring_num - 1;
-err_allocator:
        while (ring_ind >= 0) {
                if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
                        priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
-               mlx4_en_destroy_allocator(priv, priv->rx_ring[ring_ind]);
                ring_ind--;
        }
        return err;
@@ -565,50 +457,68 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
        mlx4_en_free_rx_buf(priv, ring);
        if (ring->stride <= TXBB_SIZE)
                ring->buf -= TXBB_SIZE;
-       mlx4_en_destroy_allocator(priv, ring);
 }
 
 
 static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
-                                   struct mlx4_en_rx_desc *rx_desc,
                                    struct mlx4_en_rx_alloc *frags,
                                    struct sk_buff *skb,
                                    int length)
 {
-       struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
-       struct mlx4_en_frag_info *frag_info = priv->frag_info;
+       const struct mlx4_en_frag_info *frag_info = priv->frag_info;
+       unsigned int truesize = 0;
        int nr, frag_size;
+       struct page *page;
        dma_addr_t dma;
+       bool release;
 
        /* Collect used fragments while replacing them in the HW descriptors */
-       for (nr = 0;;) {
+       for (nr = 0;; frags++) {
                frag_size = min_t(int, length, frag_info->frag_size);
 
-               if (unlikely(!frags[nr].page))
+               page = frags->page;
+               if (unlikely(!page))
                        goto fail;
 
-               dma = be64_to_cpu(rx_desc->data[nr].addr);
-               dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
-                                       DMA_FROM_DEVICE);
+               dma = frags->dma;
+               dma_sync_single_range_for_cpu(priv->ddev, dma, frags->page_offset,
+                                             frag_size, priv->dma_dir);
 
-               __skb_fill_page_desc(skb, nr, frags[nr].page,
-                                    frags[nr].page_offset,
+               __skb_fill_page_desc(skb, nr, page, frags->page_offset,
                                     frag_size);
 
-               skb->truesize += frag_info->frag_stride;
-               frags[nr].page = NULL;
+               truesize += frag_info->frag_stride;
+               if (frag_info->frag_stride == PAGE_SIZE / 2) {
+                       frags->page_offset ^= PAGE_SIZE / 2;
+                       release = page_count(page) != 1 ||
+                                 page_is_pfmemalloc(page) ||
+                                 page_to_nid(page) != numa_mem_id();
+               } else {
+                       u32 sz_align = ALIGN(frag_size, SMP_CACHE_BYTES);
+
+                       frags->page_offset += sz_align;
+                       release = frags->page_offset + frag_info->frag_size > PAGE_SIZE;
+               }
+               if (release) {
+                       dma_unmap_page(priv->ddev, dma, PAGE_SIZE, priv->dma_dir);
+                       frags->page = NULL;
+               } else {
+                       page_ref_inc(page);
+               }
+
                nr++;
                length -= frag_size;
                if (!length)
                        break;
                frag_info++;
        }
+       skb->truesize += truesize;
        return nr;
 
 fail:
        while (nr > 0) {
                nr--;
-               __skb_frag_unref(&skb_frags_rx[nr]);
+               __skb_frag_unref(skb_shinfo(skb)->frags + nr);
        }
        return 0;
 }
@@ -639,7 +549,8 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
        if (length <= SMALL_PACKET_SIZE) {
                /* We are copying all relevant data to the skb - temporarily
                 * sync buffers for the copy */
-               dma = be64_to_cpu(rx_desc->data[0].addr);
+
+               dma = frags[0].dma + frags[0].page_offset;
                dma_sync_single_for_cpu(priv->ddev, dma, length,
                                        DMA_FROM_DEVICE);
                skb_copy_to_linear_data(skb, va, length);
@@ -648,7 +559,7 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
                unsigned int pull_len;
 
                /* Move relevant fragments to skb */
-               used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags,
+               used_frags = mlx4_en_complete_rx_desc(priv, frags,
                                                        skb, length);
                if (unlikely(!used_frags)) {
                        kfree_skb(skb);
@@ -916,8 +827,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
                        case XDP_TX:
                                if (likely(!mlx4_en_xmit_frame(ring, frags, dev,
                                                        length, cq->ring,
-                                                       &doorbell_pending)))
-                                       goto consumed;
+                                                       &doorbell_pending))) {
+                                       frags[0].page = NULL;
+                                       goto next;
+                               }
                                trace_xdp_exception(dev, xdp_prog, act);
                                goto xdp_drop_no_cnt; /* Drop on xmit failure */
                        default:
@@ -927,8 +840,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
                        case XDP_DROP:
                                ring->xdp_drop++;
 xdp_drop_no_cnt:
-                               if (likely(mlx4_en_rx_recycle(ring, frags)))
-                                       goto consumed;
                                goto next;
                        }
                }
@@ -974,9 +885,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
                        if (!gro_skb)
                                goto next;
 
-                       nr = mlx4_en_complete_rx_desc(priv,
-                               rx_desc, frags, gro_skb,
-                               length);
+                       nr = mlx4_en_complete_rx_desc(priv, frags, gro_skb,
+                                                     length);
                        if (!nr)
                                goto next;
 
@@ -1084,10 +994,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
                napi_gro_receive(&cq->napi, skb);
 next:
-               for (nr = 0; nr < priv->num_frags; nr++)
-                       mlx4_en_free_frag(priv, frags, nr);
-
-consumed:
                ++cq->mcq.cons_index;
                index = (cq->mcq.cons_index) & ring->size_mask;
                cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 4f7d1503277d2e030854deabeb22b96fa9315c5d..3c93b9614e0674e904d33b83e041f348d4f2ffe9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -327,7 +327,6 @@ struct mlx4_en_rx_desc {
 
 struct mlx4_en_rx_ring {
        struct mlx4_hwq_resources wqres;
-       struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
        u32 size ;      /* number of Rx descs*/
        u32 actual_size;
        u32 size_mask;
-- 
2.11.0.483.g087da7b7c-goog
