This patch shows how it is possible to have both the driver-local page
cache, which uses an elevated refcnt for "catching"/avoiding the SKB
put_page, and at the same time have pages returned to the page_pool
from the ndo_xdp_xmit DMA completion path.

Performance is surprisingly good. Tested the DMA-TX completion path on
ixgbe, which calls "xdp_return_frame", which in turn calls
page_pool_put_page(). Stats show DMA-TX completion runs on CPU#9 and
mlx5 RX runs on CPU#5. (Internally page_pool uses a ptr_ring, which is
what gives the good cross-CPU performance.)
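
The cross-CPU return path looks roughly like this (paraphrased from
net/core/xdp.c of this kernel generation, not verbatim; see the tree
for the exact code):

void xdp_return_frame(struct xdp_frame *xdpf)
{
	struct xdp_mem_info *mem = &xdpf->mem;

	if (mem->type == MEM_TYPE_PAGE_POOL) {
		struct xdp_mem_allocator *xa;
		struct page *page = virt_to_head_page(xdpf->data);

		/* mem->id identifies the originating page_pool; the
		 * RCU lookup lets any CPU find the pool safely.
		 */
		rcu_read_lock();
		xa = rhashtable_lookup(mem_id_ht, &mem->id,
				       mem_id_rht_params);
		/* Returns via the pool's ptr_ring, safe from a remote
		 * CPU (here: the ixgbe TX-completion CPU#9).
		 */
		page_pool_put_page(xa->page_pool, page);
		rcu_read_unlock();
	}
}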

Show adapter(s) (ixgbe2 mlx5p2) statistics (ONLY that changed!)
Ethtool(ixgbe2  ) stat:    732863573 (    732,863,573) <= tx_bytes /sec
Ethtool(ixgbe2  ) stat:    781724427 (    781,724,427) <= tx_bytes_nic /sec
Ethtool(ixgbe2  ) stat:     12214393 (     12,214,393) <= tx_packets /sec
Ethtool(ixgbe2  ) stat:     12214435 (     12,214,435) <= tx_pkts_nic /sec
Ethtool(mlx5p2  ) stat:     12211786 (     12,211,786) <= rx3_cache_empty /sec
Ethtool(mlx5p2  ) stat:     36506736 (     36,506,736) <= rx_64_bytes_phy /sec
Ethtool(mlx5p2  ) stat:   2336430575 (  2,336,430,575) <= rx_bytes_phy /sec
Ethtool(mlx5p2  ) stat:     12211786 (     12,211,786) <= rx_cache_empty /sec
Ethtool(mlx5p2  ) stat:     22823073 (     22,823,073) <= rx_discards_phy /sec
Ethtool(mlx5p2  ) stat:      1471860 (      1,471,860) <= rx_out_of_buffer /sec
Ethtool(mlx5p2  ) stat:     36506715 (     36,506,715) <= rx_packets_phy /sec
Ethtool(mlx5p2  ) stat:   2336542282 (  2,336,542,282) <= rx_prio0_bytes /sec
Ethtool(mlx5p2  ) stat:     13683921 (     13,683,921) <= rx_prio0_packets /sec
Ethtool(mlx5p2  ) stat:    821015537 (    821,015,537) <= rx_vport_unicast_bytes /sec
Ethtool(mlx5p2  ) stat:     13683608 (     13,683,608) <= rx_vport_unicast_packets /sec

Before this patch: single-flow performance was 6 Mpps, and if I
started two flows the collective performance dropped to 4 Mpps,
because we hit the page allocator lock (further negative scaling
occurs with more flows).

V2: Adjustments requested by Tariq
 - Changed page_pool_create to only return ERR_PTR on failure, never
   NULL, as this simplifies error handling in drivers (see the sketch
   after this list).
 - Save a branch in mlx5e_page_release
 - Correct the page_pool size calculation for
   MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ
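
With the V2 convention, the caller-side idiom becomes a single
IS_ERR() check (a minimal sketch; pp_params stands for the
page_pool_params being passed, as in the diff below):

	struct page_pool *pool;

	pool = page_pool_create(&pp_params);
	if (IS_ERR(pool))		/* never NULL after V2 */
		return PTR_ERR(pool);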

Signed-off-by: Jesper Dangaard Brouer <bro...@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |    3 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   41 +++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   16 ++++++--
 3 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 28cc26debeda..ab91166f7c5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -53,6 +53,8 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 
+struct page_pool;
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -535,6 +537,7 @@ struct mlx5e_rq {
        /* XDP */
        struct bpf_prog       *xdp_prog;
        struct mlx5e_xdpsq     xdpsq;
+       struct page_pool      *page_pool;
 
        /* control */
        struct mlx5_wq_ctrl    wq_ctrl;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2e4ca0f15b62..bf17e6d614d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -35,6 +35,7 @@
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
 #include <linux/bpf.h>
+#include <net/page_pool.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en_tc.h"
@@ -387,10 +388,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                          struct mlx5e_rq_param *rqp,
                          struct mlx5e_rq *rq)
 {
+       struct page_pool_params pp_params = { 0 };
        struct mlx5_core_dev *mdev = c->mdev;
        void *rqc = rqp->rqc;
        void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-       u32 byte_count;
+       u32 byte_count, pool_size;
        int npages;
        int wq_sz;
        int err;
@@ -429,10 +431,13 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
        rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
        rq->buff.headroom = params->rq_headroom;
+       pool_size = 1 << params->log_rq_size;
 
        switch (rq->wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 
+               pool_size = pool_size * MLX5_MPWRQ_PAGES_PER_WQE;
+
                rq->post_wqes = mlx5e_post_rx_mpwqes;
                rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -506,13 +511,31 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                rq->mkey_be = c->mkey_be;
        }
 
-       /* This must only be activate for order-0 pages */
-       if (rq->xdp_prog) {
-               err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
-                                                MEM_TYPE_PAGE_ORDER0, NULL);
-               if (err)
-                       goto err_rq_wq_destroy;
+       /* Create a page_pool and register it with rxq */
+       pp_params.size      = PAGE_POOL_PARAMS_SIZE;
+       pp_params.order     = rq->buff.page_order;
+       pp_params.dev       = c->pdev;
+       pp_params.nid       = cpu_to_node(c->cpu);
+       pp_params.dma_dir   = rq->buff.map_dir;
+       pp_params.pool_size = pool_size;
+       pp_params.flags     = 0; /* No internal DMA mapping in page_pool */
+
+       /* page_pool can be used even when there is no rq->xdp_prog;
+        * since page_pool does not handle the DMA mapping here, there
+        * is no state that needs clearing. And page_pool gracefully
+        * handles an elevated refcnt.
+        */
+       rq->page_pool = page_pool_create(&pp_params);
+       if (IS_ERR(rq->page_pool)) {
+               kfree(rq->wqe.frag_info);
+               err = PTR_ERR(rq->page_pool);
+               rq->page_pool = NULL;
+               goto err_rq_wq_destroy;
        }
+       err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+                                        MEM_TYPE_PAGE_POOL, rq->page_pool);
+       if (err)
+               goto err_rq_wq_destroy;
 
        for (i = 0; i < wq_sz; i++) {
                struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
@@ -550,6 +573,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
        if (rq->xdp_prog)
                bpf_prog_put(rq->xdp_prog);
        xdp_rxq_info_unreg(&rq->xdp_rxq);
+       if (rq->page_pool)
+               page_pool_destroy_rcu(rq->page_pool);
        mlx5_wq_destroy(&rq->wq_ctrl);
 
        return err;
@@ -563,6 +588,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
                bpf_prog_put(rq->xdp_prog);
 
        xdp_rxq_info_unreg(&rq->xdp_rxq);
+       if (rq->page_pool)
+               page_pool_destroy_rcu(rq->page_pool);
 
        switch (rq->wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 6dcc3e8fbd3e..2ac78b88fc3d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -37,6 +37,7 @@
 #include <linux/bpf_trace.h>
 #include <net/busy_poll.h>
 #include <net/ip6_checksum.h>
+#include <net/page_pool.h>
 #include "en.h"
 #include "en_tc.h"
 #include "eswitch.h"
@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
        if (mlx5e_rx_cache_get(rq, dma_info))
                return 0;
 
-       dma_info->page = dev_alloc_pages(rq->buff.page_order);
+       dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
        if (unlikely(!dma_info->page))
                return -ENOMEM;
 
@@ -246,11 +247,16 @@ static inline void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
                        bool recycle)
 {
-       if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
-               return;
+       if (likely(recycle)) {
+               if (mlx5e_rx_cache_put(rq, dma_info))
+                       return;
 
-       mlx5e_page_dma_unmap(rq, dma_info);
-       put_page(dma_info->page);
+               mlx5e_page_dma_unmap(rq, dma_info);
+               page_pool_recycle_direct(rq->page_pool, dma_info->page);
+       } else {
+               mlx5e_page_dma_unmap(rq, dma_info);
+               put_page(dma_info->page);
+       }
 }
 
 static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
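
For reference, the return-side page_pool API used above (paraphrased
from include/net/page_pool.h of this series; check the tree for the
exact definitions):

void __page_pool_put_page(struct page_pool *pool,
			  struct page *page, bool allow_direct);

/* Safe from any CPU/context: returns the page via the ptr_ring. */
static inline void page_pool_put_page(struct page_pool *pool,
				      struct page *page)
{
	__page_pool_put_page(pool, page, false);
}

/* Only safe from the NAPI (softirq) context that owns the pool,
 * which is why only the en_rx.c RX recycle path uses it.
 */
static inline void page_pool_recycle_direct(struct page_pool *pool,
					    struct page *page)
{
	__page_pool_put_page(pool, page, true);
}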
