On Mon, Jul 21, 2025 at 03:14:17AM -0700, Dipayaan Roy wrote:
> This patch enhances RX buffer handling in the mana driver by allocating
> pages from a page pool and slicing them into MTU-sized fragments, rather
> than dedicating a full page per packet. This approach is especially
> beneficial on systems with 64KB page sizes.
> 
> Key improvements:
> 
> - Proper integration of page pool for RX buffer allocations.
> - MTU-sized buffer slicing to improve memory utilization.
> - Reduced overall per-Rx-queue memory footprint.
> - Automatic fallback to full-page buffers when:
>    * Jumbo frames are enabled (MTU > PAGE_SIZE / 2).
>    * The XDP path is active, to avoid complexities with fragment reuse.
> - Removal of redundant pre-allocated RX buffers used in scenarios like MTU
>   changes, ensuring consistency in RX buffer allocation.
> 
> Testing on VMs with 64KB pages shows around 200% throughput improvement.
> Memory efficiency is significantly improved due to reduced wastage in page
> allocations. Example: We are now able to fit 35 Rx buffers in a single 64KB
> page for MTU size of 1500, instead of 1 Rx buffer per page previously.
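
For reference, with the new mana_get_rxbuf_cfg() below, the slicing math
for MTU 1500 on a 64KB page works out roughly as (sketch, non-XDP case):

	len        = SKB_DATA_ALIGN(1500 + MANA_RXBUF_PAD + 0 /* headroom */);
	buf_size   = ALIGN(len, MANA_RX_FRAG_ALIGNMENT);  /* 64-byte aligned slice */
	frag_count = PAGE_SIZE / buf_size;                /* 65536 / buf_size -> ~35 */

i.e. each slice ends up a bit under 2KB, which matches the 35 buffers per
page quoted above.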
> 
> Tested:
> 
> - iperf3, iperf2, and nttcp benchmarks.
> - Jumbo frames with MTU 9000.
> - Native XDP programs (XDP_PASS, XDP_DROP, XDP_TX, XDP_REDIRECT) for
>   testing the driver’s XDP path.
> - Page leak detection (kmemleak).
> - Driver load/unload, reboot, and stress scenarios.
> 
> Signed-off-by: Dipayaan Roy <[email protected]>
> ---
>  .../net/ethernet/microsoft/mana/mana_bpf.c    |  22 +-
>  drivers/net/ethernet/microsoft/mana/mana_en.c | 284 ++++++------------
>  .../ethernet/microsoft/mana/mana_ethtool.c    |  13 -
>  include/net/mana/mana.h                       |  13 +-
>  4 files changed, 115 insertions(+), 217 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
> index d30721d4516f..96813b6c184f 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
> @@ -174,6 +174,7 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
>       struct mana_port_context *apc = netdev_priv(ndev);
>       struct bpf_prog *old_prog;
>       struct gdma_context *gc;
> +     int err;
>  
>       gc = apc->ac->gdma_dev->gdma_context;
>  
> @@ -198,14 +199,33 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
>       if (old_prog)
>               bpf_prog_put(old_prog);
>  
> -     if (apc->port_is_up)
> +     if (apc->port_is_up) {
> +             /* Re-create rxq's after xdp prog was loaded or unloaded.
> +              * Ex: re create rxq's to switch from full pages to smaller
> +              * size page fragments when xdp prog is unloaded and vice-versa.
> +              */
> +
> +             err = mana_detach(ndev, false);
> +             if (err) {
> +                     netdev_err(ndev, "mana_detach failed at xdp set: %d\n", err);
> +                     goto out;

You should return err here. At the 'out' label we always return 0, which is wrong.
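
i.e. something like the below (untested sketch reusing the existing locals),
which also covers the mana_attach() failure further down:

		err = mana_detach(ndev, false);
		if (err) {
			netdev_err(ndev, "mana_detach failed at xdp set: %d\n", err);
			return err;
		}

		err = mana_attach(ndev);
		if (err) {
			netdev_err(ndev, "mana_attach failed at xdp set: %d\n", err);
			return err;
		}

Alternatively, keep the gotos, initialize err to 0, and have the out label
do "return err;".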

> +             }
> +
> +             err = mana_attach(ndev);
> +             if (err) {
> +                     netdev_err(ndev, "mana_attach failed at xdp set: %d\n", err);
> +                     goto out;

Same here.

> +             }
> +
>               mana_chn_setxdp(apc, prog);
> +     }
>  
>       if (prog)
>               ndev->max_mtu = MANA_XDP_MTU_MAX;
>       else
>               ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;
>  
> +out:
>       return 0;
>  }
>  
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index a7973651ae51..a474c59c907c 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -548,171 +548,45 @@ static u16 mana_select_queue(struct net_device *ndev, struct sk_buff *skb,
>       return txq;
>  }
>  
> -/* Release pre-allocated RX buffers */
> -void mana_pre_dealloc_rxbufs(struct mana_port_context *mpc)
> -{
> -     struct device *dev;
> -     int i;
> -
> -     dev = mpc->ac->gdma_dev->gdma_context->dev;
> -
> -     if (!mpc->rxbufs_pre)
> -             goto out1;
> -
> -     if (!mpc->das_pre)
> -             goto out2;
> -
> -     while (mpc->rxbpre_total) {
> -             i = --mpc->rxbpre_total;
> -             dma_unmap_single(dev, mpc->das_pre[i], mpc->rxbpre_datasize,
> -                              DMA_FROM_DEVICE);
> -             put_page(virt_to_head_page(mpc->rxbufs_pre[i]));
> -     }
> -
> -     kfree(mpc->das_pre);
> -     mpc->das_pre = NULL;
> -
> -out2:
> -     kfree(mpc->rxbufs_pre);
> -     mpc->rxbufs_pre = NULL;
> -
> -out1:
> -     mpc->rxbpre_datasize = 0;
> -     mpc->rxbpre_alloc_size = 0;
> -     mpc->rxbpre_headroom = 0;
> -}
> -
> -/* Get a buffer from the pre-allocated RX buffers */
> -static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
> -{
> -     struct net_device *ndev = rxq->ndev;
> -     struct mana_port_context *mpc;
> -     void *va;
> -
> -     mpc = netdev_priv(ndev);
> -
> -     if (!mpc->rxbufs_pre || !mpc->das_pre || !mpc->rxbpre_total) {
> -             netdev_err(ndev, "No RX pre-allocated bufs\n");
> -             return NULL;
> -     }
> -
> -     /* Check sizes to catch unexpected coding error */
> -     if (mpc->rxbpre_datasize != rxq->datasize) {
> -             netdev_err(ndev, "rxbpre_datasize mismatch: %u: %u\n",
> -                        mpc->rxbpre_datasize, rxq->datasize);
> -             return NULL;
> -     }
> -
> -     if (mpc->rxbpre_alloc_size != rxq->alloc_size) {
> -             netdev_err(ndev, "rxbpre_alloc_size mismatch: %u: %u\n",
> -                        mpc->rxbpre_alloc_size, rxq->alloc_size);
> -             return NULL;
> -     }
> -
> -     if (mpc->rxbpre_headroom != rxq->headroom) {
> -             netdev_err(ndev, "rxbpre_headroom mismatch: %u: %u\n",
> -                        mpc->rxbpre_headroom, rxq->headroom);
> -             return NULL;
> -     }
> -
> -     mpc->rxbpre_total--;
> -
> -     *da = mpc->das_pre[mpc->rxbpre_total];
> -     va = mpc->rxbufs_pre[mpc->rxbpre_total];
> -     mpc->rxbufs_pre[mpc->rxbpre_total] = NULL;
> -
> -     /* Deallocate the array after all buffers are gone */
> -     if (!mpc->rxbpre_total)
> -             mana_pre_dealloc_rxbufs(mpc);
> -
> -     return va;
> -}
> -
>  /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
> -static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size,
> -                            u32 *headroom)
> +static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
> +                            int mtu, u32 *datasize, u32 *alloc_size,
> +                            u32 *headroom, u32 *frag_count)
>  {
> -     if (mtu > MANA_XDP_MTU_MAX)
> -             *headroom = 0; /* no support for XDP */
> -     else
> -             *headroom = XDP_PACKET_HEADROOM;
> -
> -     *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
> -
> -     /* Using page pool in this case, so alloc_size is PAGE_SIZE */
> -     if (*alloc_size < PAGE_SIZE)
> -             *alloc_size = PAGE_SIZE;
> -
> +     /* Calculate datasize first (consistent across all cases) */
>       *datasize = mtu + ETH_HLEN;
> -}
> -
> -int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_queues)
> -{
> -     struct device *dev;
> -     struct page *page;
> -     dma_addr_t da;
> -     int num_rxb;
> -     void *va;
> -     int i;
> -
> -     mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize,
> -                        &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom);
> -
> -     dev = mpc->ac->gdma_dev->gdma_context->dev;
> -
> -     num_rxb = num_queues * mpc->rx_queue_size;
> -
> -     WARN(mpc->rxbufs_pre, "mana rxbufs_pre exists\n");
> -     mpc->rxbufs_pre = kmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
> -     if (!mpc->rxbufs_pre)
> -             goto error;
>  
> -     mpc->das_pre = kmalloc_array(num_rxb, sizeof(dma_addr_t), GFP_KERNEL);
> -     if (!mpc->das_pre)
> -             goto error;
> -
> -     mpc->rxbpre_total = 0;
> -
> -     for (i = 0; i < num_rxb; i++) {
> -             page = dev_alloc_pages(get_order(mpc->rxbpre_alloc_size));
> -             if (!page)
> -                     goto error;
> -
> -             va = page_to_virt(page);
> -
> -             da = dma_map_single(dev, va + mpc->rxbpre_headroom,
> -                                 mpc->rxbpre_datasize, DMA_FROM_DEVICE);
> -             if (dma_mapping_error(dev, da)) {
> -                     put_page(page);
> -                     goto error;
> +     /* For xdp and jumbo frames make sure only one packet fits per page */
> +     if (((mtu + MANA_RXBUF_PAD) > PAGE_SIZE / 2) || rcu_access_pointer(apc->bpf_prog)) {
> +             if (rcu_access_pointer(apc->bpf_prog)) {
> +                     *headroom = XDP_PACKET_HEADROOM;
> +                     *alloc_size = PAGE_SIZE;
> +             } else {
> +                     *headroom = 0; /* no support for XDP */
> +                     *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
>               }
>  
> -             mpc->rxbufs_pre[i] = va;
> -             mpc->das_pre[i] = da;
> -             mpc->rxbpre_total = i + 1;
> +             *frag_count = 1;
> +             return;
>       }
>  
> -     return 0;
> +     /* Standard MTU case - optimize for multiple packets per page */
> +     *headroom = 0;
>  
> -error:
> -     netdev_err(mpc->ndev, "Failed to pre-allocate RX buffers for %d queues\n", num_queues);
> -     mana_pre_dealloc_rxbufs(mpc);
> -     return -ENOMEM;
> +     /* Calculate base buffer size needed */
> +     u32 len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
> +     u32 buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT);

It's good to have all the declarations at the start of the function.
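
i.e. something like (sketch):

	u32 buf_size;
	u32 len;

at the top of mana_get_rxbuf_cfg(), with the assignments done later:

	len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom);
	buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT);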

> +
> +     /* Calculate how many packets can fit in a page */
> +     *frag_count = PAGE_SIZE / buf_size;
> +     *alloc_size = buf_size;
>  }
>  
>  static int mana_change_mtu(struct net_device *ndev, int new_mtu)
>  {
> -     struct mana_port_context *mpc = netdev_priv(ndev);
>       unsigned int old_mtu = ndev->mtu;
>       int err;
>  
> -     /* Pre-allocate buffers to prevent failure in mana_attach later */
> -     err = mana_pre_alloc_rxbufs(mpc, new_mtu, mpc->num_queues);
> -     if (err) {
> -             netdev_err(ndev, "Insufficient memory for new MTU\n");
> -             return err;
> -     }
> -
>       err = mana_detach(ndev, false);
>       if (err) {
>               netdev_err(ndev, "mana_detach failed: %d\n", err);
> @@ -728,7 +602,6 @@ static int mana_change_mtu(struct net_device *ndev, int new_mtu)
>       }
>  
>  out:
> -     mana_pre_dealloc_rxbufs(mpc);
>       return err;
>  }
>  
> @@ -1841,8 +1714,11 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
>  
>  drop:
>       if (from_pool) {
> -             page_pool_recycle_direct(rxq->page_pool,
> -                                      virt_to_head_page(buf_va));
> +             if (rxq->frag_count == 1)
> +                     page_pool_recycle_direct(rxq->page_pool,
> +                                              virt_to_head_page(buf_va));
> +             else
> +                     page_pool_free_va(rxq->page_pool, buf_va, true);
>       } else {
>               WARN_ON_ONCE(rxq->xdp_save_va);
>               /* Save for reuse */
> @@ -1854,37 +1730,50 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
>       return;
>  }
>  
> -static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
> -                          dma_addr_t *da, bool *from_pool)
> +static void *mana_get_rxfrag(struct mana_rxq *rxq,
> +                          struct device *dev, dma_addr_t *da, bool *from_pool)
>  {
>       struct page *page;
> +     u32 offset;
>       void *va;
> -
>       *from_pool = false;
>  
> -     /* Reuse XDP dropped page if available */
> -     if (rxq->xdp_save_va) {
> -             va = rxq->xdp_save_va;
> -             rxq->xdp_save_va = NULL;
> -     } else {
> -             page = page_pool_dev_alloc_pages(rxq->page_pool);
> -             if (!page)
> +     /* Don't use fragments for jumbo frames or XDP (i.e when fragment = 1 per page) */
> +     if (rxq->frag_count == 1) {
> +             /* Reuse XDP dropped page if available */
> +             if (rxq->xdp_save_va) {
> +                     va = rxq->xdp_save_va;
> +                     rxq->xdp_save_va = NULL;
> +             } else {
> +                     page = page_pool_dev_alloc_pages(rxq->page_pool);
> +                     if (!page)
> +                             return NULL;
> +
> +                     *from_pool = true;
> +                     va = page_to_virt(page);
> +             }
> +
> +             *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
> +                                  DMA_FROM_DEVICE);
> +             if (dma_mapping_error(dev, *da)) {
> +                     if (*from_pool)
> +                             page_pool_put_full_page(rxq->page_pool, page, false);
> +                     else
> +                             put_page(virt_to_head_page(va));
> +
>                       return NULL;
> +             }
>  
> -             *from_pool = true;
> -             va = page_to_virt(page);
> +             return va;
>       }
>  
> -     *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
> -                          DMA_FROM_DEVICE);
> -     if (dma_mapping_error(dev, *da)) {
> -             if (*from_pool)
> -                     page_pool_put_full_page(rxq->page_pool, page, false);
> -             else
> -                     put_page(virt_to_head_page(va));
> -
> +     page =  page_pool_dev_alloc_frag(rxq->page_pool, &offset, rxq->alloc_size);
> +     if (!page)
>               return NULL;
> -     }
> +
> +     va  = page_to_virt(page) + offset;
> +     *da = page_pool_get_dma_addr(page) + offset + rxq->headroom;
> +     *from_pool = true;
>  
>       return va;
>  }
> @@ -1901,9 +1790,9 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq,
>       va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
>       if (!va)
>               return;
> -
> -     dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
> -                      DMA_FROM_DEVICE);
> +     if (!rxoob->from_pool || rxq->frag_count == 1)
> +             dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
> +                              DMA_FROM_DEVICE);
>       *old_buf = rxoob->buf_va;
>       *old_fp = rxoob->from_pool;
>  
> @@ -2314,15 +2203,19 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
>               if (!rx_oob->buf_va)
>                       continue;
>  
> -             dma_unmap_single(dev, rx_oob->sgl[0].address,
> -                              rx_oob->sgl[0].size, DMA_FROM_DEVICE);
> -
>               page = virt_to_head_page(rx_oob->buf_va);
>  
> -             if (rx_oob->from_pool)
> -                     page_pool_put_full_page(rxq->page_pool, page, false);
> -             else
> -                     put_page(page);
> +             if (rxq->frag_count == 1) {
> +                     dma_unmap_single(dev, rx_oob->sgl[0].address, rx_oob->sgl[0].size,
> +                                      DMA_FROM_DEVICE);
> +
> +                     if (rx_oob->from_pool)
> +                             page_pool_put_full_page(rxq->page_pool, page, false);
> +                     else
> +                             put_page(page);
> +             } else {
> +                     page_pool_free_va(rxq->page_pool, rx_oob->buf_va, true);
> +             }
>  
>               rx_oob->buf_va = NULL;
>       }
> @@ -2338,16 +2231,11 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
>  static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
>                           struct mana_rxq *rxq, struct device *dev)
>  {
> -     struct mana_port_context *mpc = netdev_priv(rxq->ndev);
>       bool from_pool = false;
>       dma_addr_t da;
>       void *va;
>  
> -     if (mpc->rxbufs_pre)
> -             va = mana_get_rxbuf_pre(rxq, &da);
> -     else
> -             va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
> -
> +     va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
>       if (!va)
>               return -ENOMEM;
>  
> @@ -2428,11 +2316,22 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc)
>       struct page_pool_params pprm = {};
>       int ret;
>  
> -     pprm.pool_size = mpc->rx_queue_size;
> +     pprm.pool_size = mpc->rx_queue_size / rxq->frag_count + 1;
>       pprm.nid = gc->numa_node;
>       pprm.napi = &rxq->rx_cq.napi;
>       pprm.netdev = rxq->ndev;
>       pprm.order = get_order(rxq->alloc_size);
> +     pprm.queue_idx = rxq->rxq_idx;
> +     pprm.dev = gc->dev;
> +
> +     /* Let the page pool do the dma map when page sharing with multiple fragments
> +      * enabled for rx buffers.
> +      */
> +     if (rxq->frag_count > 1) {
> +             pprm.flags =  PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> +             pprm.max_len = PAGE_SIZE;
> +             pprm.dma_dir = DMA_FROM_DEVICE;
> +     }
>  
>       rxq->page_pool = page_pool_create(&pprm);
>  
> @@ -2471,9 +2370,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
>       rxq->rxq_idx = rxq_idx;
>       rxq->rxobj = INVALID_MANA_HANDLE;
>  
> -     mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size,
> -                        &rxq->headroom);
> -
> +     mana_get_rxbuf_cfg(apc, ndev->mtu, &rxq->datasize, &rxq->alloc_size,
> +                        &rxq->headroom, &rxq->frag_count);
>       /* Create page pool for RX queue */
>       err = mana_create_page_pool(rxq, gc);
>       if (err) {
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> index a1afa75a9463..7ede03c74fb9 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> @@ -396,12 +396,6 @@ static int mana_set_channels(struct net_device *ndev,
>       unsigned int old_count = apc->num_queues;
>       int err;
>  
> -     err = mana_pre_alloc_rxbufs(apc, ndev->mtu, new_count);
> -     if (err) {
> -             netdev_err(ndev, "Insufficient memory for new allocations");
> -             return err;
> -     }
> -
>       err = mana_detach(ndev, false);
>       if (err) {
>               netdev_err(ndev, "mana_detach failed: %d\n", err);
> @@ -416,7 +410,6 @@ static int mana_set_channels(struct net_device *ndev,
>       }
>  
>  out:
> -     mana_pre_dealloc_rxbufs(apc);
>       return err;
>  }
>  
> @@ -465,12 +458,7 @@ static int mana_set_ringparam(struct net_device *ndev,
>  
>       /* pre-allocating new buffers to prevent failures in mana_attach() later */
>       apc->rx_queue_size = new_rx;
> -     err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
>       apc->rx_queue_size = old_rx;
> -     if (err) {
> -             netdev_err(ndev, "Insufficient memory for new allocations\n");
> -             return err;
> -     }
>  
>       err = mana_detach(ndev, false);
>       if (err) {
> @@ -488,7 +476,6 @@ static int mana_set_ringparam(struct net_device *ndev,
>               apc->rx_queue_size = old_rx;
>       }
>  out:
> -     mana_pre_dealloc_rxbufs(apc);
>       return err;
>  }
>  
> diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> index e1030a7d2daa..99a3847b0f9d 100644
> --- a/include/net/mana/mana.h
> +++ b/include/net/mana/mana.h
> @@ -65,6 +65,8 @@ enum TRI_STATE {
>  #define MANA_STATS_RX_COUNT 5
>  #define MANA_STATS_TX_COUNT 11
>  
> +#define MANA_RX_FRAG_ALIGNMENT 64
> +
>  struct mana_stats_rx {
>       u64 packets;
>       u64 bytes;
> @@ -328,6 +330,7 @@ struct mana_rxq {
>       u32 datasize;
>       u32 alloc_size;
>       u32 headroom;
> +     u32 frag_count;
>  
>       mana_handle_t rxobj;
>  
> @@ -503,14 +506,6 @@ struct mana_port_context {
>       /* This points to an array of num_queues of RQ pointers. */
>       struct mana_rxq **rxqs;
>  
> -     /* pre-allocated rx buffer array */
> -     void **rxbufs_pre;
> -     dma_addr_t *das_pre;
> -     int rxbpre_total;
> -     u32 rxbpre_datasize;
> -     u32 rxbpre_alloc_size;
> -     u32 rxbpre_headroom;
> -
>       struct bpf_prog *bpf_prog;
>  
>       /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
> @@ -574,8 +569,6 @@ int mana_query_link_cfg(struct mana_port_context *apc);
>  int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
>                     int enable_clamping);
>  void mana_query_phy_stats(struct mana_port_context *apc);
> -int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
> -void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
>  
>  extern const struct ethtool_ops mana_ethtool_ops;
>  extern struct dentry *mana_debugfs_root;
> -- 
> 2.43.0


The rest looks good. After fixing the above,
Reviewed-by: Saurabh Sengar <[email protected]>
