Hi,

The code LGTM.
I am thinking that if the user wants to use HWS with up to ~1K SFs, we may
still hit this limitation, for example if the application also creates some
rings per device (rough numbers below the quoted patch). Maybe when ring
allocation fails we can add a hint in the log telling the user to raise the
maximum number of memzones via the EAL API (see the sketch just below). WDYT?
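
Just to illustrate what I mean (untested sketch; assuming the
rte_memzone_max_set()/rte_memzone_max_get() EAL API is available in the DPDK
version used, and the 8192 limit below is only an example value), the
application-side knob would be something like this, called before
rte_eal_init():

    #include <stdio.h>

    #include <rte_eal.h>
    #include <rte_memzone.h>

    int
    main(int argc, char **argv)
    {
            /* Raise the memzone limit before EAL init; 8192 is only an
             * example value large enough for ~1K SF ports plus headroom.
             */
            if (rte_memzone_max_set(8192) != 0)
                    printf("cannot raise memzone limit, staying at %zu\n",
                           rte_memzone_max_get());
            if (rte_eal_init(argc, argv) < 0)
                    return -1;
            /* ... probe the SF ports and configure flows as usual ... */
            rte_eal_cleanup();
            return 0;
    }

So the hint in the driver log could simply point the user at that API.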

> -----Original Message-----
> From: Maayan Kashani <[email protected]>
> Sent: Monday, January 12, 2026 5:25 PM
> To: [email protected]
> Cc: Maayan Kashani <[email protected]>; Raslan Darawsheh
> <[email protected]>; Dariusz Sosnowski <[email protected]>;
> [email protected]; Slava Ovsiienko <[email protected]>; Bing Zhao
> <[email protected]>; Ori Kam <[email protected]>; Suanming Mou
> <[email protected]>; Matan Azrad <[email protected]>
> Subject: [PATCH 2/4] net/mlx5: fix default memzone requirements in HWS
> 
> From: Dariusz Sosnowski <[email protected]>
> 
> Commit [1] changed the default flow engine selection in the mlx5 PMD to
> accommodate new NIC generations.
> Whenever the underlying device does not support SWS (e.g., ConnectX-9 or
> untrusted VFs/SFs) but does support HWS, the default flow engine is
> HWS (dv_flow_en=2), which also supports the sync flow API.
> 
> This behavior change had consequences for memory usage whenever SFs are
> probed by DPDK. In the default HWS configuration supporting the sync flow
> API (i.e., without calling rte_flow_configure()), the mlx5 PMD allocated
> 4 rte_ring objects per port:
> 
> - indir_iq and indir_cq - For handling indirect action completions.
> - flow_transfer_pending and flow_transfer_completed - For handling
>   template table resizing.
> 
> This did not happen previously with SWS as the default flow engine.
> 
> Since a dedicated memzone is allocated for each rte_ring object, this led
> to exhaustion of the default memzone limit on setups with ~1K SFs to probe.
> It resulted in the following error on port start:
> 
>     EAL: memzone_reserve_aligned_thread_unsafe():
>         Number of requested memzone segments exceeds maximum 2560
>     RING: Cannot reserve memory
>     mlx5_net: Failed to start port 998 mlx5_core.sf.998:
>         fail to configure port
> 
> Since template table resizing is allowed if and only if the async flow API
> was configured, 2 of the aforementioned rings are never used in the default
> sync flow API configuration.
> 
> This patch removes the allocation of the flow_transfer_pending and
> flow_transfer_completed rings in the default sync flow API configuration
> of the mlx5 PMD to reduce memzone usage and allow DPDK probing to succeed
> on setups with ~1K SFs.
> 
> [1] commit d1ac7b6c64d9
>     ("net/mlx5: update flow devargs handling for future HW")
> 
> Fixes: 27d171b88031 ("net/mlx5: abstract flow action and enable reconfigure")
> Cc: [email protected]
> 
> Signed-off-by: Dariusz Sosnowski <[email protected]>
> ---
>  drivers/net/mlx5/mlx5_flow_hw.c | 86 ++++++++++++++++++++++++++-------
>  1 file changed, 68 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_hw.c b/drivers/net/mlx5/mlx5_flow_hw.c
> index 98483abc7fc..1dada2e7cef 100644
> --- a/drivers/net/mlx5/mlx5_flow_hw.c
> +++ b/drivers/net/mlx5/mlx5_flow_hw.c
> @@ -4483,6 +4483,9 @@ mlx5_hw_pull_flow_transfer_comp(struct rte_eth_dev *dev,
>       struct mlx5_priv *priv = dev->data->dev_private;
>       struct rte_ring *ring = priv->hw_q[queue].flow_transfer_completed;
> 
> +     if (ring == NULL)
> +             return 0;
> +
>       size = RTE_MIN(rte_ring_count(ring), n_res);
>       for (i = 0; i < size; i++) {
>               res[i].status = RTE_FLOW_OP_SUCCESS;
> @@ -4714,8 +4717,9 @@ __flow_hw_push_action(struct rte_eth_dev *dev,
>       struct mlx5_hw_q *hw_q = &priv->hw_q[queue];
> 
>       mlx5_hw_push_queue(hw_q->indir_iq, hw_q->indir_cq);
> -     mlx5_hw_push_queue(hw_q->flow_transfer_pending,
> -                        hw_q->flow_transfer_completed);
> +     if (hw_q->flow_transfer_pending != NULL && hw_q->flow_transfer_completed != NULL)
> +             mlx5_hw_push_queue(hw_q->flow_transfer_pending,
> +                                hw_q->flow_transfer_completed);
>       if (!priv->shared_host) {
>               if (priv->hws_ctpool)
>                       mlx5_aso_push_wqe(priv->sh,
> @@ -11889,6 +11893,60 @@ mlx5_hwq_ring_create(uint16_t port_id, uint32_t queue, uint32_t size, const char
>                              RING_F_SP_ENQ | RING_F_SC_DEQ | RING_F_EXACT_SZ);
>  }
> 
> +static int
> +flow_hw_queue_setup_rings(struct rte_eth_dev *dev,
> +                       uint16_t queue,
> +                       uint32_t queue_size,
> +                       bool nt_mode)
> +{
> +     struct mlx5_priv *priv = dev->data->dev_private;
> +
> +     /* HWS queue info container must be already allocated. */
> +     MLX5_ASSERT(priv->hw_q != NULL);
> +
> +     /* Notice ring name length is limited. */
> +     priv->hw_q[queue].indir_cq = mlx5_hwq_ring_create
> +             (dev->data->port_id, queue, queue_size, "indir_act_cq");
> +     if (!priv->hw_q[queue].indir_cq) {
> +             DRV_LOG(ERR, "port %u failed to allocate indir_act_cq ring for
> HWS",
> +                     dev->data->port_id);
> +             return -ENOMEM;
> +     }
> +
> +     priv->hw_q[queue].indir_iq = mlx5_hwq_ring_create
> +             (dev->data->port_id, queue, queue_size, "indir_act_iq");
> +     if (!priv->hw_q[queue].indir_iq) {
> +             DRV_LOG(ERR, "port %u failed to allocate indir_act_iq ring for
> HWS",
> +                     dev->data->port_id);
> +             return -ENOMEM;
> +     }
> +
> +     /*
> +      * Sync flow API does not require rings used for table resize handling,
> +      * because these rings are only used through async flow APIs.
> +      */
> +     if (nt_mode)
> +             return 0;
> +
> +     priv->hw_q[queue].flow_transfer_pending = mlx5_hwq_ring_create
> +             (dev->data->port_id, queue, queue_size, "tx_pending");
> +     if (!priv->hw_q[queue].flow_transfer_pending) {
> +             DRV_LOG(ERR, "port %u failed to allocate tx_pending ring for
> HWS",
> +                     dev->data->port_id);
> +             return -ENOMEM;
> +     }
> +
> +     priv->hw_q[queue].flow_transfer_completed = mlx5_hwq_ring_create
> +             (dev->data->port_id, queue, queue_size, "tx_done");
> +     if (!priv->hw_q[queue].flow_transfer_completed) {
> +             DRV_LOG(ERR, "port %u failed to allocate tx_done ring for
> HWS",
> +                     dev->data->port_id);
> +             return -ENOMEM;
> +     }
> +
> +     return 0;
> +}
> +
>  static int
>  flow_hw_validate_attributes(const struct rte_flow_port_attr *port_attr,
>                           uint16_t nb_queue,
> @@ -12057,22 +12115,8 @@ __flow_hw_configure(struct rte_eth_dev *dev,
>                     &priv->hw_q[i].job[_queue_attr[i]->size];
>               for (j = 0; j < _queue_attr[i]->size; j++)
>                       priv->hw_q[i].job[j] = &job[j];
> -             /* Notice ring name length is limited. */
> -             priv->hw_q[i].indir_cq = mlx5_hwq_ring_create
> -                     (dev->data->port_id, i, _queue_attr[i]->size, "indir_act_cq");
> -             if (!priv->hw_q[i].indir_cq)
> -                     goto err;
> -             priv->hw_q[i].indir_iq = mlx5_hwq_ring_create
> -                     (dev->data->port_id, i, _queue_attr[i]->size, "indir_act_iq");
> -             if (!priv->hw_q[i].indir_iq)
> -                     goto err;
> -             priv->hw_q[i].flow_transfer_pending = mlx5_hwq_ring_create
> -                     (dev->data->port_id, i, _queue_attr[i]->size, "tx_pending");
> -             if (!priv->hw_q[i].flow_transfer_pending)
> -                     goto err;
> -             priv->hw_q[i].flow_transfer_completed = mlx5_hwq_ring_create
> -                     (dev->data->port_id, i, _queue_attr[i]->size, "tx_done");
> -             if (!priv->hw_q[i].flow_transfer_completed)
> +
> +             if (flow_hw_queue_setup_rings(dev, i, _queue_attr[i]->size, nt_mode) < 0)
>                       goto err;
>       }
>       dr_ctx_attr.pd = priv->sh->cdev->pd;
> @@ -15440,6 +15484,12 @@ flow_hw_update_resized(struct rte_eth_dev *dev, uint32_t queue,
>       };
> 
>       MLX5_ASSERT(hw_flow->flags & MLX5_FLOW_HW_FLOW_FLAG_MATCHER_SELECTOR);
> +     /*
> +      * Update resized can be called only through async flow API.
> +      * These rings are allocated if and only if async flow API was configured.
> +      */
> +     MLX5_ASSERT(priv->hw_q[queue].flow_transfer_completed != NULL);
> +     MLX5_ASSERT(priv->hw_q[queue].flow_transfer_pending != NULL);
>       /**
>        * mlx5dr_matcher_resize_rule_move() accepts original table matcher -
>        * the one that was used BEFORE table resize.
> --
> 2.21.0
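
For reference, the rough numbers as I read the commit message: before this
patch it is 4 ring memzones per port, so ~1K SFs alone need on the order of
4000 memzones, well above the default limit of 2560 shown in the EAL error;
after this patch it is 2 ring memzones per port, i.e. ~2000 for ~1K SFs, which
fits under 2560 but does not leave much headroom for whatever other memzones
the ports or the application allocate.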

Acked-by: Bing Zhao <[email protected]>
