Acked-by: Viacheslav Ovsiienko <[email protected]>
> -----Original Message-----
> From: Vincent Jardin <[email protected]>
> Sent: Sunday, March 22, 2026 3:46 PM
> To: [email protected]
> Cc: Raslan Darawsheh <[email protected]>; NBU-Contact-Thomas Monjalon
> (EXTERNAL) <[email protected]>; [email protected];
> Dariusz Sosnowski <[email protected]>; Slava Ovsiienko
> <[email protected]>; Bing Zhao <[email protected]>; Ori Kam
> <[email protected]>; Suanming Mou <[email protected]>; Matan Azrad
> <[email protected]>; [email protected];
> [email protected]; Vincent Jardin <[email protected]>
> Subject: [PATCH v4 04/10] net/mlx5: add per-queue packet pacing
> infrastructure
>
> Add mlx5_txq_rate_limit structure and alloc/free helpers for per-queue data-
> rate packet pacing. Each Tx queue can now hold its own PP (Packet Pacing)
> context allocated via mlx5dv_pp_alloc() with MLX5_DATA_RATE mode.
>
> mlx5_txq_alloc_pp_rate_limit() converts Mbps to kbps for the PRM rate_limit
> field and allocates a PP context from the HW rate table.
> mlx5_txq_free_pp_rate_limit() releases it.
>
> PP allocation uses shared mode (flags=0). Each dv_alloc_pp() call returns a
> distinct PP handle (needed for per-queue dv_free_pp() cleanup), but the kernel
> mlx5 driver internally maps identical rate parameters to the same HW rate
> table
> entry (same pp_id) with internal refcounting. This avoids exhausting the rate
> table (typically 128 entries on ConnectX-6 Dx) when many queues share the
> same rate.
>
> The existing Clock Queue path (sh->txpp.pp / sh->txpp.pp_id) is untouched — it
> uses MLX5_WQE_RATE for per-packet scheduling with a dedicated index, while
> per-queue rate limiting uses MLX5_DATA_RATE.
>
> PP index cleanup is added to mlx5_txq_release() to prevent leaks when queues
> are destroyed.
>
> Supported hardware:
> - ConnectX-6 Dx: per-SQ rate via packet_pacing_rate_limit_index
> - ConnectX-7/8: same mechanism, plus wait-on-time coexistence
> - BlueField-2/3: same PP allocation support
>
> Not supported:
> - ConnectX-5: packet_pacing exists but MLX5_DATA_RATE mode may
> not be available on all firmware versions
> - ConnectX-4 Lx and earlier: no packet_pacing capability
>
> Signed-off-by: Vincent Jardin <[email protected]>
> ---
> drivers/net/mlx5/mlx5.h | 11 +++++
> drivers/net/mlx5/mlx5_tx.h | 1 +
>  drivers/net/mlx5/mlx5_txpp.c | 78 ++++++++++++++++++++++++++++++++++++
> drivers/net/mlx5/mlx5_txq.c | 1 +
> 4 files changed, 91 insertions(+)
>
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 4da184eb47..33628d7987 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -1297,6 +1297,13 @@ struct mlx5_txpp_ts {
> RTE_ATOMIC(uint64_t) ts;
> };
>
> +/* Per-queue rate limit tracking. */
> +struct mlx5_txq_rate_limit {
> + void *pp; /* Packet pacing context from dv_alloc_pp. */
> + uint16_t pp_id; /* Packet pacing index. */
> + uint32_t rate_mbps; /* Current rate in Mbps, 0 = disabled. */
> +};
> +
> /* Tx packet pacing structure. */
> struct mlx5_dev_txpp {
> 	pthread_mutex_t mutex; /* Pacing create/destroy mutex. */
> @@ -2630,6 +2637,10 @@ int mlx5_txpp_xstats_get_names(struct rte_eth_dev *dev,
>  void mlx5_txpp_interrupt_handler(void *cb_arg);
>  int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev);
>  void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev);
> +int mlx5_txq_alloc_pp_rate_limit(struct mlx5_dev_ctx_shared *sh,
> + struct mlx5_txq_rate_limit *rate_limit,
> + uint32_t rate_mbps);
> +void mlx5_txq_free_pp_rate_limit(struct mlx5_txq_rate_limit *rate_limit);
>
> /* mlx5_rxtx.c */
>
> diff --git a/drivers/net/mlx5/mlx5_tx.h b/drivers/net/mlx5/mlx5_tx.h
> index 0134a2e003..51f330454a 100644
> --- a/drivers/net/mlx5/mlx5_tx.h
> +++ b/drivers/net/mlx5/mlx5_tx.h
> @@ -192,6 +192,7 @@ struct mlx5_txq_ctrl {
> uint16_t dump_file_n; /* Number of dump files. */
> struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
> uint32_t hairpin_status; /* Hairpin binding status. */
> + struct mlx5_txq_rate_limit rate_limit; /* Per-queue rate limit. */
> struct mlx5_txq_data txq; /* Data path structure. */
> 	/* Must be the last field in the structure, contains elts[]. */
>  };
> diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
> index 0e99b58bde..e34e996e9b 100644
> --- a/drivers/net/mlx5/mlx5_txpp.c
> +++ b/drivers/net/mlx5/mlx5_txpp.c
> @@ -128,6 +128,84 @@ mlx5_txpp_alloc_pp_index(struct mlx5_dev_ctx_shared *sh)
>  #endif
>  }
>
> +/* Free a per-queue packet pacing index. */
> +void
> +mlx5_txq_free_pp_rate_limit(struct mlx5_txq_rate_limit *rate_limit)
> +{
> +#ifdef HAVE_MLX5DV_PP_ALLOC
> + if (rate_limit->pp) {
> + mlx5_glue->dv_free_pp(rate_limit->pp);
> + rate_limit->pp = NULL;
> + rate_limit->pp_id = 0;
> + rate_limit->rate_mbps = 0;
> + }
> +#else
> + RTE_SET_USED(rate_limit);
> +#endif
> +}
> +
> +/* Allocate a per-queue packet pacing index for data-rate limiting. */
> +int mlx5_txq_alloc_pp_rate_limit(struct mlx5_dev_ctx_shared *sh,
> + struct mlx5_txq_rate_limit *rate_limit,
> + uint32_t rate_mbps)
> +{
> +#ifdef HAVE_MLX5DV_PP_ALLOC
> + uint32_t pp[MLX5_ST_SZ_DW(set_pp_rate_limit_context)];
> + uint64_t rate_kbps;
> + struct mlx5_hca_qos_attr *qos = &sh->cdev->config.hca_attr.qos;
> +
> + if (rate_mbps == 0) {
> + DRV_LOG(ERR, "Rate must be greater than zero.");
> + rte_errno = EINVAL;
> + return -EINVAL;
> + }
> + rate_kbps = (uint64_t)rate_mbps * 1000;
> +	if (qos->packet_pacing_min_rate &&
> +	    rate_kbps < qos->packet_pacing_min_rate) {
> +		DRV_LOG(ERR, "Rate %u Mbps below HW minimum (%u kbps).",
> +			rate_mbps, qos->packet_pacing_min_rate);
> +		rte_errno = ERANGE;
> +		return -ERANGE;
> +	}
> +	if (qos->packet_pacing_max_rate &&
> +	    rate_kbps > qos->packet_pacing_max_rate) {
> +		DRV_LOG(ERR, "Rate %u Mbps exceeds HW maximum (%u kbps).",
> +			rate_mbps, qos->packet_pacing_max_rate);
> +		rte_errno = ERANGE;
> +		return -ERANGE;
> +	}
> + memset(&pp, 0, sizeof(pp));
> +	MLX5_SET(set_pp_rate_limit_context, &pp, rate_limit,
> +		 (uint32_t)rate_kbps);
> +	MLX5_SET(set_pp_rate_limit_context, &pp, rate_mode, MLX5_DATA_RATE);
> +	rate_limit->pp = mlx5_glue->dv_alloc_pp(sh->cdev->ctx, sizeof(pp),
> +						&pp, 0);
> +	if (rate_limit->pp == NULL) {
> +		DRV_LOG(ERR, "Failed to allocate PP index for rate %u Mbps.",
> +			rate_mbps);
> +		rte_errno = errno;
> +		return -errno;
> +	}
> + rate_limit->pp_id = ((struct mlx5dv_pp *)rate_limit->pp)->index;
> + if (!rate_limit->pp_id) {
> + DRV_LOG(ERR, "Zero PP index allocated for rate %u Mbps.",
> + rate_mbps);
> + mlx5_txq_free_pp_rate_limit(rate_limit);
> + rte_errno = ENOTSUP;
> + return -ENOTSUP;
> + }
> + rate_limit->rate_mbps = rate_mbps;
> + DRV_LOG(DEBUG, "Allocated PP index %u for rate %u Mbps.",
> + rate_limit->pp_id, rate_mbps);
> + return 0;
> +#else
> + RTE_SET_USED(sh);
> + RTE_SET_USED(rate_limit);
> + RTE_SET_USED(rate_mbps);
> + DRV_LOG(ERR, "Per-queue rate limit requires rdma-core PP support.");
> + rte_errno = ENOTSUP;
> + return -ENOTSUP;
> +#endif
> +}
> +
> static void
>  mlx5_txpp_destroy_send_queue(struct mlx5_txpp_wq *wq)
>  {
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index 9275efb58e..3356c89758 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -1344,6 +1344,7 @@ mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
> mlx5_free(txq_ctrl->obj);
> txq_ctrl->obj = NULL;
> }
> + mlx5_txq_free_pp_rate_limit(&txq_ctrl->rate_limit);
> if (!txq_ctrl->is_hairpin) {
> if (txq_ctrl->txq.fcqs) {
> mlx5_free(txq_ctrl->txq.fcqs);
> --
> 2.43.0