Fall back to the old way of updating port information if the kernel driver does not support the RDMA monitor.
Signed-off-by: Minggang Li(Gavin) <gav...@nvidia.com> Acked-by: Viacheslav Ovsiienko <viachesl...@nvidia.com> --- doc/guides/rel_notes/release_24_11.rst | 14 +++++ drivers/common/mlx5/linux/mlx5_nl.c | 73 +++++++++++++++++++++++++ drivers/common/mlx5/version.map | 1 + drivers/net/mlx5/linux/mlx5_ethdev_os.c | 2 +- drivers/net/mlx5/linux/mlx5_os.c | 27 +++++++-- drivers/net/mlx5/mlx5.h | 1 + 6 files changed, 111 insertions(+), 7 deletions(-) diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst index fa4822d928..02827ff392 100644 --- a/doc/guides/rel_notes/release_24_11.rst +++ b/doc/guides/rel_notes/release_24_11.rst @@ -247,6 +247,20 @@ New Features Added ability for node to advertise and update multiple xstat counters, that can be retrieved using ``rte_graph_cluster_stats_get``. +* **Updated NVIDIA mlx5 driver.** + + Optimized port probing at large scale. + In previous releases, it took a long time to probe one VF/SF if + hundreds of VFs/SFs were created in the system. With this newly introduced + optimization, the time to probe a VF/SF is greatly reduced at + large scale, e.g. hundreds of VFs/SFs. This feature is controlled through the + ``probe_opt_en`` device argument. Setting it to a non-zero value + enables this functionality when probing a device. This + feature relies on the RDMA monitor, an RDMA driver feature to be released in the + upcoming upstream kernel 6.13 or the equivalent in OFED 24.10. + For further information on the devargs limitations, see + "doc/guides/nics/mlx5.rst". 
+ Removed Items ------------- diff --git a/drivers/common/mlx5/linux/mlx5_nl.c b/drivers/common/mlx5/linux/mlx5_nl.c index ce1c2a8e75..12f1a620f3 100644 --- a/drivers/common/mlx5/linux/mlx5_nl.c +++ b/drivers/common/mlx5/linux/mlx5_nl.c @@ -2152,3 +2152,76 @@ mlx5_nl_rdma_monitor_info_get(struct nlmsghdr *hdr, struct mlx5_nl_port_info *da error: rte_errno = EINVAL; } +/* Callback for mlx5_nl_recv(): store the monitor mode from a SYS_GET reply in *arg. */ +static int +mlx5_nl_rdma_monitor_cap_get_cb(struct nlmsghdr *hdr, void *arg) +{ + size_t off = NLMSG_HDRLEN; + uint8_t *cap = arg; + + if (hdr->nlmsg_type != RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SYS_GET)) + goto error; /* Any other message type is rejected with -EINVAL. */ + + *cap = 0; /* Stays 0 when the reply carries no monitor mode attribute. */ + while (off < hdr->nlmsg_len) { + struct nlattr *na = (void *)((uintptr_t)hdr + off); + void *payload = (void *)((uintptr_t)na + NLA_HDRLEN); + + if (na->nla_len > hdr->nlmsg_len - off) /* Attribute claims more bytes than remain in the message. */ + goto error; + switch (na->nla_type) { + case RDMA_NLDEV_SYS_ATTR_MONITOR_MODE: + *cap = *(uint8_t *)payload; /* First payload byte is taken as the mode. */ + return 0; + default: + break; + } + off += NLA_ALIGN(na->nla_len); /* NOTE(review): a zero nla_len would leave off unchanged - confirm the kernel never reports one. */ + } + + return 0; + +error: + return -EINVAL; +} + +/** + * Query whether the kernel RDMA driver supports the RDMA monitor. + * + * + * @param nl + * Netlink socket of the RDMA kind (NETLINK_RDMA). + * @param[out] cap + * Monitor mode reported by the kernel, 0 if the attribute is absent. + * @return + * 0 on success, negative on error and rte_errno is set. 
+ */ +int +mlx5_nl_rdma_monitor_cap_get(int nl, uint8_t *cap) +{ + union { + struct nlmsghdr nh; + uint8_t buf[NLMSG_HDRLEN]; + } req = { + .nh = { + .nlmsg_len = NLMSG_LENGTH(0), /* Header only, no payload. */ + .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_SYS_GET), + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, + }, + }; + uint32_t sn = MLX5_NL_SN_GENERATE; + int ret; + + ret = mlx5_nl_send(nl, &req.nh, sn); + if (ret < 0) { + rte_errno = -ret; + return ret; + } + ret = mlx5_nl_recv(nl, sn, mlx5_nl_rdma_monitor_cap_get_cb, cap); /* The callback fills *cap. */ + if (ret < 0) { + rte_errno = -ret; + return ret; + } + return 0; +} diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map index 5230576006..8301485839 100644 --- a/drivers/common/mlx5/version.map +++ b/drivers/common/mlx5/version.map @@ -148,6 +148,7 @@ INTERNAL { mlx5_nl_vlan_vmwa_delete; # WINDOWS_NO_EXPORT mlx5_nl_rdma_monitor_init; # WINDOWS_NO_EXPORT mlx5_nl_rdma_monitor_info_get; # WINDOWS_NO_EXPORT + mlx5_nl_rdma_monitor_cap_get; # WINDOWS_NO_EXPORT mlx5_os_umem_dereg; mlx5_os_umem_reg; diff --git a/drivers/net/mlx5/linux/mlx5_ethdev_os.c b/drivers/net/mlx5/linux/mlx5_ethdev_os.c index 5156d96b3a..6b2c25a7c2 100644 --- a/drivers/net/mlx5/linux/mlx5_ethdev_os.c +++ b/drivers/net/mlx5/linux/mlx5_ethdev_os.c @@ -736,7 +736,7 @@ mlx5_dev_interrupt_nl_cb(struct nlmsghdr *hdr, void *cb_arg) if (mlx5_nl_parse_link_status_update(hdr, &if_index) < 0) return; - if (sh->cdev->config.probe_opt && sh->cdev->dev_info.port_num > 1) + if (sh->cdev->config.probe_opt && sh->cdev->dev_info.port_num > 1 && !sh->rdma_monitor_supp) mlx5_handle_port_info_update(&sh->cdev->dev_info, if_index, hdr->nlmsg_type); for (i = 0; i < sh->max_port; i++) { diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c index 47da00937b..93556dc580 100644 --- a/drivers/net/mlx5/linux/mlx5_os.c +++ b/drivers/net/mlx5/linux/mlx5_os.c @@ -3022,6 +3022,7 @@ mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh) { struct 
ibv_context *ctx = sh->cdev->ctx; int nlsk_fd; + uint8_t rdma_monitor_supp = 0; sh->intr_handle = mlx5_os_interrupt_handler_create (RTE_INTR_INSTANCE_F_SHARED, true, @@ -3030,20 +3031,34 @@ mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh) DRV_LOG(ERR, "Failed to allocate intr_handle."); return; } - if (sh->cdev->config.probe_opt && sh->cdev->dev_info.port_num > 1) { + if (sh->cdev->config.probe_opt && + sh->cdev->dev_info.port_num > 1 && + !sh->rdma_monitor_supp) { nlsk_fd = mlx5_nl_rdma_monitor_init(); if (nlsk_fd < 0) { DRV_LOG(ERR, "Failed to create a socket for RDMA Netlink events: %s", rte_strerror(rte_errno)); return; } - sh->intr_handle_ib = mlx5_os_interrupt_handler_create - (RTE_INTR_INSTANCE_F_SHARED, true, - nlsk_fd, mlx5_dev_interrupt_handler_ib, sh); - if (sh->intr_handle_ib == NULL) { - DRV_LOG(ERR, "Fail to allocate intr_handle"); + if (mlx5_nl_rdma_monitor_cap_get(nlsk_fd, &rdma_monitor_supp)) { + DRV_LOG(ERR, "Failed to query RDMA monitor support: %s", + rte_strerror(rte_errno)); + close(nlsk_fd); return; } + sh->rdma_monitor_supp = rdma_monitor_supp; + if (sh->rdma_monitor_supp) { + sh->intr_handle_ib = mlx5_os_interrupt_handler_create + (RTE_INTR_INSTANCE_F_SHARED, true, + nlsk_fd, mlx5_dev_interrupt_handler_ib, sh); + if (sh->intr_handle_ib == NULL) { + DRV_LOG(ERR, "Fail to allocate intr_handle"); + close(nlsk_fd); + return; + } + } else { + close(nlsk_fd); + } } nlsk_fd = mlx5_nl_init(NETLINK_ROUTE, RTMGRP_LINK); if (nlsk_fd < 0) { diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index fe56bc897a..126b48ac61 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -1517,6 +1517,7 @@ struct mlx5_dev_ctx_shared { uint32_t lag_rx_port_affinity_en:1; /* lag_rx_port_affinity is supported. */ uint32_t hws_max_log_bulk_sz:5; /* Log of minimal HWS counters created hard coded. */ + uint32_t rdma_monitor_supp:1; /* RDMA monitor support reported by the kernel driver. */ uint32_t hws_max_nb_counters; /* Maximal number for HWS counters. 
*/ uint32_t max_port; /* Maximal IB device port index. */ -- 2.34.1