Fall back to the old way of updating port information if the kernel driver
does not support the RDMA monitor.

Signed-off-by: Minggang Li(Gavin) <gav...@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viachesl...@nvidia.com>
---
 doc/guides/rel_notes/release_24_11.rst  | 14 +++++
 drivers/common/mlx5/linux/mlx5_nl.c     | 73 +++++++++++++++++++++++++
 drivers/common/mlx5/version.map         |  1 +
 drivers/net/mlx5/linux/mlx5_ethdev_os.c |  2 +-
 drivers/net/mlx5/linux/mlx5_os.c        | 27 +++++++--
 drivers/net/mlx5/mlx5.h                 |  1 +
 6 files changed, 111 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst 
b/doc/guides/rel_notes/release_24_11.rst
index fa4822d928..02827ff392 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -247,6 +247,20 @@ New Features
   Added ability for node to advertise and update multiple xstat counters,
   that can be retrieved using ``rte_graph_cluster_stats_get``.
 
+* **Updated NVIDIA mlx5 driver.**
+
+  Optimized port probing at large scale.
+  In previous releases, it took a long time to probe one VF/SF if
+  hundreds of VFs/SFs were created in the system. With this newly introduced
+  optimization, the time to probe a VF/SF is greatly reduced at large
+  scale, e.g. with hundreds of VFs/SFs. This feature is controlled through the
+  ``probe_opt_en`` device argument. Setting it to a non-zero value
+  enables this optimization when probing a device. This
+  feature relies on an RDMA driver feature, the RDMA monitor, to be released
+  in the upcoming upstream kernel 6.13 or the equivalent in OFED 24.10.
+  For further information on the devargs limitation, see
+  "doc/guides/nics/mlx5.rst".
+
 
 Removed Items
 -------------
diff --git a/drivers/common/mlx5/linux/mlx5_nl.c 
b/drivers/common/mlx5/linux/mlx5_nl.c
index ce1c2a8e75..12f1a620f3 100644
--- a/drivers/common/mlx5/linux/mlx5_nl.c
+++ b/drivers/common/mlx5/linux/mlx5_nl.c
@@ -2152,3 +2152,76 @@ mlx5_nl_rdma_monitor_info_get(struct nlmsghdr *hdr, 
struct mlx5_nl_port_info *da
 error:
        rte_errno = EINVAL;
 }
+/* mlx5_nl_recv() callback: store the RDMA monitor mode from a SYS_GET reply in *arg. */
+static int
+mlx5_nl_rdma_monitor_cap_get_cb(struct nlmsghdr *hdr, void *arg)
+{
+       size_t off = NLMSG_HDRLEN; /* attributes are walked starting right after the header */
+       uint8_t *cap = arg; /* output byte supplied by the caller */
+
+       if (hdr->nlmsg_type != RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, 
RDMA_NLDEV_CMD_SYS_GET))
+               goto error; /* not a SYS_GET reply; *cap is left untouched */
+
+       *cap = 0; /* value reported when the attribute is absent */
+       while (off < hdr->nlmsg_len) {
+               struct nlattr *na = (void *)((uintptr_t)hdr + off);
+               void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
+
+               if (na->nla_len > hdr->nlmsg_len - off)
+                       goto error; /* attribute claims more bytes than remain in the message */
+               switch (na->nla_type) {
+               case RDMA_NLDEV_SYS_ATTR_MONITOR_MODE:
+                       *cap = *(uint8_t *)payload; /* only the first payload byte is read */
+                       return 0;
+               default:
+                       break; /* other attributes are skipped */
+               }
+               off += NLA_ALIGN(na->nla_len); /* NOTE(review): nla_len == 0 would not advance off - confirm it cannot occur */
+       }
+
+       return 0;
+
+error:
+       return -EINVAL;
+}
+
+/**
+ * Get RDMA monitor support in driver.
+ * Sends an RDMA_NLDEV_CMD_SYS_GET request and parses the reply with a callback.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[out] cap
+ *   Monitor mode byte taken from the reply, 0 if the reply lacks that attribute.
+ * @return
+ *   0 on success, negative on error and rte_errno is set.
+ */
+int
+mlx5_nl_rdma_monitor_cap_get(int nl, uint8_t *cap)
+{
+       union {
+               struct nlmsghdr nh;
+               uint8_t buf[NLMSG_HDRLEN];
+       } req = {
+               .nh = {
+                       .nlmsg_len = NLMSG_LENGTH(0), /* header only, no payload */
+                       .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                                      RDMA_NLDEV_CMD_SYS_GET),
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+               },
+       };
+       uint32_t sn = MLX5_NL_SN_GENERATE; /* same value is passed to send and recv */
+       int ret;
+
+       ret = mlx5_nl_send(nl, &req.nh, sn);
+       if (ret < 0) {
+               rte_errno = -ret; /* ret is negative here, so rte_errno becomes positive */
+               return ret;
+       }
+       ret = mlx5_nl_recv(nl, sn, mlx5_nl_rdma_monitor_cap_get_cb, cap);
+       if (ret < 0) {
+               rte_errno = -ret; /* also covers -EINVAL returned by the callback, if propagated */
+               return ret;
+       }
+       return 0;
+}
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index 5230576006..8301485839 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -148,6 +148,7 @@ INTERNAL {
        mlx5_nl_vlan_vmwa_delete; # WINDOWS_NO_EXPORT
        mlx5_nl_rdma_monitor_init; # WINDOWS_NO_EXPORT
        mlx5_nl_rdma_monitor_info_get; # WINDOWS_NO_EXPORT
+       mlx5_nl_rdma_monitor_cap_get; # WINDOWS_NO_EXPORT
 
        mlx5_os_umem_dereg;
        mlx5_os_umem_reg;
diff --git a/drivers/net/mlx5/linux/mlx5_ethdev_os.c 
b/drivers/net/mlx5/linux/mlx5_ethdev_os.c
index 5156d96b3a..6b2c25a7c2 100644
--- a/drivers/net/mlx5/linux/mlx5_ethdev_os.c
+++ b/drivers/net/mlx5/linux/mlx5_ethdev_os.c
@@ -736,7 +736,7 @@ mlx5_dev_interrupt_nl_cb(struct nlmsghdr *hdr, void *cb_arg)
 
        if (mlx5_nl_parse_link_status_update(hdr, &if_index) < 0)
                return;
-       if (sh->cdev->config.probe_opt && sh->cdev->dev_info.port_num > 1)
+       if (sh->cdev->config.probe_opt && sh->cdev->dev_info.port_num > 1 && 
!sh->rdma_monitor_supp)
                mlx5_handle_port_info_update(&sh->cdev->dev_info, if_index, 
hdr->nlmsg_type);
 
        for (i = 0; i < sh->max_port; i++) {
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 47da00937b..93556dc580 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -3022,6 +3022,7 @@ mlx5_os_dev_shared_handler_install(struct 
mlx5_dev_ctx_shared *sh)
 {
        struct ibv_context *ctx = sh->cdev->ctx;
        int nlsk_fd;
+       uint8_t rdma_monitor_supp = 0;
 
        sh->intr_handle = mlx5_os_interrupt_handler_create
                (RTE_INTR_INSTANCE_F_SHARED, true,
@@ -3030,20 +3031,34 @@ mlx5_os_dev_shared_handler_install(struct 
mlx5_dev_ctx_shared *sh)
                DRV_LOG(ERR, "Failed to allocate intr_handle.");
                return;
        }
-       if (sh->cdev->config.probe_opt && sh->cdev->dev_info.port_num > 1) {
+       if (sh->cdev->config.probe_opt &&
+           sh->cdev->dev_info.port_num > 1 &&
+           !sh->rdma_monitor_supp) {
                nlsk_fd = mlx5_nl_rdma_monitor_init();
                if (nlsk_fd < 0) {
                        DRV_LOG(ERR, "Failed to create a socket for RDMA 
Netlink events: %s",
                                rte_strerror(rte_errno));
                        return;
                }
-               sh->intr_handle_ib = mlx5_os_interrupt_handler_create
-                       (RTE_INTR_INSTANCE_F_SHARED, true,
-                        nlsk_fd, mlx5_dev_interrupt_handler_ib, sh);
-               if (sh->intr_handle_ib == NULL) {
-                       DRV_LOG(ERR, "Fail to allocate intr_handle");
+               if (mlx5_nl_rdma_monitor_cap_get(nlsk_fd, &rdma_monitor_supp)) {
+                       DRV_LOG(ERR, "Failed to query RDMA monitor support: %s",
+                               rte_strerror(rte_errno));
+                       close(nlsk_fd);
                        return;
                }
+               sh->rdma_monitor_supp = rdma_monitor_supp;
+               if (sh->rdma_monitor_supp) {
+                       sh->intr_handle_ib = mlx5_os_interrupt_handler_create
+                               (RTE_INTR_INSTANCE_F_SHARED, true,
+                                nlsk_fd, mlx5_dev_interrupt_handler_ib, sh);
+                       if (sh->intr_handle_ib == NULL) {
+                               DRV_LOG(ERR, "Fail to allocate intr_handle");
+                               close(nlsk_fd);
+                               return;
+                       }
+               } else {
+                       close(nlsk_fd);
+               }
        }
        nlsk_fd = mlx5_nl_init(NETLINK_ROUTE, RTMGRP_LINK);
        if (nlsk_fd < 0) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index fe56bc897a..126b48ac61 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1517,6 +1517,7 @@ struct mlx5_dev_ctx_shared {
        uint32_t lag_rx_port_affinity_en:1;
        /* lag_rx_port_affinity is supported. */
        uint32_t hws_max_log_bulk_sz:5;
+       uint32_t rdma_monitor_supp:1;
        /* Log of minimal HWS counters created hard coded. */
        uint32_t hws_max_nb_counters; /* Maximal number for HWS counters. */
        uint32_t max_port; /* Maximal IB device port index. */
-- 
2.34.1

Reply via email to