The Mellanox NICs starting from ConnectX-5 support LAG over
NIC ports internally, implemented by the NIC firmware and hardware.
The multiport NIC presents multiple physical PCI functions (PF),
with SR-IOV multiple virtual PCI functions (VFs) might be presented.
With switchdev mode the VF representors are engaged and PFs and their
VFs are connected by internal E-Switch feature. Each PF and related VFs
have dedicated E-Switch and belong to dedicated switch domain.

If NIC ports are combined to support NIC the kernel drivers introduce
the single unified Infiniband multiport devices, and all only one
unified E-Switch with single switch domain combines master PF
all all VFs. No extra DPDK bonding device is needed.

Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com>
---
 drivers/net/mlx5/mlx5.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 701da7e..12eed13 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -169,6 +169,7 @@ struct mlx5_dev_spawn_data {
        uint32_t ifindex; /**< Network interface index. */
        uint32_t max_port; /**< IB device maximal port index. */
        uint32_t ibv_port; /**< IB device physical port index. */
+       int pf_bond; /**< bonding device PF index. < 0 - no bonding */
        struct mlx5_switch_info info; /**< Switch information. */
        struct ibv_device *ibv_dev; /**< Associated IB device. */
        struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
@@ -2119,6 +2120,108 @@ struct mlx5_dev_spawn_data {
 }
 
 /**
+ * Match PCI information for possible slaves of bonding device.
+ *
+ * @param[in] ibv_dev
+ *   Pointer to Infiniband device structure.
+ * @param[in] pci_dev
+ *   Pointer to PCI device structure to match PCI address.
+ * @param[in] nl_rdma
+ *   Netlink RDMA group socket handle.
+ *
+ * @return
+ *   negative value if no bonding device found, otherwise
+ *   positive index of slave PF in bonding.
+ */
+static int
+mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
+                          const struct rte_pci_device *pci_dev,
+                          int nl_rdma)
+{
+       char ifname[IF_NAMESIZE + 1];
+       unsigned int ifindex;
+       unsigned int np, i;
+       FILE *file = NULL;
+       int pf = -1;
+
+       /*
+        * Try to get master device name. If something goes
+        * wrong suppose the lack of kernel support and no
+        * bonding devices.
+        */
+       if (nl_rdma < 0)
+               return -1;
+       if (!strstr(ibv_dev->name, "bond"))
+               return -1;
+       np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
+       if (!np)
+               return -1;
+       /*
+        * The Master device might not be on the predefined
+        * port (not on port index 1, it is not garanted),
+        * we have to scan all Infiniband device port and
+        * find master.
+        */
+       for (i = 1; i <= np; ++i) {
+               /* Check whether Infiniband port is populated. */
+               ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
+               if (!ifindex)
+                       continue;
+               if (!if_indextoname(ifindex, ifname))
+                       continue;
+               /* Try to read bonding slave names from sysfs. */
+               MKSTR(slaves,
+                     "/sys/class/net/%s/master/bonding/slaves", ifname);
+               file = fopen(slaves, "r");
+               if (file)
+                       break;
+       }
+       if (!file)
+               return -1;
+       MKSTR(format, "%c%us", '%', (unsigned int)(sizeof(ifname) - 1));
+
+       /* Use safe format to check maximal buffer length. */
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+       while (fscanf(file, format, ifname) == 1) {
+#pragma GCC diagnostic error "-Wformat-nonliteral"
+               char tmp_str[IF_NAMESIZE + 32];
+               struct rte_pci_addr pci_addr;
+               struct mlx5_switch_info info;
+
+               /* Process slave interface names in the loop. */
+               snprintf(tmp_str, sizeof(tmp_str),
+                        "/sys/class/net/%s", ifname);
+               if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
+                       DRV_LOG(WARNING, "can not get PCI address"
+                                        " for netdev \"%s\"", ifname);
+                       continue;
+               }
+               if (pci_dev->addr.domain != pci_addr.domain ||
+                   pci_dev->addr.bus != pci_addr.bus ||
+                   pci_dev->addr.devid != pci_addr.devid ||
+                   pci_dev->addr.function != pci_addr.function)
+                       continue;
+               /* Slave interface PCI address match found. */
+               fclose(file);
+               snprintf(tmp_str, sizeof(tmp_str),
+                        "/sys/class/net/%s/phys_port_name", ifname);
+               file = fopen(tmp_str, "rb");
+               if (!file)
+                       break;
+               info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
+               if (fscanf(file, "%32s", tmp_str) == 1)
+                       mlx5_translate_port_name(tmp_str, &info);
+               if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
+                   info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
+                       pf = info.port_name;
+               break;
+       }
+       if (file)
+               fclose(file);
+       return pf;
+}
+
+/**
  * DPDK callback to register a PCI device.
  *
  * This function spawns Ethernet devices out of a given PCI device.
@@ -2154,6 +2257,12 @@ struct mlx5_dev_spawn_data {
         * Actually this is the number of iterations to spawn.
         */
        unsigned int ns = 0;
+       /*
+        * Bonding device
+        *   < 0 - no bonding device (single one)
+        *  >= 0 - bonding device (value is slave PF index)
+        */
+       int bd = -1;
        struct mlx5_dev_spawn_data *list = NULL;
        struct mlx5_dev_config dev_config;
        int ret;
@@ -2185,6 +2294,30 @@ struct mlx5_dev_spawn_data {
                struct rte_pci_addr pci_addr;
 
                DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+               bd = mlx5_device_bond_pci_match
+                               (ibv_list[ret], pci_dev, nl_rdma);
+               if (bd >= 0) {
+                       /*
+                        * Bonding device detected. Only one match is allowed,
+                        * the bonding is supported over multi-port IB device,
+                        * there should be no matches on representor PCI
+                        * functions or non VF LAG bonding devices with
+                        * specified address.
+                        */
+                       if (nd) {
+                               DRV_LOG(ERR,
+                                       "multiple PCI match on bonding device"
+                                       "\"%s\" found", ibv_list[ret]->name);
+                               rte_errno = ENOENT;
+                               ret = -rte_errno;
+                               goto exit;
+                       }
+                       DRV_LOG(INFO, "PCI information matches for"
+                                     " slave %d bonding device \"%s\"",
+                                     bd, ibv_list[ret]->name);
+                       ibv_match[nd++] = ibv_list[ret];
+                       break;
+               }
                if (mlx5_dev_to_pci_addr
                        (ibv_list[ret]->ibdev_path, &pci_addr))
                        continue;
@@ -2220,6 +2353,13 @@ struct mlx5_dev_spawn_data {
                if (!np)
                        DRV_LOG(WARNING, "can not get IB device \"%s\""
                                         " ports number", ibv_match[0]->name);
+               if (bd >= 0 && !np) {
+                       DRV_LOG(ERR, "can not get ports"
+                                    " for bonding device");
+                       rte_errno = ENOENT;
+                       ret = -rte_errno;
+                       goto exit;
+               }
        }
        /*
         * Now we can determine the maximal
@@ -2235,7 +2375,7 @@ struct mlx5_dev_spawn_data {
                ret = -rte_errno;
                goto exit;
        }
-       if (np > 1) {
+       if (bd >= 0 || np > 1) {
                /*
                 * Single IB device with multiple ports found,
                 * it may be E-Switch master device and representors.
@@ -2244,12 +2384,14 @@ struct mlx5_dev_spawn_data {
                assert(nl_rdma >= 0);
                assert(ns == 0);
                assert(nd == 1);
+               assert(np);
                for (i = 1; i <= np; ++i) {
                        list[ns].max_port = np;
                        list[ns].ibv_port = i;
                        list[ns].ibv_dev = ibv_match[0];
                        list[ns].eth_dev = NULL;
                        list[ns].pci_dev = pci_dev;
+                       list[ns].pf_bond = bd;
                        list[ns].ifindex = mlx5_nl_ifindex
                                        (nl_rdma, list[ns].ibv_dev->name, i);
                        if (!list[ns].ifindex) {
@@ -2279,6 +2421,21 @@ struct mlx5_dev_spawn_data {
                                                (list[ns].ifindex,
                                                 &list[ns].info);
                        }
+                       if (!ret && bd >= 0) {
+                               switch (list[ns].info.name_type) {
+                               case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+                                       if (list[ns].info.port_name == bd)
+                                               ns++;
+                                       break;
+                               case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+                                       if (list[ns].info.pf_num == bd)
+                                               ns++;
+                                       break;
+                               default:
+                                       break;
+                               }
+                               continue;
+                       }
                        if (!ret && (list[ns].info.representor ^
                                     list[ns].info.master))
                                ns++;
@@ -2317,6 +2474,7 @@ struct mlx5_dev_spawn_data {
                        list[ns].ibv_dev = ibv_match[i];
                        list[ns].eth_dev = NULL;
                        list[ns].pci_dev = pci_dev;
+                       list[ns].pf_bond = -1;
                        list[ns].ifindex = 0;
                        if (nl_rdma >= 0)
                                list[ns].ifindex = mlx5_nl_ifindex
-- 
1.8.3.1

Reply via email to