This is my first attempt to configure Lustre for RDMA (Mellanox RoCEv2). Currently only the loopback network is configured:

lnetctl net show
net:
    - net type: lo
      local NI(s):
        - nid: 0@lo
          status: up

The following command results in an error. The interface (ens800f0) is working, and I can ping other nodes on that network.

lnetctl net add --net o2ib --if ens800f0
add:
    - net:
          errno: -100
          descr: "cannot add network: Network is down"

[root@inst-fknk9-relaxing-louse ~]# dmesg | tail
[ 1399.903159] Lustre: Lustre: Build Version: 2.12.6
[ 1427.411527] LNetError: 20092:0:(o2iblnd.c:2781:kiblnd_dev_failover()) Failed to bind ens800f0:192.168.169.112 to device( (null)): -19
[ 1427.564213] LNetError: 20092:0:(o2iblnd.c:3314:kiblnd_startup()) ko2iblnd: Can't initialize device: rc = -19
[ 1428.681259] LNetError: 105-4: Error -100 starting up LNI o2ib
[ 1474.343671] LNetError: 20260:0:(o2iblnd.c:2781:kiblnd_dev_failover()) Failed to bind ens800f0:192.168.169.112 to device( (null)): -19
[ 1474.496347] LNetError: 20260:0:(o2iblnd.c:3314:kiblnd_startup()) ko2iblnd: Can't initialize device: rc = -19
[ 1475.610993] LNetError: 105-4: Error -100 starting up LNI o2ib
[ 1535.441463] LNetError: 20549:0:(o2iblnd.c:2781:kiblnd_dev_failover()) Failed to bind ens800f0:192.168.169.112 to device( (null)): -19
[ 1535.594183] LNetError: 20549:0:(o2iblnd.c:3314:kiblnd_startup()) ko2iblnd: Can't initialize device: rc = -19
[ 1536.709841] LNetError: 105-4: Error -100 starting up LNI o2ib

The interface ens800f0 is the 100 Gbps RDMA Mellanox NIC:

ip addr
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
2: ens300f0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 qdisc mq state UP group default qlen 1000
    link/ether b8:ce:f6:25:ff:5e brd ff:ff:ff:ff:ff:ff
    inet 172.16.5.112/22 brd 172.16.7.255 scope global dynamic ens300f0
       valid_lft 84734sec preferred_lft 84734sec
3: ens300f1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
    link/ether b8:ce:f6:25:ff:5f brd ff:ff:ff:ff:ff:ff
4: ens800f0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 
qdisc mq state UP group default qlen 1000
    link/ether 04:3f:72:e3:08:42 brd ff:ff:ff:ff:ff:ff
    inet 192.168.169.112/22 brd 192.168.171.255 scope global ens800f0
       valid_lft forever preferred_lft forever
5: ens800f1: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN group default qlen 1000
    link/ether 04:3f:72:e3:08:43 brd ff:ff:ff:ff:ff:ff

OS: RHCK 7.9 3.10.0-1160.2.1.el7_lustre.x86_64
OFED: Mellanox ofed_info -n 4.9-3.1.5.0

cat /etc/lnet.conf
(the file is empty)

cat /etc/modprobe.d/lnet.conf
cat: /etc/modprobe.d/lnet.conf: No such file or directory

[root@inst-fknk9-relaxing-louse ~]# modprobe -v lustre
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/obdclass.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/ptlrpc.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/fld.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/fid.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/lov.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/osc.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/mdc.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/lmv.ko
insmod /lib/modules/3.10.0-1160.2.1.el7_lustre.x86_64/extra/lustre/fs/lustre.ko
[root@inst-fknk9-relaxing-louse ~]#

Based on discussion threads I found through a Google search, one thread suggested adding the following, but I still get the same error:

echo 'options lnet networks="o2ib(ens800f0)" ' > /etc/modprobe.d/lustre.conf
echo 'options lnet networks="o2ib(ens800f0)" ' > /etc/modprobe.d/lnet.conf

Thanks,
Pinkesh Valdria
Principal Solutions Architect – HPC
_______________________________________________ lustre-discuss mailing list lustre-discuss@lists.lustre.org http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org