Hi Jon, thanks for your time.
You're absolutely correct regarding the network configuration. The node boots
up with PXE from a single network card and then it creates the LACP bond. On
the switch side that was achieved with LACP fallback. So yes, ip and
nicips.bond0 are the same. It's the single interface that is later on bonded.
Regarding NetworkManager, I thought xCAT was using it to configure its
network. So there's a way to disable it?
Finally, the networkconfig script works if I add it to the postbootscripts
section instead of postscripts. In the latter I end up with the server without
any network. Console login is unavailable because root password does not work
either, so I can't check what happened. Don't know why.
Anyway, here's the files:
Bondies first:
[root@login ~]# cat /etc/sysconfig/network-scripts/ifcfg-xcat-bond-bond0
BONDING_OPTS="mode=802.3ad miimon=100"
TYPE=Bond
BONDING_MASTER=yes
HWADDR=
MTU=1500
PROXY_METHOD=none
BROWSER_ONLY=no
BOOTPROTO=none
IPADDR=172.26.255.253
PREFIX=16
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6INIT=yes
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
IPV6_ADDR_GEN_MODE=stable-privacy
NAME=xcat-bond-bond0
UUID=c7b9cbeb-2966-4e41-88a5-be212ca4b675
DEVICE=bond0
ONBOOT=yes
AUTOCONNECT_PRIORITY=9
AUTOCONNECT_RETRIES=0
AUTOCONNECT_SLAVES=yes
[root@login ~]# cat
/etc/sysconfig/network-scripts/ifcfg-xcat-bond-slave-ens1f0np0
MTU=1500
TYPE=Ethernet
NAME=xcat-bond-slave-ens1f0np0
UUID=b9738cbf-908e-4c6f-8dad-29f422f75d11
DEVICE=ens1f0np0
ONBOOT=yes
AUTOCONNECT_PRIORITY=9
AUTOCONNECT_RETRIES=0
MASTER_UUID=c7b9cbeb-2966-4e41-88a5-be212ca4b675
MASTER=bond0
SLAVE=yes
[root@login ~]# cat
/etc/sysconfig/network-scripts/ifcfg-xcat-bond-slave-ens1f1np1
MTU=1500
TYPE=Ethernet
NAME=xcat-bond-slave-ens1f1np1
UUID=667dc3c7-f2d9-44ea-9342-5e20fe54d8a8
DEVICE=ens1f1np1
ONBOOT=yes
AUTOCONNECT_PRIORITY=9
AUTOCONNECT_RETRIES=0
MASTER_UUID=c7b9cbeb-2966-4e41-88a5-be212ca4b675
MASTER=bond0
SLAVE=yes
This is not supposed to be here:
[root@login ~]# cat /etc/sysconfig/network-scripts/ifcfg-xcat-ens1f0np0
TYPE=Ethernet
PROXY_METHOD=none
BROWSER_ONLY=no
BOOTPROTO=none
IPADDR=172.26.255.253
PREFIX=16
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6INIT=yes
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
IPV6_ADDR_GEN_MODE=stable-privacy
NAME=xcat-ens1f0np0
UUID=dd90bcec-1520-403e-a384-7edb2a53756b
DEVICE=ens1f0np0
ONBOOT=no
AUTOCONNECT_PRIORITY=9
MTU=1500
GATEWAY=172.26.255.254
That's the WAN interface:
[root@login ~]# cat /etc/sysconfig/network-scripts/ifcfg-xcat-eno1
TYPE=Ethernet
PROXY_METHOD=none
BROWSER_ONLY=no
BOOTPROTO=none
IPADDR=XXX.XXX.XXX.XXX
PREFIX=26
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6INIT=yes
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
IPV6_ADDR_GEN_MODE=stable-privacy
NAME=xcat-eno1
UUID=741bff50-38b7-4849-ac27-9caa3ffeb329
DEVICE=eno1
ONBOOT=yes
AUTOCONNECT_PRIORITY=9
MTU=1500
GATEWAY=172.26.255.254
That's a VLAN on top of the bond:
[root@login ~]# cat /etc/sysconfig/network-scripts/ifcfg-xcat-vlan-bond0.1010
VLAN=yes
TYPE=Vlan
PHYSDEV=bond0
VLAN_ID=1010
REORDER_HDR=yes
GVRP=no
MVRP=no
HWADDR=
MTU=1500
PROXY_METHOD=none
BROWSER_ONLY=no
BOOTPROTO=none
IPADDR=10.0.255.253
PREFIX=24
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6INIT=yes
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
IPV6_ADDR_GEN_MODE=stable-privacy
NAME=xcat-vlan-bond0.1010
UUID=e45236da-5054-4074-812a-9693d9b7771a
ONBOOT=yes
AUTOCONNECT_PRIORITY=9
AUTOCONNECT_RETRIES=0
[root@login ~]# ip -o a
1: lo inet 127.0.0.1/8 scope host lo\ valid_lft forever preferred_lft
forever
1: lo inet6 ::1/128 scope host \ valid_lft forever preferred_lft
forever
3: eno1 inet XXX.XXX.XXX.XXX/26 brd XXX.XXX.XXX.255 scope global
noprefixroute eno1\ valid_lft forever preferred_lft forever
3: eno1 inet6 fe80::2c4c:dee5:2256:adb5/64 scope link noprefixroute \
valid_lft forever preferred_lft forever
10: ib0 inet 172.27.255.253/16 brd 172.27.255.255 scope global noprefixroute
ib0\ valid_lft forever preferred_lft forever
10: ib0 inet6 fe80::63f:7203:e2:3cb6/64 scope link \ valid_lft forever
preferred_lft forever
11: virbr0 inet 192.168.122.1/24 brd 192.168.122.255 scope global virbr0\
valid_lft forever preferred_lft forever
13: bond0 inet 172.26.255.253/16 brd 172.26.255.255 scope global
noprefixroute bond0\ valid_lft forever preferred_lft forever
13: bond0 inet6 fe80::7b68:313e:d868:2c54/64 scope link noprefixroute \
valid_lft forever preferred_lft forever
14: bond0.1010 inet 10.0.255.253/24 brd 10.0.255.255 scope global
noprefixroute bond0.1010\ valid_lft forever preferred_lft forever
14: bond0.1010 inet6 fe80::58e6:efd:80d3:5a68/64 scope link noprefixroute \
valid_lft forever preferred_lft forever
Finally the non xCAT ifcfgs and you guessed right. DHCP is enabled in one of
the slaves of the bond, this is effectively a conflict; look at ens1f0np0
(which is the PXE fallback interface).
[root@login ~]# cat /etc/sysconfig/network-scripts/ifcfg-ens1f1np1
TYPE=Ethernet
PROXY_METHOD=none
BROWSER_ONLY=no
BOOTPROTO=dhcp
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6INIT=yes
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
NAME=ens1f1np1
UUID=dac4a797-99fd-4d7f-92e4-8c0c257e9128
DEVICE=ens1f1np1
ONBOOT=no
[root@login ~]# cat /etc/sysconfig/network-scripts/ifcfg-ens1f0np0
# Generated by parse-kickstart
TYPE=Ethernet
DEVICE=ens1f0np0
UUID=beba2419-7903-4f09-8e61-16b51b66d835
ONBOOT=yes
BOOTPROTO=dhcp
IPV6INIT=yes
PROXY_METHOD=none
BROWSER_ONLY=no
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
NAME="System ens1f0np0"
So Jon, what's your suggestion here? Would you mind sharing how you fixed this?
Thank you so much.
Vinícius
PS: routes are messed up too:
[root@login ~]# ip r
default via 172.26.255.254 dev eno1 proto static metric 103 <=== this should
not be here, this is the WAN interface.
default via 172.26.255.254 dev bond0 proto static metric 300
default via 172.26.255.254 dev bond0.1010 proto static metric 400 <=== this
should not be here, this is a VLAN...
10.0.255.0/24 dev bond0.1010 proto kernel scope link src 10.0.255.253 metric 400
XXX.XXX.XXX.192/26 dev eno1 proto kernel scope link src 143.106.42.244 metric
103 <=== this should be the default route.
172.26.0.0/16 dev bond0 proto kernel scope link src 172.26.255.253 metric 300
172.26.255.254 dev eno1 proto static scope link metric 103
172.26.255.254 dev bond0.1010 proto static scope link metric 400
172.27.0.0/16 dev ib0 proto kernel scope link src 172.27.255.253 metric 150
192.168.122.0/24 dev virbr0 proto kernel scope link src 192.168.122.1 linkdown
On 16 Jun 2021, at 05:11, Jon Diprose
<[email protected]<mailto:[email protected]>> wrote:
Hi Vinícius,
I don't have masses to contribute on this other than that we generally disable
NetworkManager (why on earth would I want my server dynamically reconfiguring
its network?) but I am curious about your bonding set-up. Am I right in
thinking that you are setting up a bond for the primary nic (the one xCAT talks
to)? It looks like the values for 'ip' and 'nicips.bond0' are the same. I never
got that to work with the xCAT-supplied postscripts and had to write my own to
do it, plus specifying some additional install-time kernel params and the
relevant switch config. So does your way actually generate the correct ifcfg
files?
To me, your symptoms are consistent with an interface that is still dhcp-ing
but receiving an empty dns config, or an ifcfg file with an empty "DNS=" param.
And even with NetworkManager disabled I routinely add "PEERDNS=no" (and
"DEFROUTE=no") to the nicextraparams.* setting for all secondary nics, though
it is the route that usually bites me there.
Could you share `ip -o a` and the ifcfg files?
Jon
--
Dr. Jonathan Diprose <[email protected]<mailto:[email protected]>>
Tel: 01865 287873
Research Computing Manager
Henry Wellcome Building for Genomic Medicine
Roosevelt Drive, Headington, Oxford OX3 7BN
________________________________
From: Vinícius Ferrão via xCAT-user
[[email protected]<mailto:[email protected]>]
Sent: 16 June 2021 04:15
To: xCAT Users Mailing list
Cc: Vinícius Ferrão
Subject: Re: [xcat-user] /etc/resolv.conf missing nameserver on install nodes
I was able to at least stop /etc/resolv.conf from being overwritten at every
reboot with the following file:
# cat /etc/NetworkManager/conf.d/90-dns-none.conf
[main]
dns=none
I added this to the synclists and we are good about the /etc/resolv.conf issue.
The conclusion is that NetworkManager was doing something wrong on
/etc/resolv.conf.
Although that was fixed with a hack there are consequences of it, the hostname of
the machine is set as localhost.localdomain, and I don't know how to fix it. Is
there any option in the node table to set the default hostname? So
confignetwork can do its job?
# lsdef login
Object name: login
arch=x86_64
bmc=172.25.255.253
bmcpassword=calvin
bmcusername=root
cons=ipmi
consoleenabled=1
currchain=boot
currstate=install ol8.4.0-x86_64-compute
groups=login,all
ip=172.26.255.253
mac=2c:ea:7f:92:aa:d9
mgt=ipmi
netboot=xnba
nicdevices.bond0=ens1f0np0|ens1f1np1
nicdevices.bond0.1010=bond0
nichostnamesuffixes.bond0.1010=-ceph
nicips.ib0=172.27.255.253
nicips.eno1=XXX.XXX.XXX.XXX
nicips.bond0=172.26.255.253
nicips.bond0.1010=10.0.255.253
nicnetworks.ib0=application
nicnetworks.eno1=site
nicnetworks.bond0=management
nicnetworks.bond0.1010=ceph
nictypes.ens1f1np1=ethernet
nictypes.bond0=bond
nictypes.eno1=ethernet
nictypes.ib0=Infiniband
nictypes.bond0.1010=vlan
nictypes.ens1f0np0=ethernet
os=ol8.4.0
postbootscripts=otherpkgs,versatushpc/openpbs-login,versatushpc/fix-ohpc-login
postscripts=syslog,remoteshell,syncfiles,confignetwork,versatushpc/postinstall-login
profile=compute
provmethod=ol8.4.0-x86_64-install-login
serialport=0
serialspeed=115200
status=powering-on
statustime=06-15-2021 16:29:52
updatestatus=failed
updatestatustime=06-15-2021 16:27:27
Thanks,
Vinícius.
On 14 Jun 2021, at 13:48, Vinícius Ferrão via xCAT-user
<[email protected]<mailto:[email protected]>> wrote:
Hi Thomas,
There's a pattern that I've found. When the compute node is simple enough it
works, probably the data for resolv.conf is fetched directly from DHCP which
should be configured correctly.
The issue is around the nodes that have custom network schemes, like bonds and
VLANs; it's something wrong during the confignetwork postscript. Probably due
to a configuration mistake that I've made but I don't know which one.
Regarding your questions:
1) It does not exist
[root@ceph01-ib0 ~]# systemctl status systemd-networkd
Unit systemd-networkd.service could not be found.
2) It's running
[root@ceph01-ib0 ~]# systemctl status NetworkManager
● NetworkManager.service - Network Manager
Loaded: loaded (/usr/lib/systemd/system/NetworkManager.service; enabled;
vendor preset: enabled)
Active: active (running) since Mon 2021-06-14 13:37:20 -03; 8min ago
Docs: man:NetworkManager(8)
Main PID: 2028 (NetworkManager)
Tasks: 3 (limit: 2464038)
Memory: 11.4M
CGroup: /system.slice/NetworkManager.service
└─2028 /usr/sbin/NetworkManager --no-daemon
3) It does not exist:
[root@ceph01-ib0 ~]# ls -l /etc/resolv.conf
-rw-r--r-- 1 root root 65 Jun 14 13:37 /etc/resolv.conf
[root@ceph01-ib0 ~]# ls -l /run/systemd/resolv/resolv.conf
ls: cannot access '/run/systemd/resolv/resolv.conf': No such file or directory
Cannot find anything related to rc-manager, is this a systemd thing?
4) No it's not.
[root@ceph01-ib0 ~]# ls -l /etc/resolv.conf
-rw-r--r-- 1 root root 65 Jun 14 13:37 /etc/resolv.conf
5) Seems default to me
[root@ceph01-ib0 ~]# grep host /etc/nsswitch.conf
# Valid databases are: aliases, ethers, group, gshadow, hosts,
# myhostname Use systemd host names
hosts: files dns myhostname
That's it.
It's probably something messy with confignetwork script, but not sure what.
Thanks,
On 14 Jun 2021, at 07:57, Thomas HUMMEL
<[email protected]<mailto:[email protected]>> wrote:
On 14/06/2021 07:41, Vinícius Ferrão via xCAT-user wrote:
Hello,
For unknown reasons nodes that I've installed with rinstall (using stateful
method) didn't get the nameserver section in resolv.conf, basically leaving the
node without any name resolution.
Hello,
assuming it is not an xCAT bug, I would look at
1) if systemd-networkd is enabled
2) if NetworkManager is enabled
3) if 2), whether it handles /etc/resolv.conf, by looking at its conf and
a) is dns= stated ?
b) is /etc/resolv.conf a symlink to /run/systemd/resolv/resolv.conf ?
c) is rc-manager stated ?
4) is /etc/resolv.conf a symlink to ../run/resolvconf/resolv.conf ?
5) the host line of /etc/nsswitch.conf
to figure out who manages /etc/resolv.conf
Hope it helps.
--
Thomas HUMMEL
rc-manager=
As specified on the documentation
https://xcat-docs.readthedocs.io/en/stable/advanced/domain_name_resolution/domain_name_resolution.html<https://urldefense.com/v3/__https://xcat-docs.readthedocs.io/en/stable/advanced/domain_name_resolution/domain_name_resolution.html__;!!JFdNOqOXpB6UZW0!91ZLw8JQX3n5Rscdto49z3zhxcPMupJEn1wtuLVOZFrMI5loio5BEgk3-82bVMwzYliuCA$>;
it should be generated if nameservers and domain are provided on the site
table: The resolv.conf files for the compute nodes will be created
automatically using the domain and nameservers values set in the xCAT network
or site definition.
Both are defined but it still didn't generate it correctly.
[root@headnode ~]# lsdef -t site clustersite | egrep "nameserver|forward|domain"
domain=cluster.domain.tld
forwarders=1.1.1.1
nameservers=172.26.255.254
I even tried adding the nameservers to the network definition, but it was a no
go:
[root@headnode ~]# lsdef -t network management
Object name: management
gateway=<xcatmaster>
mask=255.255.0.0
mgtifname=bond0
mtu=1500
nameservers=172.26.255.254
net=172.26.0.0
tftpserver=<xcatmaster>
Is there anything that I can do to debug this?
Thanks,
Vinícius.
PS: Here's full data from a given node and the networks.
[root@headnode ~]# lsdef ceph01
Object name: ceph01
arch=x86_64
bmc=172.25.254.1
bmcpassword=calvin
bmcusername=root
cons=ipmi
consoleenabled=1
currchain=boot
currstate=install ol8.4.0-x86_64-compute
groups=ceph,all
ip=172.26.254.1
mac=bc:97:e1:ea:08:b0
mgt=ipmi
netboot=xnba
nicdevices.bond0.123=bond0
nicdevices.bond0.1010=bond0
nicdevices.bond0=ens1f0np0|ens1f1np1
nichostnamesuffixes.bond0.1010=-ceph
nichostnamesuffixes.bond0.123=-cephsync
nicips.ib0=172.27.254.1
nicips.bond0=172.26.254.1
nicips.bond0.1010=10.0.10.21
nicips.bond0.123=192.168.168.21
nicnetworks.bond0.123=ceph-sync
nicnetworks.ib0=application
nicnetworks.bond0.1010=ceph
nicnetworks.bond0=management
nictypes.ib0=Infiniband
nictypes.ens1f0np0=ethernet
nictypes.bond0.1010=vlan
nictypes.bond0=bond
nictypes.ens1f1np1=ethernet
nictypes.bond0.123=vlan
os=ol8.4.0
postbootscripts=otherpkgs,confignics
postscripts=syslog,remoteshell,syncfiles,confignetwork,versatushpc/postinstall-ceph
profile=compute
provmethod=ol8.4.0-x86_64-install-ceph
serialport=0
serialspeed=115200
status=booted
statustime=06-14-2021 02:37:04
updatestatus=synced
updatestatustime=06-14-2021 02:01:55
[root@headnode ~]# lsdef -t network
application (network)
ceph (network)
ceph-sync (network)
libvirt (network)
management (network)
service (network)
site (network)
_______________________________________________
xCAT-user mailing list
[email protected]<mailto:[email protected]>
https://urldefense.com/v3/__https://lists.sourceforge.net/lists/listinfo/xcat-user__;!!JFdNOqOXpB6UZW0!91ZLw8JQX3n5Rscdto49z3zhxcPMupJEn1wtuLVOZFrMI5loio5BEgk3-82bVMxD4UfdFg$
_______________________________________________
xCAT-user mailing list
[email protected]<mailto:[email protected]>
https://lists.sourceforge.net/lists/listinfo/xcat-user
_______________________________________________
xCAT-user mailing list
[email protected]<mailto:[email protected]>
https://lists.sourceforge.net/lists/listinfo/xcat-user
_______________________________________________
xCAT-user mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/xcat-user