On Wed, Apr 26, 2023 at 11:17:32AM +0200, Alexander Bluhm wrote:
> On Tue, Apr 25, 2023 at 11:57:09PM +0300, Vitaliy Makkoveev wrote:
> > On Tue, Apr 25, 2023 at 11:44:34AM +0200, Alexander Bluhm wrote:
> > > Hi,
> > >
> > > Mutex arp_mtx protects the llinfo_arp la_... fields. So kernel
> > > lock is only needed for changing the route rt_flags.
> > >
> > > Of course there is a race between checking and setting rt_flags.
> > > But the other checks of the RTF_REJECT flags were without kernel
> > > lock before. This does not cause trouble, the worst thing that may
> > > happen is to wait another expire time for ARP retry. My diff does
> > > not make it worse, reading rt_flags and rt_expire is done without
> > > lock anyway.
> > >
> > > The kernel lock is needed to change rt_flags. Testing without
> > > KERNEL_LOCK() caused crashes.
> > >
> >
> > Hi,
> >
> > I'm interested: is the system stable with the diff below? If so, we
> > could avoid kernel lock in the arpresolve().
>
> I could not crash it.
I was too fast. Just after writing this mail I restarted the test.
[0] 0:arp- 1:ksh* "ot31.obsd-lab.genua.d" 12:00
26-Apr-23
panic: pool_do_get: art_node free list modified:
page 0xfffffd8747128000; item addr 0xfffffd8747128410; offset
0x0=0x182f4660f2a7188a != 0x182f4660f2a71889
Stopped at db_enter+0x14: popq %rbp
TID PID UID PRFLAGS PFLAGS CPU COMMAND
45805 80626 0 0x14000 0x200 3 reaper
353816 99629 0 0x14000 0x200 1 softnet
487701 10647 0 0x14000 0x200 2 softnet
152789 43620 0 0x14000 0x200 7 softnet
*356742 68683 0 0x14000 0x200 5 softnet
db_enter() at db_enter+0x14
panic(ffffffff8213f5d0) at panic+0xc3
pool_do_get(ffffffff824ae060,a,ffff8000247b71b4) at pool_do_get+0x321
pool_get(ffffffff824ae060,a) at pool_get+0x9a
art_get(ffff8000027ceac0,20) at art_get+0x30
rtable_insert(0,ffff8000027ceac0,0,ffff8000247b72f0,3,fffffd8745e4a948) at rtab
le_insert+0x1a2
rtrequest(b,ffff8000247b73f8,3,ffff8000247b7498,0) at rtrequest+0x613
rt_clone(ffff8000247b7500,ffff8000247b7558,0) at rt_clone+0x77
rtalloc_mpath(ffff8000247b7558,fffffd800369aad8,0) at rtalloc_mpath+0x50
in_ouraddr(fffffd80a94fcd00,ffff80000077e048,ffff8000247b75d8) at in_ouraddr+0x
88
ip_input_if(ffff8000247b7678,ffff8000247b7684,4,0,ffff80000077e048) at ip_input
_if+0x1f0
ipv4_input(ffff80000077e048,fffffd80a94fcd00) at ipv4_input+0x3d
ether_input(ffff80000077e048,fffffd80a94fcd00) at ether_input+0x3b5
if_input_process(ffff80000077e048,ffff8000247b7768) at if_input_process+0x6f
end trace frame: 0xffff8000247b77b0, count: 0
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports. Insufficient info makes it difficult to find and fix bugs.
ddb{5}> show panic
*cpu5: pool_do_get: art_node free list modified: page 0xfffffd8747128000; item a
ddr 0xfffffd8747128410; offset 0x0=0x182f4660f2a7188a != 0x182f4660f2a71889
ddb{5}> show register
rdi 0
rsi 0x14
rbp 0xffff8000247b7060
rbx 0
rdx 0xc000000000000000
rcx 0x286
rax 0x9c
r8 0x101010101010101
r9 0
r10 0xcdd678d0954ec026
r11 0x92611e3f4c85263e
r12 0xffff80002252e990
r13 0
r14 0
r15 0xffffffff8213f5d0 cy_pio_rec+0x1ea86
rip 0xffffffff81b0f124 db_enter+0x14
cs 0x8
rflags 0x282
rsp 0xffff8000247b7060
ss 0x10
db_enter+0x14: popq %rbp
ddb{5}> trace
db_enter() at db_enter+0x14
panic(ffffffff8213f5d0) at panic+0xc3
pool_do_get(ffffffff824ae060,a,ffff8000247b71b4) at pool_do_get+0x321
pool_get(ffffffff824ae060,a) at pool_get+0x9a
art_get(ffff8000027ceac0,20) at art_get+0x30
rtable_insert(0,ffff8000027ceac0,0,ffff8000247b72f0,3,fffffd8745e4a948) at rtab
le_insert+0x1a2
rtrequest(b,ffff8000247b73f8,3,ffff8000247b7498,0) at rtrequest+0x613
rt_clone(ffff8000247b7500,ffff8000247b7558,0) at rt_clone+0x77
rtalloc_mpath(ffff8000247b7558,fffffd800369aad8,0) at rtalloc_mpath+0x50
in_ouraddr(fffffd80a94fcd00,ffff80000077e048,ffff8000247b75d8) at in_ouraddr+0x
88
ip_input_if(ffff8000247b7678,ffff8000247b7684,4,0,ffff80000077e048) at ip_input
_if+0x1f0
ipv4_input(ffff80000077e048,fffffd80a94fcd00) at ipv4_input+0x3d
ether_input(ffff80000077e048,fffffd80a94fcd00) at ether_input+0x3b5
if_input_process(ffff80000077e048,ffff8000247b7768) at if_input_process+0x6f
ifiq_process(ffff800000782400) at ifiq_process+0x75
taskq_thread(ffff800000036000) at taskq_thread+0x100
end trace frame: 0x0, count: -16
ddb{5}> ps
PID TID PPID UID S FLAGS WAIT COMMAND
3208 212022 39561 0 3 0x10008b sigsusp timeout
6160 324932 4896 0 3 0x100083 ttyin ksh
39561 309293 4896 0 3 0x10008b sigsusp ksh
4896 16573 1 0 3 0x100080 kqread tmux
91788 275888 20986 0 3 0x100083 kqread tmux
20986 27136 1 0 3 0x10008b sigsusp ksh
59912 394974 1 0 3 0x100098 kqread cron
25465 470631 1 99 3 0x1100090 kqread sndiod
55473 426797 1 110 3 0x100090 kqread sndiod
59948 369086 93818 95 3 0x1100092 kqread smtpd
91703 135835 93818 103 3 0x1100092 kqread smtpd
96840 246769 93818 95 3 0x1100092 kqread smtpd
52992 383912 93818 95 3 0x100092 kqread smtpd
99867 251002 93818 95 3 0x1100092 kqread smtpd
82244 385303 93818 95 3 0x1100092 kqread smtpd
93818 366740 1 0 3 0x100080 kqread smtpd
73222 145148 94559 89 3 0x1100092 kqread relayd
23499 417534 94559 89 3 0x1100092 kqread relayd
7002 187795 94559 89 3 0x1100092 kqread relayd
1328 96961 94559 89 3 0x1100092 kqread relayd
35216 236120 94559 89 3 0x1100092 kqread relayd
44997 23635 94559 89 3 0x1100092 kqread relayd
82930 19018 94559 89 3 0x1100092 kqread relayd
42026 16480 94559 89 3 0x1100092 kqread relayd
94559 253040 1 0 3 0x80 kqread relayd
89865 410855 1 0 3 0x88 kqread sshd
22339 294875 0 0 3 0x14280 nfsidl nfsio
89599 51352 0 0 3 0x14280 nfsidl nfsio
63849 361892 0 0 3 0x14280 nfsidl nfsio
30664 357919 0 0 3 0x14280 nfsidl nfsio
10499 399494 1 0 3 0x100080 kqread ntpd
97199 131290 30807 83 3 0x100092 kqread ntpd
30807 203991 1 83 3 0x1100092 kqread ntpd
26198 299506 99348 74 3 0x1100092 bpf pflogd
99348 201119 1 0 3 0x80 netio pflogd
96301 342337 50719 73 3 0x1100090 kqread syslogd
50719 204195 1 0 3 0x100082 netio syslogd
56409 269750 82339 77 3 0x100092 kqread dhcpleased
38043 210823 82339 77 3 0x100092 kqread dhcpleased
82339 321261 1 0 3 0x80 kqread dhcpleased
78726 171179 42866 115 3 0x100092 kqread slaacd
64825 522042 42866 115 3 0x100092 kqread slaacd
42866 121490 1 0 3 0x100080 kqread slaacd
27056 523309 0 0 3 0x14200 bored smr
60893 19168 0 0 3 0x14200 pgzero zerothread
53700 375669 0 0 3 0x14200 aiodoned aiodoned
42976 46427 0 0 3 0x14200 syncer update
71949 418239 0 0 3 0x14200 cleaner cleaner
80626 45805 0 0 7 0x14200 reaper
54869 43057 0 0 3 0x14200 pgdaemon pagedaemon
84624 479667 0 0 3 0x14200 usbtsk usbtask
34897 79859 0 0 3 0x14200 usbatsk usbatsk
66093 3735 0 0 3 0x40014200 acpi0 acpi0
86817 206525 0 0 3 0x40014200 idle7
91530 81236 0 0 7 0x40014200 idle6
26856 190910 0 0 3 0x40014200 idle5
59400 12485 0 0 7 0x40014200 idle4
46837 20343 0 0 3 0x40014200 idle3
13360 60775 0 0 3 0x40014200 idle2
75344 252894 0 0 3 0x40014200 idle1
59939 347814 0 0 3 0x14200 bored sensors
99629 353816 0 0 7 0x14200 softnet
10647 487701 0 0 7 0x14200 softnet
43620 152789 0 0 7 0x14200 softnet
*68683 356742 0 0 7 0x14200 softnet
13264 491700 0 0 3 0x14200 arttfini systqmp
59941 489236 0 0 3 0x14200 bored systq
17414 459279 0 0 3 0x40014200 bored softclock
84124 362612 0 0 7 0x40014200 idle0
1 38941 0 0 3 0x82 wait init
0 0 -1 0 3 0x10200 scheduler swapper
bluhm