Hello, it crashed again. I've never used 7.2 with this configuration and environment. What I do is simply feed the kernel routing table from the BGP RIB (learn a route from the RIB, then add it to the kernel as a multipath route). Since OpenBGPD doesn't support multipath routing, I'm trying to handle it via my Go software.
Here it is: uvm_fault(0xfffffd823b8b6448, 0x8, 0, 1) -> e kernel: page fault trap, code=0 Stopped at srp_get_locked+0x11: movq 0(%rdi),%rax TID PID UID PRFLAGS PFLAGS CPU COMMAND * 69167 59539 0 0 0 0 bgpd 337353 18233 0 0x2 0x4000000 3 mine-core 148193 18233 0 0x2 0x4000080 1 mine-core 186456 18233 0 0x2 0x4000000 2 mine-core srp_get_locked(8) at srp_get_locked+0x11 rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpath_reprio+0x175 rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstate_change+0xcd rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_output+0x71e route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57 sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofilewritev+0x14d sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+0xd2 syscall(ffff8000224e5760) at syscall+0x3d4 Xsyscall() at Xsyscall+0x128 end of kernel end trace frame: 0x776ee2e48340, count: 4 https://www.openbsd.org/ddb.html describes the minimum info required in bug reports. Insufficient info makes it difficult to find and fix bugs. 
ddb{0}> show panic *cpu0: uvm_fault(0xfffffd823b8b6448, 0x8, 0, 1) -> e ddb{0}> trace srp_get_locked(8) at srp_get_locked+0x11 rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpat h_ reprio+0x175 rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstat e_ change+0xcd rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_ou tp ut+0x71e route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57 sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofil ew ritev+0x14d sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+ 0x d2 syscall(ffff8000224e5760) at syscall+0x3d4 Xsyscall() at Xsyscall+0x128 end of kernel end trace frame: 0x776ee2e48340, count: -11 ddb{0}> show malloc Type InUse MemUse HighUse Limit Requests Type Lim devbuf 12877 18897K 18897K 78643K 36405 0 pcb 13 14K 18K 78643K 21 0 rtable 443 61K 75K 78643K 6370 0 ifaddr 225 50K 51K 78643K 988 0 sysctl 3 1K 1K 78643K 3 0 counters 214 83K 83K 78643K 214 0 ioctlops 0 0K 4K 78643K 9241 0 iov 0 0K 2K 78643K 153 0 mount 8 8K 8K 78643K 9 0 log 0 0K 0K 78643K 1 0 vnodes 1220 77K 79K 78643K 3836 0 UFS quota 1 32K 32K 78643K 1 0 UFS mount 31 69K 69K 78643K 39 0 shm 2 1K 1K 78643K 2 0 VM map 2 1K 1K 78643K 2 0 sem 2 10K 10K 78643K 4 0 dirhash 138 25K 25K 78643K 180 0 ACPI 3556 426K 453K 78643K 13924 0 file desc 28 19K 23K 78643K 4162 0 sigio 1 0K 0K 78643K 1 0 proc 108 85K 93K 78643K 9005 0 MFS node 6 0K 0K 78643K 6 0 NFS srvsock 1 0K 0K 78643K 1 0 NFS daemon 1 16K 16K 78643K 1 0 in_multi 35 2K 2K 78643K 70 0 ether_multi 2 0K 0K 78643K 2 0 ISOFS mount 1 32K 32K 78643K 1 0 MSDOSFS mount 1 16K 16K 78643K 1 0 ttys 25 122K 122K 78643K 25 0 exec 0 0K 1K 78643K 9608 0 pfkey data 0 0K 0K 78643K 2 0 tdb 3 0K 0K 78643K 3 0 pagedep 1 8K 8K 78643K 1 0 inodedep 1 32K 32K 78643K 
1 0 newblk 1 0K 0K 78643K 1 0 VM swap 8 1646K 1648K 78643K 10 0 UVM amap 3317 308K 311K 78643K 94660 0 UVM aobj 3 2K 2K 78643K 3 0 USB 11 10K 10K 78643K 13 0 USB device 4 0K 0K 78643K 4 0 USB HC 1 0K 0K 78643K 1 0 memdesc 1 4K 4K 78643K 1 0 crypto data 18 258K 258K 78643K 18 0 NDP 43 0K 0K 78643K 43 0 temp 187 5824K 5888K 78643K 206641 0 kqueue 71 130K 138K 78643K 397 0 SYN cache 2 16K 16K 78643K 2 0 ddb{0}> trace srp_get_locked(8) at srp_get_locked+0x11 rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpat h_ reprio+0x175 rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstat e_ change+0xcd rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_ou tp ut+0x71e route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57 sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofil ew ritev+0x14d sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+ 0x d2 syscall(ffff8000224e5760) at syscall+0x3d4 Xsyscall() at Xsyscall+0x128 end of kernel end trace frame: 0x776ee2e48340, count: -11 ddb{0}> machine ddbcpu 0 Invalid cpu 0 ddb{0}> machine ddbcpu 1 P.S. It didn't respond to machine ddb commands so i were just able to get these outputs from ddb. ________________________________ From: Alexander Bluhm <alexander.bl...@gmx.net> Sent: Thursday, July 6, 2023 19:17 To: Valdrin MUJA <valdrin_m...@outlook.com> Cc: bugs@openbsd.org <bugs@openbsd.org> Subject: Re: kernel diagnostic assertion "!_kernel_lock_held()" failed On Thu, Jul 06, 2023 at 02:14:09PM +0000, Valdrin MUJA wrote: > I've applied your patch but crashed again. Here it is: > ddb{1}> show panic > *cpu1: kernel diagnostic assertion "refcnt_read(&rt->rt_refcnt) >= 2" failed: > f > ile "/usr/src/sys/net/rtable.c", line 828 This kassert I added seems to be wrong. 
I copied it from above without thinking enough. Just remove it, updated diff below. I compared your crash 3 and 4 output: TEST1> uvm_fault(0xfffffd826717bcc0, 0x8, 0, 1) -> e kernel: page fault trap, code=0 Stopped at srp_get_locked+0x11: movq 0(%rdi),%rax TID PID UID PRFLAGS PFLAGS CPU COMMAND *225335 47125 0 0 0 1 bgpd 231752 78299 73 0x1100010 0 3 syslogd 344909 6421 0 0x14000 0x200 2 wg_handshake 361415 98860 0 0x14000 0x200 0 reaper SPOKE1> uvm_fault(0xfffffd81d5995878, 0x8, 0, 1) -> e kernel: page fault trap, code=0 Stopped at srp_get_locked+0x11: movq 0(%rdi),%rax TID PID UID PRFLAGS PFLAGS CPU COMMAND 448769 98731 0 0x100002 0 3 sh 350289 69698 73 0x1100010 0 0 syslogd *114462 84824 0 0 0 1 bgpd 256495 50081 0 0x14000 0x200 2 wg_handshake It is interesting that bgpd and wireguard are running in both cases when it crashes. Unfortunately your mail does not include this output for crash 1 and 2. It is printed immediately when the machine crashes. Do you have it in some console history? I see a lot of different workloads on your machine. That makes it harder to identify the subsystem that has the bug. I see bgpd(8) and wg(2) doing things with network and routing. Is there something else? What has changed to make these crashes happen? New workload? New machine? Upgrade to 7.3? Was it stable with 7.2? ... Thanks for testing. 
bluhm Index: net/rtable.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v retrieving revision 1.82 diff -u -p -r1.82 rtable.c --- net/rtable.c 19 Apr 2023 17:42:47 -0000 1.82 +++ net/rtable.c 6 Jul 2023 15:56:04 -0000 @@ -604,6 +604,11 @@ rtable_insert(unsigned int rtableid, str SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next); prev = art_insert(ar, an, addr, plen); + if (prev == an) { + rw_exit_write(&ar->ar_lock); + /* keep the refcount for rt while it is in an_rtlist */ + return (0); + } if (prev != an) { SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); @@ -689,9 +694,10 @@ rtable_delete(unsigned int rtableid, str npaths++; if (npaths > 1) { - KASSERT(refcnt_read(&rt->rt_refcnt) >= 1); + KASSERT(refcnt_read(&rt->rt_refcnt) >= 2); SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); + rtfree(rt); mrt = SRPL_FIRST_LOCKED(&an->an_rtlist); if (npaths == 2) @@ -703,8 +709,9 @@ rtable_delete(unsigned int rtableid, str if (art_delete(ar, an, addr, plen) == NULL) panic("art_delete failed to find node %p", an); - KASSERT(refcnt_read(&rt->rt_refcnt) >= 1); + KASSERT(refcnt_read(&rt->rt_refcnt) >= 2); SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); + rtfree(rt); art_put(an); leave: @@ -821,12 +828,10 @@ rtable_mpath_reprio(unsigned int rtablei */ rt->rt_priority = prio; } else { - rtref(rt); /* keep rt alive in between remove and insert */ SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); rt->rt_priority = prio; rtable_mpath_insert(an, rt); - rtfree(rt); error = EAGAIN; } rw_exit_write(&ar->ar_lock); @@ -839,6 +844,9 @@ rtable_mpath_insert(struct art_node *an, { struct rtentry *mrt, *prt = NULL; uint8_t prio = rt->rt_priority; + + /* increment the refcount for rt while it is in an_rtlist */ + rtref(rt); if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) { SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);