Hello,

It crashed again.
I've never used 7.2 with this configuration and environment.
What I do is simply feed the kernel routing table from the BGP RIB table
(learn routes from the RIB table and add them as multipath routes).
Since OpenBGPD doesn't support multipath routing, I'm trying to manipulate the
kernel routing table via my Go software.

Here it is:
uvm_fault(0xfffffd823b8b6448, 0x8, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at      srp_get_locked+0x11:    movq    0(%rdi),%rax
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
* 69167  59539      0           0          0    0  bgpd
 337353  18233      0         0x2  0x4000000    3  mine-core
 148193  18233      0         0x2  0x4000080    1  mine-core
 186456  18233      0         0x2  0x4000000    2  mine-core
srp_get_locked(8) at srp_get_locked+0x11
rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpath_reprio+0x175
rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstate_change+0xcd
rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_output+0x71e
route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc
route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57
sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f
dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofilewritev+0x14d
sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+0xd2
syscall(ffff8000224e5760) at syscall+0x3d4
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x776ee2e48340, count: 4
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports.  Insufficient info makes it difficult to find and fix bugs.
ddb{0}> show panic
*cpu0: uvm_fault(0xfffffd823b8b6448, 0x8, 0, 1) -> e
ddb{0}> trace
srp_get_locked(8) at srp_get_locked+0x11
rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpath_reprio+0x175
rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstate_change+0xcd
rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_output+0x71e
route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc
route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57
sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f
dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofilewritev+0x14d
sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+0xd2
syscall(ffff8000224e5760) at syscall+0x3d4
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x776ee2e48340, count: -11
ddb{0}> show malloc
           Type InUse  MemUse  HighUse   Limit  Requests Type Lim
         devbuf 12877  18897K   18897K  78643K     36405        0
            pcb    13     14K      18K  78643K        21        0
         rtable   443     61K      75K  78643K      6370        0
         ifaddr   225     50K      51K  78643K       988        0
         sysctl     3      1K       1K  78643K         3        0
       counters   214     83K      83K  78643K       214        0
       ioctlops     0      0K       4K  78643K      9241        0
            iov     0      0K       2K  78643K       153        0
          mount     8      8K       8K  78643K         9        0
            log     0      0K       0K  78643K         1        0
         vnodes  1220     77K      79K  78643K      3836        0
      UFS quota     1     32K      32K  78643K         1        0
      UFS mount    31     69K      69K  78643K        39        0
            shm     2      1K       1K  78643K         2        0
         VM map     2      1K       1K  78643K         2        0
            sem     2     10K      10K  78643K         4        0
        dirhash   138     25K      25K  78643K       180        0
           ACPI  3556    426K     453K  78643K     13924        0
      file desc    28     19K      23K  78643K      4162        0
          sigio     1      0K       0K  78643K         1        0
           proc   108     85K      93K  78643K      9005        0
       MFS node     6      0K       0K  78643K         6        0
    NFS srvsock     1      0K       0K  78643K         1        0
     NFS daemon     1     16K      16K  78643K         1        0
       in_multi    35      2K       2K  78643K        70        0
    ether_multi     2      0K       0K  78643K         2        0
    ISOFS mount     1     32K      32K  78643K         1        0
  MSDOSFS mount     1     16K      16K  78643K         1        0
           ttys    25    122K     122K  78643K        25        0
           exec     0      0K       1K  78643K      9608        0
     pfkey data     0      0K       0K  78643K         2        0
            tdb     3      0K       0K  78643K         3        0
        pagedep     1      8K       8K  78643K         1        0
       inodedep     1     32K      32K  78643K         1        0
         newblk     1      0K       0K  78643K         1        0
        VM swap     8   1646K    1648K  78643K        10        0
       UVM amap  3317    308K     311K  78643K     94660        0
       UVM aobj     3      2K       2K  78643K         3        0
            USB    11     10K      10K  78643K        13        0
     USB device     4      0K       0K  78643K         4        0
         USB HC     1      0K       0K  78643K         1        0
        memdesc     1      4K       4K  78643K         1        0
    crypto data    18    258K     258K  78643K        18        0
            NDP    43      0K       0K  78643K        43        0
           temp   187   5824K    5888K  78643K    206641        0
         kqueue    71    130K     138K  78643K       397        0
      SYN cache     2     16K      16K  78643K         2        0
ddb{0}> trace
srp_get_locked(8) at srp_get_locked+0x11
rtable_mpath_reprio(0,ffff800007fdf9d0,18,30,fffffd820a457be8) at rtable_mpath_reprio+0x175
rt_if_linkstate_change(fffffd820a457be8,ffff800001ab6000,0) at rt_if_linkstate_change+0xcd
rtm_output(ffff800001c52a00,ffff8000224e5360,ffff8000224e52b8,30,0) at rtm_output+0x71e
route_output(fffffd8068c19b00,fffffd8237c29738) at route_output+0x3bc
route_send(fffffd8237c29738,fffffd8068c19b00,0,0) at route_send+0x57
sosend(fffffd8237c29738,0,ffff8000224e55f0,0,0,80) at sosend+0x37f
dofilewritev(ffff800022587380,6,ffff8000224e55f0,0,ffff8000224e56f0) at dofilewritev+0x14d
sys_writev(ffff800022587380,ffff8000224e5690,ffff8000224e56f0) at sys_writev+0xd2
syscall(ffff8000224e5760) at syscall+0x3d4
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x776ee2e48340, count: -11
ddb{0}> machine ddbcpu 0
Invalid cpu 0
ddb{0}> machine ddbcpu 1

P.S. The machine didn't respond to the "machine ddbcpu" commands, so I was only
able to get these outputs from ddb.
________________________________
From: Alexander Bluhm <alexander.bl...@gmx.net>
Sent: Thursday, July 6, 2023 19:17
To: Valdrin MUJA <valdrin_m...@outlook.com>
Cc: bugs@openbsd.org <bugs@openbsd.org>
Subject: Re: kernel diagnostic assertion "!_kernel_lock_held()" failed

On Thu, Jul 06, 2023 at 02:14:09PM +0000, Valdrin MUJA wrote:
> I've applied your patch but crashed again. Here it is:
> ddb{1}> show panic
> *cpu1: kernel diagnostic assertion "refcnt_read(&rt->rt_refcnt) >= 2" failed: file "/usr/src/sys/net/rtable.c", line 828

This kassert I added seems to be wrong.  I copied it from above
without thinking enough.  Just remove it, updated diff below.

I compared your crash 3 and 4 output:

TEST1> uvm_fault(0xfffffd826717bcc0, 0x8, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at      srp_get_locked+0x11:    movq    0(%rdi),%rax
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
*225335  47125      0           0          0    1  bgpd
 231752  78299     73   0x1100010          0    3  syslogd
 344909   6421      0     0x14000      0x200    2  wg_handshake
 361415  98860      0     0x14000      0x200    0  reaper

SPOKE1> uvm_fault(0xfffffd81d5995878, 0x8, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at      srp_get_locked+0x11:    movq    0(%rdi),%rax
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
 448769  98731      0    0x100002          0    3  sh
 350289  69698     73   0x1100010          0    0  syslogd
*114462  84824      0           0          0    1  bgpd
 256495  50081      0     0x14000      0x200    2  wg_handshake

It is interesting that bgpd and wireguard are running in both cases
when it crashes.  Unfortunately your mail does not include this
output for crash 1 and 2.  It is printed immediately when the machine
crashes.  Do you have it in some console history?

I see a lot of different workload on your machine.  That makes it
harder to identify the subsystem that has the bug.  I see bgpd(8)
and wg(2) doing things with network and routing.  Is there something
else?

What has changed to make these crashes happen?  New workload?  New
machine?  Upgrade to 7.3?  Was it stable with 7.2?  ...

Thanks for testing.

bluhm

Index: net/rtable.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v
retrieving revision 1.82
diff -u -p -r1.82 rtable.c
--- net/rtable.c        19 Apr 2023 17:42:47 -0000      1.82
+++ net/rtable.c        6 Jul 2023 15:56:04 -0000
@@ -604,6 +604,11 @@ rtable_insert(unsigned int rtableid, str
         SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);

         prev = art_insert(ar, an, addr, plen);
+       if (prev == an) {
+               rw_exit_write(&ar->ar_lock);
+               /* keep the refcount for rt while it is in an_rtlist */
+               return (0);
+       }
         if (prev != an) {
                 SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
                     rt_next);
@@ -689,9 +694,10 @@ rtable_delete(unsigned int rtableid, str
                 npaths++;

         if (npaths > 1) {
-               KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+               KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
                 SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
                     rt_next);
+               rtfree(rt);

                 mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
                 if (npaths == 2)
@@ -703,8 +709,9 @@ rtable_delete(unsigned int rtableid, str
         if (art_delete(ar, an, addr, plen) == NULL)
                 panic("art_delete failed to find node %p", an);

-       KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+       KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
         SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
+       rtfree(rt);
         art_put(an);

 leave:
@@ -821,12 +828,10 @@ rtable_mpath_reprio(unsigned int rtablei
                  */
                 rt->rt_priority = prio;
         } else {
-               rtref(rt); /* keep rt alive in between remove and insert */
                 SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
                     rt, rtentry, rt_next);
                 rt->rt_priority = prio;
                 rtable_mpath_insert(an, rt);
-               rtfree(rt);
                 error = EAGAIN;
         }
         rw_exit_write(&ar->ar_lock);
@@ -839,6 +844,9 @@ rtable_mpath_insert(struct art_node *an,
 {
         struct rtentry                  *mrt, *prt = NULL;
         uint8_t                          prio = rt->rt_priority;
+
+       /* increment the refcount for rt while it is in an_rtlist */
+       rtref(rt);

         if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
                 SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);

Reply via email to