> Date: Tue, 30 Apr 2024 11:08:13 +0200 > From: Martin Pieuchot <m...@openbsd.org> > > On 27/04/24(Sat) 13:44, Visa Hankala wrote: > > On Tue, Apr 23, 2024 at 02:48:32PM +0200, Martin Pieuchot wrote: > > > [...] > > > I agree. Now I'd be very grateful if someone could dig into WITNESS to > > > figure out why we see such reports. Are these false positives or are we > > > missing data from the code paths that we think are incorrect? > > > > WITNESS currently cannot show lock cycles longer than two locks. > > To fix this, WITNESS needs to do a path search in the lock order graph. > > Lovely! > > > However, there is also something else wrong in WITNESS, possibly > > related to situations where the kernel lock comes between two rwlocks > > in the lock order. I still need to study this more. > > I greatly appreciate your dedication in this area. > > > Below is a patch that adds the cycle search and printing. The patch > > also tweaks a few prints to show more context. > > This is ok mpi@ > > > With the patch, the nfsnode-vmmaplk reversal looks like this: > > So the issue here is due to NFS entering the network stack after the > VFS. Alexander, Vitaly, are we far from a NET_LOCK()-free sosend()? > Is it something we should consider? > > On the other hand, would it make sense to have a NET_LOCK()-free > sysctl path?
I don't think it is necessary to make sysctl calls NET_LOCK()-free, but it does make a lot of sense to avoid calling copyin() and copyout() while holding locks as much as possible. For the network sysctl, that means that instead of something like: NET_LOCK(); ... sysctl_int_bounded(oldp, oldlenp, newp, newlen, ...); /* copyin/copyout */ ... NET_UNLOCK(); we really should be doing something like: sysctl_int_bounded(NULL, 0, newp, newlen, ...); /* copyin */ NET_LOCK(); ... NET_UNLOCK(); sysctl_int_bounded(oldp, oldlenp, NULL, 0, ...); /* copyout */ > > witness: lock order reversal: > > 1st 0xfffffd8126deacf8 vmmaplk (&map->lock) > > 2nd 0xffff800039831948 nfsnode (&np->n_lock) > > lock order [1] vmmaplk (&map->lock) -> [2] nfsnode (&np->n_lock) > > #0 rw_enter+0x6d > > #1 rrw_enter+0x5e > > #2 VOP_LOCK+0x5f > > #3 vn_lock+0xbc > > #4 vn_rdwr+0x83 > > #5 vndstrategy+0x2ca > > #6 physio+0x204 > > #7 spec_write+0x9e > > #8 VOP_WRITE+0x6e > > #9 vn_write+0x100 > > #10 dofilewritev+0x143 > > #11 sys_pwrite+0x60 > > #12 syscall+0x588 > > #13 Xsyscall+0x128 > > lock order [2] nfsnode (&np->n_lock) -> [3] netlock (netlock) > > #0 rw_enter_read+0x50 > > #1 solock_shared+0x3a > > #2 sosend+0x10c > > #3 nfs_send+0x8d > > #4 nfs_request+0x258 > > #5 nfs_getattr+0xcb > > #6 VOP_GETATTR+0x55 > > #7 mountnfs+0x37c > > #8 nfs_mount+0x125 > > #9 sys_mount+0x343 > > #10 syscall+0x561 > > #11 Xsyscall+0x128 > > lock order [3] netlock (netlock) -> [1] vmmaplk (&map->lock) > > #0 rw_enter_read+0x50 > > #1 uvmfault_lookup+0x8a > > #2 uvm_fault_check+0x36 > > #3 uvm_fault+0xfb > > #4 kpageflttrap+0x158 > > #5 kerntrap+0x94 > > #6 alltraps_kern_meltdown+0x7b > > #7 _copyin+0x62 > > #8 sysctl_bounded_arr+0x83 > > #9 tcp_sysctl+0x546 > > #10 sys_sysctl+0x17b > > #11 syscall+0x561 > > #12 Xsyscall+0x128 > > > > > > Index: kern/subr_witness.c > > =================================================================== > > RCS file: src/sys/kern/subr_witness.c,v > > retrieving revision 1.50 > > diff 
-u -p -r1.50 subr_witness.c > > --- kern/subr_witness.c 30 May 2023 08:30:01 -0000 1.50 > > +++ kern/subr_witness.c 27 Apr 2024 13:08:43 -0000 > > @@ -369,6 +369,13 @@ static struct witness_lock_order_data *w > > struct witness *child); > > static void witness_list_lock(struct lock_instance *instance, > > int (*prnt)(const char *fmt, ...)); > > +static void witness_print_cycle(int(*prnt)(const char *fmt, ...), > > + struct witness *parent, struct witness *child); > > +static void witness_print_cycle_edge(int(*prnt)(const char *fmt, > > ...), > > + struct witness *parent, struct witness *child, > > + int step, int last); > > +static int witness_search(struct witness *w, struct witness *target, > > + struct witness **path, int depth, int *remaining); > > static void witness_setflag(struct lock_object *lock, int flag, int > > set); > > > > /* > > @@ -652,8 +659,9 @@ witness_ddb_display_descendants(int(*prn > > > > for (i = 0; i < indent; i++) > > prnt(" "); > > - prnt("%s (type: %s, depth: %d)", > > - w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level); > > + prnt("%s (%s) (type: %s, depth: %d)", > > + w->w_subtype, w->w_type->lt_name, > > + w->w_class->lc_name, w->w_ddb_level); > > if (w->w_displayed) { > > prnt(" -- (already displayed)\n"); > > return; > > @@ -719,7 +727,8 @@ witness_ddb_display(int(*prnt)(const cha > > SLIST_FOREACH(w, &w_all, w_list) { > > if (w->w_acquired) > > continue; > > - prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name, > > + prnt("%s (%s) (type: %s, depth: %d)\n", > > + w->w_subtype, w->w_type->lt_name, > > w->w_class->lc_name, w->w_ddb_level); > > } > > } > > @@ -1066,47 +1075,8 @@ witness_checkorder(struct lock_object *l > > printf(" 3rd %p %s (%s)\n", lock, > > lock->lo_name, w->w_type->lt_name); > > } > > - if (witness_watch > 1) { > > - struct witness_lock_order_data *wlod1, *wlod2; > > - > > - mtx_enter(&w_mtx); > > - wlod1 = witness_lock_order_get(w, w1); > > - wlod2 = witness_lock_order_get(w1, w); > > - 
mtx_leave(&w_mtx); > > - > > - /* > > - * It is safe to access saved stack traces, > > - * w_type, and w_class without the lock. > > - * Once written, they never change. > > - */ > > - > > - if (wlod1 != NULL) { > > - printf("lock order \"%s\"(%s) -> " > > - "\"%s\"(%s) first seen at:\n", > > - w->w_type->lt_name, > > - w->w_class->lc_name, > > - w1->w_type->lt_name, > > - w1->w_class->lc_name); > > - stacktrace_print( > > - &wlod1->wlod_stack, printf); > > - } else { > > - printf("lock order data " > > - "w2 -> w1 missing\n"); > > - } > > - if (wlod2 != NULL) { > > - printf("lock order \"%s\"(%s) -> " > > - "\"%s\"(%s) first seen at:\n", > > - w1->w_type->lt_name, > > - w1->w_class->lc_name, > > - w->w_type->lt_name, > > - w->w_class->lc_name); > > - stacktrace_print( > > - &wlod2->wlod_stack, printf); > > - } else { > > - printf("lock order data " > > - "w1 -> w2 missing\n"); > > - } > > - } > > + if (witness_watch > 1) > > + witness_print_cycle(printf, w1, w); > > witness_debugger(0); > > goto out_splx; > > } > > @@ -1875,6 +1845,92 @@ witness_list_lock(struct lock_instance * > > stacktrace_print(&instance->li_stack->ls_stack, prnt); > > } > > > > +static int > > +witness_search(struct witness *w, struct witness *target, > > + struct witness **path, int depth, int *remaining) > > +{ > > + int i, any_remaining; > > + > > + if (depth == 0) { > > + *remaining = 1; > > + return (w == target); > > + } > > + > > + any_remaining = 0; > > + for (i = 1; i <= w_max_used_index; i++) { > > + if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { > > + if (witness_search(&w_data[i], target, path, depth - 1, > > + remaining)) { > > + path[depth - 1] = &w_data[i]; > > + *remaining = 1; > > + return 1; > > + } > > + if (*remaining) > > + any_remaining = 1; > > + } > > + } > > + *remaining = any_remaining; > > + return 0; > > +} > > + > > +static void > > +witness_print_cycle_edge(int(*prnt)(const char *fmt, ...), > > + struct witness *parent, struct witness *child, int step, int last) 
> > +{ > > + struct witness_lock_order_data *wlod; > > + int next; > > + > > + if (last) > > + next = 1; > > + else > > + next = step + 1; > > + prnt("lock order [%d] %s (%s) -> [%d] %s (%s)\n", > > + step, parent->w_subtype, parent->w_type->lt_name, > > + next, child->w_subtype, child->w_type->lt_name); > > + if (witness_watch > 1) { > > + mtx_enter(&w_mtx); > > + wlod = witness_lock_order_get(parent, child); > > + mtx_leave(&w_mtx); > > + > > + if (wlod != NULL) > > + stacktrace_print(&wlod->wlod_stack, printf); > > + else > > + prnt("lock order data %s -> %s is missing\n", > > + parent->w_type->lt_name, child->w_type->lt_name); > > + } > > +} > > + > > +static void > > +witness_print_cycle(int(*prnt)(const char *fmt, ...), > > + struct witness *parent, struct witness *child) > > +{ > > + struct witness *path[4]; > > + struct witness *w; > > + int depth, remaining; > > + int step = 0; > > + > > + /* > > + * Use depth-limited search to find the shortest path > > + * from child to parent. 
> > + */ > > + for (depth = 1; depth < nitems(path); depth++) { > > + if (witness_search(child, parent, path, depth, &remaining)) > > + goto found; > > + if (!remaining) > > + break; > > + } > > + prnt("witness: incomplete path, depth %d\n", depth); > > + return; > > + > > +found: > > + witness_print_cycle_edge(prnt, parent, child, ++step, 0); > > + for (w = child; depth > 0; depth--) { > > + witness_print_cycle_edge(prnt, w, path[depth - 1], ++step, > > + depth == 1); > > + w = path[depth - 1]; > > + } > > +} > > + > > #ifdef DDB > > static int > > witness_thread_has_locks(struct proc *p) > > @@ -2123,9 +2179,6 @@ db_witness_list_all(db_expr_t addr, int > > void > > witness_print_badstacks(void) > > { > > - static struct witness tmp_w1, tmp_w2; > > - static struct witness_lock_order_data tmp_data1, tmp_data2; > > - struct witness_lock_order_data *data1, *data2; > > struct witness *w1, *w2; > > int error, generation, i, j; > > > > @@ -2139,11 +2192,6 @@ witness_print_badstacks(void) > > } > > error = 0; > > > > - memset(&tmp_w1, 0, sizeof(tmp_w1)); > > - memset(&tmp_w2, 0, sizeof(tmp_w2)); > > - memset(&tmp_data1, 0, sizeof(tmp_data1)); > > - memset(&tmp_data2, 0, sizeof(tmp_data2)); > > - > > restart: > > mtx_enter(&w_mtx); > > generation = w_generation; > > @@ -2165,12 +2213,9 @@ restart: > > mtx_leave(&w_mtx); > > continue; > > } > > - > > - /* Copy w1 locally so we can release the spin lock. */ > > - tmp_w1 = *w1; > > mtx_leave(&w_mtx); > > > > - if (tmp_w1.w_reversed == 0) > > + if (w1->w_reversed == 0) > > continue; > > for (j = 1; j < w_max_used_index; j++) { > > if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) > > @@ -2187,47 +2232,13 @@ restart: > > } > > > > w2 = &w_data[j]; > > - data1 = witness_lock_order_get(w1, w2); > > - data2 = witness_lock_order_get(w2, w1); > > - > > - /* > > - * Copy information locally so we can release the > > - * spin lock. 
> > - */ > > - tmp_w2 = *w2; > > - > > - if (data1) > > - tmp_data1.wlod_stack = data1->wlod_stack; > > - if (data2 && data2 != data1) > > - tmp_data2.wlod_stack = data2->wlod_stack; > > mtx_leave(&w_mtx); > > > > db_printf("\nLock order reversal between \"%s\"(%s) " > > "and \"%s\"(%s)!\n", > > - tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name, > > - tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name); > > - if (data1) { > > - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) " > > - "first seen at:\n", > > - tmp_w1.w_type->lt_name, > > - tmp_w1.w_class->lc_name, > > - tmp_w2.w_type->lt_name, > > - tmp_w2.w_class->lc_name); > > - stacktrace_print(&tmp_data1.wlod_stack, > > - db_printf); > > - db_printf("\n"); > > - } > > - if (data2 && data2 != data1) { > > - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) " > > - "first seen at:\n", > > - tmp_w2.w_type->lt_name, > > - tmp_w2.w_class->lc_name, > > - tmp_w1.w_type->lt_name, > > - tmp_w1.w_class->lc_name); > > - stacktrace_print(&tmp_data2.wlod_stack, > > - db_printf); > > - db_printf("\n"); > > - } > > + w1->w_type->lt_name, w1->w_class->lc_name, > > + w2->w_type->lt_name, w2->w_class->lc_name); > > + witness_print_cycle(db_printf, w1, w2); > > } > > } > > mtx_enter(&w_mtx); > >