> Date: Tue, 30 Apr 2024 11:08:13 +0200
> From: Martin Pieuchot <m...@openbsd.org>
> 
> On 27/04/24(Sat) 13:44, Visa Hankala wrote:
> > On Tue, Apr 23, 2024 at 02:48:32PM +0200, Martin Pieuchot wrote:
> > > [...]
> > > I agree.  Now I'd be very grateful if someone could dig into WITNESS to
> > > figure out why we see such reports.  Are these false positive or are we
> > > missing data from the code path that we think are incorrect?
> > 
> > WITNESS currently cannot show lock cycles longer than two locks.
> > To fix this, WITNESS needs to do a path search in the lock order graph.
> 
> Lovely!
> 
> > However, there is also something else wrong in WITNESS, possibly
> > related to situations where the kernel lock comes between two rwlocks
> > in the lock order. I still need to study this more.
> 
> I greatly appreciate your dedication in this area.
> 
> > Below is a patch that adds the cycle search and printing. The patch
> > also tweaks a few prints to show more context.
> 
> This is ok mpi@
> 
> > With the patch, the nfsnode-vmmaplk reversal looks like this:
> 
> So the issue here is due to NFS entering the network stack after the
> VFS.  Alexander, Vitaly are we far from a NET_LOCK()-free sosend()?
> Is something we should consider?
> 
> On the other side, would that make sense to have a NET_LOCK()-free
> sysctl path?

I don't think it is necessary to make sysctl calls NET_LOCK()-free,
but it does make a lot of sense to avoid calling copyin() and
copyout() while holding locks as much as possible.

For the network sysctl, that means that instead of something like:

  NET_LOCK();
  ...
  sysctl_int_bounded(oldp, oldlenp, newp, newlen, ...); /* copyin/copyout */
  ...
  NET_UNLOCK();

we really should be doing something like:

  sysctl_int_bounded(NULL, 0, newp, newlen, ...); /* copyin */
  NET_LOCK();
  ...
  NET_UNLOCK();
  sysctl_int_bounded(oldp, oldlenp, NULL, 0, ...); /* copyout */


> > witness: lock order reversal:
> >  1st 0xfffffd8126deacf8 vmmaplk (&map->lock)
> >  2nd 0xffff800039831948 nfsnode (&np->n_lock)
> > lock order [1] vmmaplk (&map->lock) -> [2] nfsnode (&np->n_lock)
> > #0  rw_enter+0x6d
> > #1  rrw_enter+0x5e
> > #2  VOP_LOCK+0x5f
> > #3  vn_lock+0xbc
> > #4  vn_rdwr+0x83
> > #5  vndstrategy+0x2ca
> > #6  physio+0x204
> > #7  spec_write+0x9e
> > #8  VOP_WRITE+0x6e
> > #9  vn_write+0x100
> > #10 dofilewritev+0x143
> > #11 sys_pwrite+0x60
> > #12 syscall+0x588
> > #13 Xsyscall+0x128
> > lock order [2] nfsnode (&np->n_lock) -> [3] netlock (netlock)
> > #0  rw_enter_read+0x50
> > #1  solock_shared+0x3a
> > #2  sosend+0x10c
> > #3  nfs_send+0x8d
> > #4  nfs_request+0x258
> > #5  nfs_getattr+0xcb
> > #6  VOP_GETATTR+0x55
> > #7  mountnfs+0x37c
> > #8  nfs_mount+0x125
> > #9  sys_mount+0x343
> > #10 syscall+0x561
> > #11 Xsyscall+0x128
> > lock order [3] netlock (netlock) -> [1] vmmaplk (&map->lock)
> > #0  rw_enter_read+0x50
> > #1  uvmfault_lookup+0x8a
> > #2  uvm_fault_check+0x36
> > #3  uvm_fault+0xfb
> > #4  kpageflttrap+0x158
> > #5  kerntrap+0x94
> > #6  alltraps_kern_meltdown+0x7b
> > #7  _copyin+0x62
> > #8  sysctl_bounded_arr+0x83
> > #9  tcp_sysctl+0x546
> > #10 sys_sysctl+0x17b
> > #11 syscall+0x561
> > #12 Xsyscall+0x128
> > 
> > 
> > Index: kern/subr_witness.c
> > ===================================================================
> > RCS file: src/sys/kern/subr_witness.c,v
> > retrieving revision 1.50
> > diff -u -p -r1.50 subr_witness.c
> > --- kern/subr_witness.c     30 May 2023 08:30:01 -0000      1.50
> > +++ kern/subr_witness.c     27 Apr 2024 13:08:43 -0000
> > @@ -369,6 +369,13 @@ static struct witness_lock_order_data  *w
> >                                         struct witness *child);
> >  static void        witness_list_lock(struct lock_instance *instance,
> >                 int (*prnt)(const char *fmt, ...));
> > +static void        witness_print_cycle(int(*prnt)(const char *fmt, ...),
> > +               struct witness *parent, struct witness *child);
> > +static void        witness_print_cycle_edge(int(*prnt)(const char *fmt, 
> > ...),
> > +               struct witness *parent, struct witness *child,
> > +               int step, int last);
> > +static int witness_search(struct witness *w, struct witness *target,
> > +               struct witness **path, int depth, int *remaining);
> >  static void        witness_setflag(struct lock_object *lock, int flag, int 
> > set);
> >  
> >  /*
> > @@ -652,8 +659,9 @@ witness_ddb_display_descendants(int(*prn
> >  
> >     for (i = 0; i < indent; i++)
> >             prnt(" ");
> > -   prnt("%s (type: %s, depth: %d)",
> > -        w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level);
> > +   prnt("%s (%s) (type: %s, depth: %d)",
> > +       w->w_subtype, w->w_type->lt_name,
> > +       w->w_class->lc_name, w->w_ddb_level);
> >     if (w->w_displayed) {
> >             prnt(" -- (already displayed)\n");
> >             return;
> > @@ -719,7 +727,8 @@ witness_ddb_display(int(*prnt)(const cha
> >     SLIST_FOREACH(w, &w_all, w_list) {
> >             if (w->w_acquired)
> >                     continue;
> > -           prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name,
> > +           prnt("%s (%s) (type: %s, depth: %d)\n",
> > +               w->w_subtype, w->w_type->lt_name,
> >                 w->w_class->lc_name, w->w_ddb_level);
> >     }
> >  }
> > @@ -1066,47 +1075,8 @@ witness_checkorder(struct lock_object *l
> >                             printf(" 3rd %p %s (%s)\n", lock,
> >                                 lock->lo_name, w->w_type->lt_name);
> >                     }
> > -                   if (witness_watch > 1) {
> > -                           struct witness_lock_order_data *wlod1, *wlod2;
> > -
> > -                           mtx_enter(&w_mtx);
> > -                           wlod1 = witness_lock_order_get(w, w1);
> > -                           wlod2 = witness_lock_order_get(w1, w);
> > -                           mtx_leave(&w_mtx);
> > -
> > -                           /*
> > -                            * It is safe to access saved stack traces,
> > -                            * w_type, and w_class without the lock.
> > -                            * Once written, they never change.
> > -                            */
> > -
> > -                           if (wlod1 != NULL) {
> > -                                   printf("lock order \"%s\"(%s) -> "
> > -                                       "\"%s\"(%s) first seen at:\n",
> > -                                       w->w_type->lt_name,
> > -                                       w->w_class->lc_name,
> > -                                       w1->w_type->lt_name,
> > -                                       w1->w_class->lc_name);
> > -                                   stacktrace_print(
> > -                                       &wlod1->wlod_stack, printf);
> > -                           } else {
> > -                                   printf("lock order data "
> > -                                       "w2 -> w1 missing\n");
> > -                           }
> > -                           if (wlod2 != NULL) {
> > -                                   printf("lock order \"%s\"(%s) -> "
> > -                                       "\"%s\"(%s) first seen at:\n",
> > -                                       w1->w_type->lt_name,
> > -                                       w1->w_class->lc_name,
> > -                                       w->w_type->lt_name,
> > -                                       w->w_class->lc_name);
> > -                                   stacktrace_print(
> > -                                       &wlod2->wlod_stack, printf);
> > -                           } else {
> > -                                   printf("lock order data "
> > -                                       "w1 -> w2 missing\n");
> > -                           }
> > -                   }
> > +                   if (witness_watch > 1)
> > +                           witness_print_cycle(printf, w1, w);
> >                     witness_debugger(0);
> >                     goto out_splx;
> >             }
> > @@ -1875,6 +1845,92 @@ witness_list_lock(struct lock_instance *
> >             stacktrace_print(&instance->li_stack->ls_stack, prnt);
> >  }
> >  
> > +static int
> > +witness_search(struct witness *w, struct witness *target,
> > +    struct witness **path, int depth, int *remaining)
> > +{
> > +   int i, any_remaining;
> > +
> > +   if (depth == 0) {
> > +           *remaining = 1;
> > +           return (w == target);
> > +   }
> > +
> > +   any_remaining = 0;
> > +   for (i = 1; i <= w_max_used_index; i++) {
> > +           if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
> > +                   if (witness_search(&w_data[i], target, path, depth - 1,
> > +                       remaining)) {
> > +                           path[depth - 1] = &w_data[i];
> > +                           *remaining = 1;
> > +                           return 1;
> > +                   }
> > +                   if (remaining)
> > +                           any_remaining = 1;
> > +           }
> > +   }
> > +   *remaining = any_remaining;
> > +   return 0;
> > +}
> > +
> > +static void
> > +witness_print_cycle_edge(int(*prnt)(const char *fmt, ...),
> > +    struct witness *parent, struct witness *child, int step, int last)
> > +{
> > +   struct witness_lock_order_data *wlod;
> > +   int next;
> > +
> > +   if (last)
> > +           next = 1;
> > +   else
> > +           next = step + 1;
> > +   prnt("lock order [%d] %s (%s) -> [%d] %s (%s)\n",
> > +       step, parent->w_subtype, parent->w_type->lt_name,
> > +       next, child->w_subtype, child->w_type->lt_name);
> > +   if (witness_watch > 1) {
> > +           mtx_enter(&w_mtx);
> > +           wlod = witness_lock_order_get(parent, child);
> > +           mtx_leave(&w_mtx);
> > +
> > +           if (wlod != NULL)
> > +                   stacktrace_print(&wlod->wlod_stack, printf);
> > +           else
> > +                   prnt("lock order data %p -> %p is missing\n",
> > +                       parent->w_type->lt_name, child->w_type->lt_name);
> > +   }
> > +}
> > +
> > +static void
> > +witness_print_cycle(int(*prnt)(const char *fmt, ...),
> > +    struct witness *parent, struct witness *child)
> > +{
> > +   struct witness *path[4];
> > +   struct witness *w;
> > +   int depth, remaining;
> > +   int step = 0;
> > +
> > +   /*
> > +    * Use depth-limited search to find the shortest path
> > +    * from child to parent.
> > +    */
> > +   for (depth = 1; depth < nitems(path); depth++) {
> > +           if (witness_search(child, parent, path, depth, &remaining))
> > +                   goto found;
> > +           if (!remaining)
> > +                   break;
> > +   }
> > +   prnt("witness: incomplete path, depth %d\n", depth);
> > +   return;
> > +
> > +found:
> > +   witness_print_cycle_edge(prnt, parent, child, ++step, 0);
> > +   for (w = child; depth > 0; depth--) {
> > +           witness_print_cycle_edge(prnt, w, path[depth - 1], ++step,
> > +               depth == 1);
> > +           w = path[depth - 1];
> > +   }
> > +}
> > +
> >  #ifdef DDB
> >  static int
> >  witness_thread_has_locks(struct proc *p)
> > @@ -2123,9 +2179,6 @@ db_witness_list_all(db_expr_t addr, int 
> >  void
> >  witness_print_badstacks(void)
> >  {
> > -   static struct witness tmp_w1, tmp_w2;
> > -   static struct witness_lock_order_data tmp_data1, tmp_data2;
> > -   struct witness_lock_order_data *data1, *data2;
> >     struct witness *w1, *w2;
> >     int error, generation, i, j;
> >  
> > @@ -2139,11 +2192,6 @@ witness_print_badstacks(void)
> >     }
> >     error = 0;
> >  
> > -   memset(&tmp_w1, 0, sizeof(tmp_w1));
> > -   memset(&tmp_w2, 0, sizeof(tmp_w2));
> > -   memset(&tmp_data1, 0, sizeof(tmp_data1));
> > -   memset(&tmp_data2, 0, sizeof(tmp_data2));
> > -
> >  restart:
> >     mtx_enter(&w_mtx);
> >     generation = w_generation;
> > @@ -2165,12 +2213,9 @@ restart:
> >                     mtx_leave(&w_mtx);
> >                     continue;
> >             }
> > -
> > -           /* Copy w1 locally so we can release the spin lock. */
> > -           tmp_w1 = *w1;
> >             mtx_leave(&w_mtx);
> >  
> > -           if (tmp_w1.w_reversed == 0)
> > +           if (w1->w_reversed == 0)
> >                     continue;
> >             for (j = 1; j < w_max_used_index; j++) {
> >                     if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
> > @@ -2187,47 +2232,13 @@ restart:
> >                     }
> >  
> >                     w2 = &w_data[j];
> > -                   data1 = witness_lock_order_get(w1, w2);
> > -                   data2 = witness_lock_order_get(w2, w1);
> > -
> > -                   /*
> > -                    * Copy information locally so we can release the
> > -                    * spin lock.
> > -                    */
> > -                   tmp_w2 = *w2;
> > -
> > -                   if (data1)
> > -                           tmp_data1.wlod_stack = data1->wlod_stack;
> > -                   if (data2 && data2 != data1)
> > -                           tmp_data2.wlod_stack = data2->wlod_stack;
> >                     mtx_leave(&w_mtx);
> >  
> >                     db_printf("\nLock order reversal between \"%s\"(%s) "
> >                         "and \"%s\"(%s)!\n",
> > -                       tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name,
> > -                       tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name);
> > -                   if (data1) {
> > -                           db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> > -                               "first seen at:\n",
> > -                               tmp_w1.w_type->lt_name,
> > -                               tmp_w1.w_class->lc_name,
> > -                               tmp_w2.w_type->lt_name,
> > -                               tmp_w2.w_class->lc_name);
> > -                           stacktrace_print(&tmp_data1.wlod_stack,
> > -                               db_printf);
> > -                           db_printf("\n");
> > -                   }
> > -                   if (data2 && data2 != data1) {
> > -                           db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
> > -                               "first seen at:\n",
> > -                               tmp_w2.w_type->lt_name,
> > -                               tmp_w2.w_class->lc_name,
> > -                               tmp_w1.w_type->lt_name,
> > -                               tmp_w1.w_class->lc_name);
> > -                           stacktrace_print(&tmp_data2.wlod_stack,
> > -                               db_printf);
> > -                           db_printf("\n");
> > -                   }
> > +                       w1->w_type->lt_name, w1->w_class->lc_name,
> > +                       w2->w_type->lt_name, w2->w_class->lc_name);
> > +                   witness_print_cycle(db_printf, w1, w2);
> >             }
> >     }
> >     mtx_enter(&w_mtx);
> 
> 

Reply via email to