On Mon, Apr 25, 2022 at 04:01:49PM +0200, Alexander Bluhm wrote:
> On Sat, Apr 23, 2022 at 04:48:00PM +0300, kasak wrote:
> > ipsp_ids_gc(0) at ipsp_ids_gc+0xb4
> > softclock_thread(ffff800022baf260) at softclock_thread+0x13b
> 
> This code was modified between 7.0 and 7.1.  It crashes here:
> 
> /usr/src/sys/netinet/ip_ipsp.c:1266
> *     b4:       41 83 7e 64 00          cmpl   $0x0,0x64(%r14)
>       b9:       75 46                   jne    101 <ipsp_ids_gc+0x101>
>       bb:       4d 8b 3e                mov    (%r14),%r15
> /usr/src/sys/netinet/ip_ipsp.c:1269
> 
>   1257  /* free ids only from delayed timeout */
>   1258  void
>   1259  ipsp_ids_gc(void *arg)
>   1260  {
>   1261          struct ipsec_ids *ids, *tids;
>   1262  
>   1263          mtx_enter(&ipsec_flows_mtx);
>   1264  
>   1265          LIST_FOREACH_SAFE(ids, &ipsp_ids_gc_list, id_gc_list, tids) {
> * 1266                  KASSERT(ids->id_refcount == 0);
>   1267                  DPRINTF("ids %p count %d", ids, ids->id_refcount);
>   1268  
>   1269                  if ((--ids->id_gc_ttl) > 0)
>   1270                          continue;
>   1271  
>   1272                  LIST_REMOVE(ids, id_gc_list);
>   1273                  RBT_REMOVE(ipsec_ids_tree, &ipsec_ids_tree, ids);
>   1274                  RBT_REMOVE(ipsec_ids_flows, &ipsec_ids_flows, ids);
>   1275                  free(ids->id_local, M_CREDENTIALS, 0);
>   1276                  free(ids->id_remote, M_CREDENTIALS, 0);
>   1277                  free(ids, M_CREDENTIALS, 0);
>   1278          }
>   1279  
>   1280          if (!LIST_EMPTY(&ipsp_ids_gc_list))
>   1281                  timeout_add_sec(&ipsp_ids_gc_timeout, 1);
>   1282  
>   1283          mtx_leave(&ipsec_flows_mtx);
>   1284  }
> 
> I would suggest to replace the timeout garbage collector with
> reference counting.  This diff is only compile tested as I have no
> setup for IPsec ids yet.  Should work on 7.1 an -current.
> 
> bluhm
> 

Your diff is broken, it keeps `id_gc_list' as part of 'ipsec_ids' and
still removes `ids' from this non-existing list:

-
-
-       /*
-        * Add second for the case ipsp_ids_gc() is already running and
-        * awaits netlock to be released.
-        */
-       ids->id_gc_ttl = ipsec_ids_idle + 1;
-
-       if (LIST_EMPTY(&ipsp_ids_gc_list))
-               timeout_add_sec(&ipsp_ids_gc_timeout, 1);
-       LIST_INSERT_HEAD(&ipsp_ids_gc_list, ids, id_gc_list);
-
+       LIST_REMOVE(ids, id_gc_list);
                             ^
-----------------------------'
+       RBT_REMOVE(ipsec_ids_tree, &ipsec_ids_tree, ids);
+       RBT_REMOVE(ipsec_ids_flows, &ipsec_ids_flows, ids);
        mtx_leave(&ipsec_flows_mtx);
+
+       free(ids->id_local, M_CREDENTIALS, 0);
+       free(ids->id_remote, M_CREDENTIALS, 0);
+       free(ids, M_CREDENTIALS, sizeof(*ids));
 }

Also it modifies existing behaviour, we stop to re-use dead `ids' for
some period after it's removal. I don't know why we do this, may be some
one involved to this layer could explain.

I guess, this race exposes that the IPL_SOFTNET level is not enough for
`ipsec_flows_mtx' mutex(9). This mutex(9) protects multiple objects, so
could get similar panic on other places. I propose to increase the
`ipsec_flows_mtx' mutex(9) priority level to IPL_MPFLOOR.

Index: sys/netinet/ip_ipsp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.269
diff -u -p -r1.269 ip_ipsp.c
--- sys/netinet/ip_ipsp.c       10 Mar 2022 15:21:08 -0000      1.269
+++ sys/netinet/ip_ipsp.c       25 Apr 2022 14:29:42 -0000
@@ -91,7 +91,7 @@ void tdb_hashstats(void);
  *     F       ipsec_flows_mtx
  */
 
-struct mutex ipsec_flows_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
+struct mutex ipsec_flows_mtx = MUTEX_INITIALIZER(IPL_MPFLOOR);
 
 int            tdb_rehash(void);
 void           tdb_timeout(void *);

Reply via email to