On Fri, Apr 28, 2023 at 02:13:15PM +0200, Alexander Bluhm wrote:
> After running stress test successfully with this diff, next day
> machine crashed while compiling a new kernel. It is unclear whether
> it is related to the diff. The softdep in ps is probably processing
> make output via ssh. Looks like recursive kernel stack overflow.
>
> [-- MARK -- Fri Apr 28 13:25:00 2023]
> kernel: protection fault trap, code=0
> Stopped at rt_if_linkstate_change+0x21: movl 0x58(%rdi),%eax
> ddb{3}>
>
> 8a0) at art_table_walk+0x26c
> art_table_walk(ffff800000390900,fffffd8746decdc0,ffffffff813bde40,ffff8000247c98a0) at art_table_walk+0x26c
> art_table_walk(ffff800000390900,fffffd8746decea0,ffffffff813bde40,ffff8000247c98a0) at art_table_walk+0x26c
> art_walk(ffff800000390900,ffffffff813bde40,ffff8000247c98a0) at art_walk+0xd1
> rtable_walk(0,18,ffff8000247c9938,ffffffff813c2b70,ffff800000784050) at rtable_walk+0xa4
> art_walk(ffff800000390900,ffffffff813bde40,ffff8000247c98a0) at art_walk+0xd1
> rtable_walk(0,18,ffff8000247c9938,ffffffff813c2b70,ffff800000784050) at
> rtable_
This is not rtfree() related. I was surprised, but the reference counters
used by art tables are not MP safe. I propose to use the refcnt API.
Index: sys/net/art.c
===================================================================
RCS file: /cvs/src/sys/net/art.c,v
retrieving revision 1.29
diff -u -p -r1.29 art.c
--- sys/net/art.c 12 Nov 2020 15:25:28 -0000 1.29
+++ sys/net/art.c 28 Apr 2023 12:53:28 -0000
@@ -535,7 +535,7 @@ art_table_delete(struct art_root *ar, st
struct art_table *
art_table_ref(struct art_root *ar, struct art_table *at)
{
- at->at_refcnt++;
+ refcnt_take(&at->at_refcnt);
return (at);
}
@@ -545,7 +545,7 @@ art_table_rele(struct art_table *at)
if (at == NULL)
return (0);
- return (--at->at_refcnt == 0);
+ return (refcnt_rele(&at->at_refcnt) != 0);
}
int
@@ -729,7 +729,7 @@ art_table_get(struct art_root *ar, struc
at->at_level = lvl;
at->at_bits = ar->ar_bits[lvl];
at->at_heap = at_heap;
- at->at_refcnt = 0;
+ refcnt_init(&at->at_refcnt);
if (parent != NULL) {
node = srp_get_locked(&parent->at_heap[j].node);
@@ -754,13 +754,13 @@ art_table_put(struct art_root *ar, struc
struct art_node *node;
uint32_t j = at->at_index;
- KASSERT(at->at_refcnt == 0);
+ KASSERT(refcnt_read(&at->at_refcnt) == 0);
KASSERT(j != 0 && j != 1);
if (parent != NULL) {
KASSERT(j != -1);
KASSERT(at->at_level == parent->at_level + 1);
- KASSERT(parent->at_refcnt >= 1);
+ KASSERT(refcnt_read(&parent->at_refcnt) >= 1);
/* Give the route back to its parent. */
node = srp_get_locked(&at->at_default);
Index: sys/net/art.h
===================================================================
RCS file: /cvs/src/sys/net/art.h,v
retrieving revision 1.23
diff -u -p -r1.23 art.h
--- sys/net/art.h 19 Apr 2023 17:42:47 -0000 1.23
+++ sys/net/art.h 28 Apr 2023 12:53:28 -0000
@@ -21,6 +21,7 @@
#include <sys/rwlock.h>
#include <sys/srp.h>
+#include <sys/refcnt.h>
#define ART_MAXLVL 32 /* We currently use 32 levels for IPv6. */
@@ -66,10 +67,11 @@ struct art_table {
*/
union {
struct srp node;
- unsigned long count;
+ struct refcnt refcnt;
} *at_heap; /* Array of 2^(slen+1) items */
};
-#define at_refcnt at_heap[0].count/* Refcounter (1 per different
route) */
+#define at_refcnt at_heap[0].refcnt /* Refcounter (1 per
+ different route) */
#define at_default at_heap[1].node /* Default route (was in parent
heap) */
/* Heap size for an ART table of stride length ``slen''. */