Hi,
When forwaring in packets parallel while removing the corresponding
ARP entry in a loop like this
while :; do arp -nd 10.10.21.1; arp -nd 10.10.22.4; done
it was quite easy to crash the kernel.
The race is exposed by putting a delay() here:
Index: net/rtable.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v
retrieving revision 1.82
diff -u -p -r1.82 rtable.c
--- net/rtable.c 19 Apr 2023 17:42:47 -0000 1.82
+++ net/rtable.c 5 Jul 2023 19:59:04 -0000
@@ -489,6 +489,8 @@ rtable_match(unsigned int rtableid, stru
if (an == NULL)
goto out;
+ delay(10); /* triggers crash */
+
rt = SRPL_FIRST(&sr, &an->an_rtlist);
if (rt == NULL) {
SRPL_LEAVE(&sr);
Then expecting the struct rtentry rt_ifa is bogous and rt_refcnt
is 0. So it looks like a refcounting bug.
While an_rtlist is protected by SRP, the route entries are not. I
increased the refcount while the entry is held the an_rtlist. Then
the crash goes away.
ok?
bluhm
Index: net/rtable.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v
retrieving revision 1.82
diff -u -p -r1.82 rtable.c
--- net/rtable.c 19 Apr 2023 17:42:47 -0000 1.82
+++ net/rtable.c 5 Jul 2023 20:05:26 -0000
@@ -604,6 +604,11 @@ rtable_insert(unsigned int rtableid, str
SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
prev = art_insert(ar, an, addr, plen);
+ if (prev == an) {
+ rw_exit_write(&ar->ar_lock);
+ /* keep the refcount for rt while it is in an_rtlist */
+ return (0);
+ }
if (prev != an) {
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
rt_next);
@@ -689,9 +694,10 @@ rtable_delete(unsigned int rtableid, str
npaths++;
if (npaths > 1) {
- KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+ KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
rt_next);
+ rtfree(rt);
mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
if (npaths == 2)
@@ -703,8 +709,9 @@ rtable_delete(unsigned int rtableid, str
if (art_delete(ar, an, addr, plen) == NULL)
panic("art_delete failed to find node %p", an);
- KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+ KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
+ rtfree(rt);
art_put(an);
leave:
@@ -821,12 +828,11 @@ rtable_mpath_reprio(unsigned int rtablei
*/
rt->rt_priority = prio;
} else {
- rtref(rt); /* keep rt alive in between remove and insert */
+ KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
rt, rtentry, rt_next);
rt->rt_priority = prio;
rtable_mpath_insert(an, rt);
- rtfree(rt);
error = EAGAIN;
}
rw_exit_write(&ar->ar_lock);
@@ -839,6 +845,9 @@ rtable_mpath_insert(struct art_node *an,
{
struct rtentry *mrt, *prt = NULL;
uint8_t prio = rt->rt_priority;
+
+ /* increment the refcount for rt while it is in an_rtlist */
+ rtref(rt);
if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);