One of the remaining SMP issue with our routing table usage is to guarantee that the L2 entry referenced by a RTF_GATEWAY route via the ``rt_gwroute'' pointer wont be replaced/invalidated by another CPU while we are filling the address field of an Ethernet frame.
The most efficient way, performance wise, to do that is to make the ``rt_gwroute'' pointer immutable during the lifetime of a RTF_GATEWAY route. If we know that this pointer won't change and that the memory it points to won't be freed as long as a CPU has reference to one of the RTF_GATEWAY routes, we don't need any special protection inside arp_resolve() and nd6_resolve(). Diff below achieve that by always resolving the next-hop entry before inserting the corresponding RTF_GATEWAY route in the tree. In other words rt_setgwroute() is no longer called in the sending path. To guarantee that a cached route won't be deleted while it is still referenced a new flag, RTF_CACHED, is added to the route. arp(8) and ndp(8) treat entry with this flag like RTF_LOCAL. Removing rt_setgwroute() from the sending path means that inserting a RTF_GATEWAY route will now fail if the next-hop cannot be resolve. This might introduce regression for some setups but I see that has an improvement since the kernel no longer let you add a route that cannot be used. It also mean that stale routes need to be fixed whenever a new address is configured. The diff does that and also plug an ifa leak that was happening when the same address is configured twice on an ifa. Note that even with this diff there are *still* some MP-safeness issue due to stale routes, they will be address in a later diff. Comments, ok?
Index: sys/net/route.c =================================================================== RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.313 diff -u -p -r1.313 route.c --- sys/net/route.c 22 Jul 2016 11:03:30 -0000 1.313 +++ sys/net/route.c 11 Aug 2016 13:32:50 -0000 @@ -153,7 +153,9 @@ struct pool rtentry_pool; /* pool for r struct pool rttimer_pool; /* pool for rttimer structures */ void rt_timer_init(void); -void rt_setgwroute(struct rtentry *, u_int); +int rt_setgwroute(struct rtentry *, u_int); +void rt_putgwroute(struct rtentry *); +int rt_fixgwroute(struct rtentry *, void *, unsigned int); int rtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); int rt_if_remove_rtdelete(struct rtentry *, void *, u_int); @@ -204,21 +206,20 @@ rtisvalid(struct rtentry *rt) if (rt == NULL) return (0); -#ifdef DIAGNOSTIC - if (ISSET(rt->rt_flags, RTF_GATEWAY) && (rt->rt_gwroute != NULL) && - ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY)) - panic("next hop must be directly reachable"); -#endif - - if ((rt->rt_flags & RTF_UP) == 0) + if (!ISSET(rt->rt_flags, RTF_UP)) return (0); /* Routes attached to stale ifas should be freed. */ if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL) return (0); - if (ISSET(rt->rt_flags, RTF_GATEWAY) && !rtisvalid(rt->rt_gwroute)) - return (0); +#ifdef DIAGNOSTIC + if (ISSET(rt->rt_flags, RTF_GATEWAY)) { + KASSERT(rt->rt_gwroute != NULL); + KASSERT(ISSET(rt->rt_gwroute->rt_flags, RTF_UP)); + KASSERT(!ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY)); + } +#endif /* DIAGNOSTIC */ return (1); } @@ -267,8 +268,6 @@ rt_match(struct sockaddr *dst, uint32_t return (rt); } -struct rtentry *_rtalloc(struct sockaddr *, uint32_t *, int, unsigned int); - #ifndef SMALL_KERNEL /* * Originated from bridge_hash() in if_bridge.c @@ -349,16 +348,10 @@ rt_hash(struct rtentry *rt, struct socka struct rtentry * rtalloc_mpath(struct sockaddr *dst, uint32_t *src, unsigned int rtableid) { - return (_rtalloc(dst, src, RT_RESOLVE, rtableid)); + return (rt_match(dst, src, RT_RESOLVE, rtableid)); } #endif /* SMALL_KERNEL */ -struct rtentry * -rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid) -{ - return (_rtalloc(dst, NULL, flags, rtableid)); -} - /* * Look in the routing table for the best matching entry for * ``dst''. @@ -367,44 +360,29 @@ rtalloc(struct sockaddr *dst, int flags, * longer valid, try to cache it. */ struct rtentry * -_rtalloc(struct sockaddr *dst, uint32_t *src, int flags, unsigned int rtableid) +rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid) { - struct rtentry *rt; - - rt = rt_match(dst, src, flags, rtableid); - - /* No match or route to host? We're done. */ - if (rt == NULL || !ISSET(rt->rt_flags, RTF_GATEWAY)) - return (rt); - - /* Nothing to do if the next hop is valid. */ - if (rtisvalid(rt->rt_gwroute)) - return (rt); - - rt_setgwroute(rt, rtableid); - - return (rt); + return (rt_match(dst, NULL, flags, rtableid)); } -void +/* + * Cache the route entry corresponding to a reachable next hop in + * the gateway entry ``rt''. + */ +int rt_setgwroute(struct rtentry *rt, u_int rtableid) { struct rtentry *nhrt; - rtfree(rt->rt_gwroute); - rt->rt_gwroute = NULL; + KERNEL_ASSERT_LOCKED(); - /* - * If we cannot find a valid next hop, return the route - * with a gateway. - * - * XXX Some dragons hiding in the tree certainly depends on - * this behavior. But it is safe since rt_checkgate() wont - * allow us to us this route later on. - */ + KASSERT(ISSET(rt->rt_flags, RTF_GATEWAY)); + KASSERT(rt->rt_gwroute == NULL); + + /* If we cannot find a valid next hop bail. */ nhrt = rt_match(rt->rt_gateway, NULL, RT_RESOLVE, rtable_l2(rtableid)); if (nhrt == NULL) - return; + return (ENOENT); /* * Next hop must be reachable, this also prevents rtentry @@ -412,13 +390,13 @@ rt_setgwroute(struct rtentry *rt, u_int */ if (ISSET(nhrt->rt_flags, RTF_CLONING|RTF_GATEWAY)) { rtfree(nhrt); - return; + return (ELOOP); } - /* Next hop entry must be UP and on the same interface. */ - if (!ISSET(nhrt->rt_flags, RTF_UP) || nhrt->rt_ifidx != rt->rt_ifidx) { + /* Next hop entry must be on the same interface. */ + if (nhrt->rt_ifidx != rt->rt_ifidx) { rtfree(nhrt); - return; + return (EHOSTUNREACH); } /* @@ -429,10 +407,75 @@ rt_setgwroute(struct rtentry *rt, u_int rt->rt_mtu = nhrt->rt_mtu; /* - * Do not return the cached next-hop route, rt_checkgate() will - * do the magic for us. + * To avoid reference counting problems when writting link-layer + * addresses in an outgoing packet, we ensure that the lifetime + * of a cached entry is greater that the bigger lifetime of the + * gateway entries it is pointed by. */ + nhrt->rt_flags |= RTF_CACHED; + nhrt->rt_cachecnt++; + rt->rt_gwroute = nhrt; + + return (0); +} + +/* + * Invalidate the cached route entry of the gateway entry ``rt''. + */ +void +rt_putgwroute(struct rtentry *rt) +{ + struct rtentry *nhrt = rt->rt_gwroute; + + KERNEL_ASSERT_LOCKED(); + + if (!ISSET(rt->rt_flags, RTF_GATEWAY) || nhrt == NULL) + return; + + KASSERT(nhrt->rt_cachecnt > 0); + + --nhrt->rt_cachecnt; + if (nhrt->rt_cachecnt == 0) + nhrt->rt_flags &= ~RTF_CACHED; + + rtfree(rt->rt_gwroute); + rt->rt_gwroute = NULL; +} + +/* + * Refresh cached entries of RTF_GATEWAY routes for a given interface. + * + * This clever logic is necessary to try to fix routes linked to stale + * ifas. + */ +int +rt_fixgwroute(struct rtentry *rt, void *arg, unsigned int id) +{ + struct ifnet *ifp = arg; + + KERNEL_ASSERT_LOCKED(); + + if (rt->rt_ifidx != ifp->if_index || !ISSET(rt->rt_flags, RTF_GATEWAY)) + return (0); + + /* + * If the gateway route is not stale, its associated cached + * is also not stale. + */ + if (rt->rt_ifa->ifa_ifp != NULL) + return (0); + + /* If we can fix the cached next hop entry, we can fix the ifa. */ + if (rt_setgate(rt, rt->rt_gateway, ifp->if_rdomain) == 0) { + struct ifaddr *ifa = rt->rt_gwroute->rt_ifa; + + ifafree(rt->rt_ifa); + ifa->ifa_refcnt++; + rt->rt_ifa = ifa; + } + + return (0); } void @@ -889,8 +932,7 @@ rtrequest_delete(struct rt_addrinfo *inf if ((rt->rt_flags & RTF_CLONING) != 0) rtflushclone(tableid, rt); - rtfree(rt->rt_gwroute); - rt->rt_gwroute = NULL; + rt_putgwroute(rt); rtfree(rt->rt_parent); rt->rt_parent = NULL; @@ -1099,7 +1141,7 @@ rtrequest(int req, struct rt_addrinfo *i tableid))) { ifafree(ifa); rtfree(rt->rt_parent); - rtfree(rt->rt_gwroute); + rt_putgwroute(rt); free(rt->rt_gateway, M_RTABLE, 0); free(ndst, M_RTABLE, dlen); pool_put(&rtentry_pool, rt); @@ -1129,7 +1171,7 @@ rtrequest(int req, struct rt_addrinfo *i if (error != 0) { ifafree(ifa); rtfree(rt->rt_parent); - rtfree(rt->rt_gwroute); + rt_putgwroute(rt); free(rt->rt_gateway, M_RTABLE, 0); free(ndst, M_RTABLE, dlen); pool_put(&rtentry_pool, rt); @@ -1170,33 +1212,27 @@ rt_setgate(struct rtentry *rt, struct so } memmove(rt->rt_gateway, gate, glen); - if (ISSET(rt->rt_flags, RTF_GATEWAY)) - rt_setgwroute(rt, rtableid); + if (ISSET(rt->rt_flags, RTF_GATEWAY)) { + rt_putgwroute(rt); + return (rt_setgwroute(rt, rtableid)); + } return (0); } -int -rt_checkgate(struct rtentry *rt, struct rtentry **rtp) +/* + * Return the route entry containing the next hop link-layer + * address corresponding to ``rt''. + */ +struct rtentry * +rt_getll(struct rtentry *rt) { - struct rtentry *rt0; - - KASSERT(rt != NULL); - - rt0 = rt; - - if (rt->rt_flags & RTF_GATEWAY) { - if (rt->rt_gwroute == NULL) - return (EHOSTUNREACH); - rt = rt->rt_gwroute; + if (ISSET(rt->rt_flags, RTF_GATEWAY)) { + KASSERT(rt->rt_gwroute != NULL); + return (rt->rt_gwroute); } - if (rt->rt_flags & RTF_REJECT) - if (rt->rt_expire == 0 || time_uptime < rt->rt_expire) - return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); - - *rtp = rt; - return (0); + return (rt); } void @@ -1260,6 +1296,8 @@ rt_ifa_add(struct ifaddr *ifa, int flags error = rtrequest(RTM_ADD, &info, prio, &rt, rtableid); if (error == 0) { + unsigned int i; + /* * A local route is created for every address configured * on an interface, so use this information to notify @@ -1269,6 +1307,18 @@ rt_ifa_add(struct ifaddr *ifa, int flags rt_sendaddrmsg(rt, RTM_NEWADDR, ifa); rt_sendmsg(rt, RTM_ADD, rtableid); rtfree(rt); + + /* + * Userland inserted routes stay in the table even + * if their corresponding ``ifa'' is no longer valid. + * + * Try to fix the stale RTF_GATEWAY entries in case + * their gateway match the newly inserted route. + */ + for (i = 0; i <= RT_TABLEID_MAX; i++) { + rtable_walk(i, ifa->ifa_addr->sa_family, + rt_fixgwroute, ifp); + } } return (error); } @@ -1788,7 +1838,8 @@ rt_if_linkstate_change(struct rtentry *r * from down interfaces so we have a chance to get * new routes from a better source. */ - if (ISSET(rt->rt_flags, RTF_CLONED|RTF_DYNAMIC)) { + if (ISSET(rt->rt_flags, RTF_CLONED|RTF_DYNAMIC) && + !ISSET(rt->rt_flags, RTF_CACHED)) { int error; if ((error = rtdeletemsg(rt, ifp, id))) Index: sys/net/route.h =================================================================== RCS file: /cvs/src/sys/net/route.h,v retrieving revision 1.141 diff -u -p -r1.141 route.h --- sys/net/route.h 13 Jul 2016 08:40:46 -0000 1.141 +++ sys/net/route.h 11 Aug 2016 09:04:53 -0000 @@ -103,7 +103,12 @@ struct rtentry { struct ifaddr *rt_ifa; /* the answer: interface addr to use */ caddr_t rt_llinfo; /* pointer to link level info cache or to an MPLS structure */ - struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */ + union { + struct rtentry *_nh; /* implied entry for gatewayed routes */ + unsigned int _ref; /* # gatewayed caching this route */ + } RT_gw; +#define rt_gwroute RT_gw._nh +#define rt_cachecnt RT_gw._ref struct rtentry *rt_parent; /* If cloned, parent of this route. */ LIST_HEAD(, rttimer) rt_timer; /* queue of timeouts for misc funcs */ struct rt_kmetrics rt_rmx; /* metrics used by rx'ing protocols */ @@ -139,6 +144,7 @@ struct rtentry { #define RTF_ANNOUNCE RTF_PROTO2 /* announce L2 entry */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ #define RTF_CLONED 0x10000 /* this is a cloned route */ +#define RTF_CACHED 0x20000 /* cached by a RTF_GATEWAY entry */ #define RTF_MPATH 0x40000 /* multipath route or operation */ #define RTF_MPLS 0x100000 /* MPLS additional infos */ #define RTF_LOCAL 0x200000 /* route to a local address */ @@ -363,7 +369,7 @@ void rt_sendmsg(struct rtentry *, int, void rt_sendaddrmsg(struct rtentry *, int, struct ifaddr *); void rt_missmsg(int, struct rt_addrinfo *, int, uint8_t, u_int, int, u_int); int rt_setgate(struct rtentry *, struct sockaddr *, u_int); -int rt_checkgate(struct rtentry *, struct rtentry **); +struct rtentry *rt_getll(struct rtentry *); void rt_setmetrics(u_long, const struct rt_metrics *, struct rt_kmetrics *); void rt_getmetrics(const struct rt_kmetrics *, struct rt_metrics *); Index: sys/net/rtsock.c =================================================================== RCS file: /cvs/src/sys/net/rtsock.c,v retrieving revision 1.194 diff -u -p -r1.194 rtsock.c --- sys/net/rtsock.c 11 Jul 2016 13:06:31 -0000 1.194 +++ sys/net/rtsock.c 11 Aug 2016 13:50:57 -0000 @@ -549,7 +549,7 @@ route_output(struct mbuf *m, ...) /* Do not let userland play with kernel-only flags. */ - if ((rtm->rtm_flags & (RTF_LOCAL|RTF_BROADCAST)) != 0) { + if ((rtm->rtm_flags & (RTF_LOCAL|RTF_BROADCAST|RTF_CACHED)) != 0) { error = EINVAL; goto fail; } @@ -615,7 +615,20 @@ route_output(struct mbuf *m, ...) } break; case RTM_DELETE: - error = rtrequest(RTM_DELETE, &info, prio, &rt, tableid); + if (!rtable_exists(tableid)) { + error = EAFNOSUPPORT; + goto flush; + } + + rt = rtable_lookup(tableid, info.rti_info[RTAX_DST], + info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY], + prio); + + /* Silently ignore removing cached next hop entry. */ + if (ISSET(rt->rt_flags, RTF_CACHED)) + goto report; + + error = rtrequest(RTM_DELETE, &info, prio, NULL, tableid); if (error == 0) goto report; break; Index: sys/netinet/if_ether.c =================================================================== RCS file: /cvs/src/sys/netinet/if_ether.c,v retrieving revision 1.220 diff -u -p -r1.220 if_ether.c --- sys/netinet/if_ether.c 14 Jul 2016 14:01:40 -0000 1.220 +++ sys/netinet/if_ether.c 11 Aug 2016 09:05:05 -0000 @@ -306,7 +306,6 @@ arpresolve(struct ifnet *ifp, struct rte struct sockaddr_dl *sdl; struct rtentry *rt = NULL; char addr[INET_ADDRSTRLEN]; - int error; if (m->m_flags & M_BCAST) { /* broadcast */ memcpy(desten, etherbroadcastaddr, sizeof(etherbroadcastaddr)); @@ -317,10 +316,12 @@ arpresolve(struct ifnet *ifp, struct rte return (0); } - error = rt_checkgate(rt0, &rt); - if (error) { + rt = rt_getll(rt0); + + if (ISSET(rt->rt_flags, RTF_REJECT) && + (rt->rt_expire == 0 || time_uptime < rt->rt_expire)) { m_freem(m); - return (error); + return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); } if (!ISSET(rt->rt_flags, RTF_LLINFO)) { @@ -683,7 +684,7 @@ arptfree(struct rtentry *rt) la->la_asked = 0; } - if (!ISSET(rt->rt_flags, RTF_STATIC)) + if (!ISSET(rt->rt_flags, RTF_STATIC|RTF_CACHED)) rtdeletemsg(rt, ifp, ifp->if_rdomain); if_put(ifp); } Index: sys/netinet6/nd6.c =================================================================== RCS file: /cvs/src/sys/netinet6/nd6.c,v retrieving revision 1.188 diff -u -p -r1.188 nd6.c --- sys/netinet6/nd6.c 13 Jul 2016 08:40:46 -0000 1.188 +++ sys/netinet6/nd6.c 11 Aug 2016 09:05:29 -0000 @@ -814,7 +814,7 @@ nd6_free(struct rtentry *rt, int gc) * caches, and disable the route entry not to be used in already * cached routes. */ - if (!ISSET(rt->rt_flags, RTF_STATIC)) + if (!ISSET(rt->rt_flags, RTF_STATIC|RTF_CACHED)) rtdeletemsg(rt, ifp, ifp->if_rdomain); splx(s); @@ -1495,18 +1495,13 @@ nd6_resolve(struct ifnet *ifp, struct rt struct sockaddr_dl *sdl; struct rtentry *rt; struct llinfo_nd6 *ln = NULL; - int error; if (m->m_flags & M_MCAST) { ETHER_MAP_IPV6_MULTICAST(&satosin6(dst)->sin6_addr, desten); return (0); } - error = rt_checkgate(rt0, &rt); - if (error) { - m_freem(m); - return (error); - } + rt = rt_getll(rt0); /* * Address resolution or Neighbor Unreachability Detection Index: sys/netinet6/nd6_rtr.c =================================================================== RCS file: /cvs/src/sys/netinet6/nd6_rtr.c,v retrieving revision 1.140 diff -u -p -r1.140 nd6_rtr.c --- sys/netinet6/nd6_rtr.c 5 Jul 2016 10:17:14 -0000 1.140 +++ sys/netinet6/nd6_rtr.c 8 Aug 2016 10:33:02 -0000 @@ -1249,19 +1249,6 @@ prelist_update(struct nd_prefix *new, st goto end; /* we should just give up in this case. */ } - /* - * XXX: from the ND point of view, we can ignore a prefix - * with the on-link bit being zero. However, we need a - * prefix structure for references from autoconfigured - * addresses. Thus, we explicitly make sure that the prefix - * itself expires now. - */ - if (newpr->ndpr_raf_onlink == 0) { - newpr->ndpr_vltime = 0; - newpr->ndpr_pltime = 0; - in6_init_prefix_ltimes(newpr); - } - pr = newpr; } Index: usr.sbin/arp/arp.c =================================================================== RCS file: /cvs/src/usr.sbin/arp/arp.c,v retrieving revision 1.75 diff -u -p -r1.75 arp.c --- usr.sbin/arp/arp.c 28 May 2016 07:00:18 -0000 1.75 +++ usr.sbin/arp/arp.c 11 Aug 2016 09:06:34 -0000 @@ -417,7 +417,7 @@ tryagain: } if (sin->sin_addr.s_addr == sin_m.sin_addr.s_addr) { if (sdl->sdl_family == AF_LINK && rtm->rtm_flags & RTF_LLINFO) { - if (rtm->rtm_flags & RTF_LOCAL) + if (rtm->rtm_flags & (RTF_LOCAL|RTF_CACHED)) return (0); if (!(rtm->rtm_flags & RTF_GATEWAY)) switch (sdl->sdl_type) { Index: usr.sbin/ndp/ndp.c =================================================================== RCS file: /cvs/src/usr.sbin/ndp/ndp.c,v retrieving revision 1.75 diff -u -p -r1.75 ndp.c --- usr.sbin/ndp/ndp.c 2 Aug 2016 16:17:54 -0000 1.75 +++ usr.sbin/ndp/ndp.c 11 Aug 2016 09:07:26 -0000 @@ -514,7 +514,7 @@ delete(char *host) if (IN6_ARE_ADDR_EQUAL(&sin->sin6_addr, &sin_m.sin6_addr)) { if (sdl->sdl_family == AF_LINK && rtm->rtm_flags & RTF_LLINFO) { - if (rtm->rtm_flags & (RTF_LOCAL|RTF_BROADCAST)) + if (rtm->rtm_flags & (RTF_LOCAL|RTF_CACHED)) return (0); if (!(rtm->rtm_flags & RTF_GATEWAY)) goto delete;