On 25/08/15(Tue) 12:27, Martin Pieuchot wrote: > On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > > I'm currently working on the routing table interface to make is safe > > to use by multiple CPUs at the same time. The diff below is a big > > step in this direction and I'd really appreciate if people could test > > it with their usual network setup and report back. > > Updated version to match recent changes. I'm still looking for test > reports and reviews.
Thanks to all the testers. Here's a third version that should fix a regression reported by phessler@. The idea is to use rtvalid(9) to check early if the gateway route is still valid or not. If it isn't then call rtalloc(9) again to perform the ARP/NDP resolution. This diff includes rtvalid(9) and makes use of it in ip{,6}_output(), more conversions will be done afterward, but these should hopefully be all the necessary bits to fix the regression. Tests & reviews are still welcome! Index: net/route.c =================================================================== RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.226 diff -u -p -r1.226 route.c --- net/route.c 30 Aug 2015 10:39:16 -0000 1.226 +++ net/route.c 31 Aug 2015 09:33:27 -0000 @@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int); int rtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); int rt_if_remove_rtdelete(struct rtentry *, void *, u_int); +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); @@ -300,19 +301,68 @@ rtable_exists(u_int id) /* verify table return (1); } + +/* + * Do the actual checks for rtvalid(9), do not use directly! + */ +static inline int +rt_is_valid(struct rtentry *rt) +{ + if (rt == NULL) + return (0); + + if ((rt->rt_flags & RTF_UP) == 0) + return (0); + + /* Routes attached to stall ifas should be freed. */ + if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL) + return (0); + + return (1); +} + +/* + * Returns 1 if the cached ``rt'' entry is still valid, 0 otherwise. + */ +int +rtvalid(struct rtentry *rt) +{ + if (rt_is_valid(rt) == 0) + return (0); + + /* Next hop must also be valid. */ + if ((rt->rt_flags & RTF_GATEWAY) && rt_is_valid(rt->rt_gwroute) == 0) + return (0); + + return (1); +} + +/* + * Do the actual lookup for rtalloc(9), do not use directly! + * + * Return the best matching entry for the destination ``dst''. + * + * "RT_RESOLVE" means that a corresponding L2 entry should + * be added to the routing table and resolved (via ARP or + * NDP), if it does not exist. + * + * "RT_REPORT" indicates that a message should be sent to + * userland if no matching route has been found or if an + * error occured while adding a L2 entry. + */ struct rtentry * -rtalloc(struct sockaddr *dst, int flags, unsigned int tableid) +rt_match(struct sockaddr *dst, int flags, unsigned int tableid) { struct rtentry *rt; struct rtentry *newrt = NULL; struct rt_addrinfo info; - int s, error = 0, msgtype = RTM_MISS; + int s, error = 0; - s = splsoftnet(); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; + s = splsoftnet(); rt = rtable_match(tableid, dst); if (rt != NULL) { newrt = rt; @@ -325,10 +375,6 @@ rtalloc(struct sockaddr *dst, int flags, goto miss; } rt = newrt; - if (rt->rt_flags & RTF_XRESOLVE) { - msgtype = RTM_RESOLVE; - goto miss; - } /* Inform listeners of the new route */ rt_sendmsg(rt, RTM_ADD, tableid); } else @@ -336,11 +382,8 @@ rtalloc(struct sockaddr *dst, int flags, } else { rtstat.rts_unreach++; miss: - if (ISSET(flags, RT_REPORT)) { - bzero((caddr_t)&info, sizeof(info)); - info.rti_info[RTAX_DST] = dst; - rt_missmsg(msgtype, &info, 0, NULL, error, tableid); - } + if (ISSET(flags, RT_REPORT)) + rt_missmsg(RTM_MISS, &info, 0, NULL, error, tableid); } splx(s); return (newrt); @@ -374,6 +417,75 @@ rtalloc_mpath(struct sockaddr *dst, uint } #endif /* SMALL_KERNEL */ +/* + * Look in the routing table for the best matching entry for + * ``dst''. + * + * If a route with a gateway is found and its next hop is no + * longer valid, try to cache it. + */ +struct rtentry * +rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid) +{ + struct rtentry *rt, *nhrt; + + rt = rt_match(dst, flags, rtableid); + + /* No match or route to host? We're done. */ + if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) == 0) + return (rt); + + nhrt = rt->rt_gwroute; + + /* Nothing to do if the next hop is valid. */ + if (nhrt != NULL && (nhrt->rt_flags & RTF_UP)) + return (rt); + + rtfree(rt->rt_gwroute); + rt->rt_gwroute = NULL; + + /* + * If we cannot find a valid next hop, return the route + * with a gateway. + * + * Some dragons hiding in the tree certainly depends on + * this behavior. + */ + nhrt = rt_match(rt->rt_gateway, flags | RT_RESOLVE, rtableid); + if (nhrt == NULL) + return (rt); + + /* + * Next hop must be reachable, this also prevents rtentry + * loops for example when rt->rt_gwroute points to rt. + */ + if ((nhrt->rt_flags & (RTF_UP|RTF_CLONING|RTF_GATEWAY)) != RTF_UP) { + rtfree(nhrt); + return (rt); + } + + /* + * Next hop entry MUST be on the same interface. + * + * This check is needed as long as we support stall + * ``ifa'' pointers. + */ + if (nhrt->rt_ifp != rt->rt_ifp) { + rtfree(nhrt); + return (rt); + } + + /* + * If the MTU of next hop is 0, this will reset the MTU of the + * route to run PMTUD again from scratch. + */ + if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu)) + rt->rt_mtu = nhrt->rt_mtu; + + rt->rt_gwroute = nhrt; + return (rt); +} + void rtfree(struct rtentry *rt) { @@ -527,7 +639,7 @@ create: rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &rtstat.rts_newgateway; - rt_setgate(rt, gateway, rdomain); + rt_setgate(rt, gateway); } } else error = EHOSTUNREACH; @@ -659,19 +771,13 @@ ifa_ifwithroute(int flags, struct sockad } if (ifa == NULL) { struct rtentry *rt = rtalloc(gateway, 0, rtableid); - if (rt == NULL) - return (NULL); /* The gateway must be local if the same address family. */ - if ((rt->rt_flags & RTF_GATEWAY) && - rt_key(rt)->sa_family == dst->sa_family) { + if (rtvalid(rt) == 0 || ((rt->rt_flags & RTF_GATEWAY) && + rt_key(rt)->sa_family == dst->sa_family)) { rtfree(rt); return (NULL); } ifa = rt->rt_ifa; - if (ifa == NULL || ifa->ifa_ifp == NULL) { - rtfree(rt); - return (NULL); - } rtfree(rt); } if (ifa->ifa_addr->sa_family != dst->sa_family) { @@ -982,8 +1088,7 @@ rtrequest1(int req, struct rt_addrinfo * * the routing table because the radix MPATH code use * it to (re)order routes. */ - if ((error = rt_setgate(rt, info->rti_info[RTAX_GATEWAY], - tableid))) { + if ((error = rt_setgate(rt, info->rti_info[RTAX_GATEWAY]))) { free(ndst, M_RTABLE, dlen); pool_put(&rtentry_pool, rt); return (error); @@ -1034,7 +1139,7 @@ rtrequest1(int req, struct rt_addrinfo * } int -rt_setgate(struct rtentry *rt, struct sockaddr *gate, unsigned int tableid) +rt_setgate(struct rtentry *rt, struct sockaddr *gate) { int glen = ROUNDUP(gate->sa_len); struct sockaddr *sa; @@ -1052,22 +1157,7 @@ rt_setgate(struct rtentry *rt, struct so rtfree(rt->rt_gwroute); rt->rt_gwroute = NULL; } - if (rt->rt_flags & RTF_GATEWAY) { - /* XXX is this actually valid to cross tables here? */ - rt->rt_gwroute = rtalloc(gate, RT_REPORT|RT_RESOLVE, tableid); - /* - * If we switched gateways, grab the MTU from the new - * gateway route if the current MTU is 0 or greater - * than the MTU of gateway. - * Note that, if the MTU of gateway is 0, we will reset the - * MTU of the route to run PMTUD again from scratch. XXX - */ - if (rt->rt_gwroute && !(rt->rt_rmx.rmx_locks & RTV_MTU) && - rt->rt_rmx.rmx_mtu && - rt->rt_rmx.rmx_mtu > rt->rt_gwroute->rt_rmx.rmx_mtu) { - rt->rt_rmx.rmx_mtu = rt->rt_gwroute->rt_rmx.rmx_mtu; - } - } + return (0); } @@ -1079,28 +1169,21 @@ rt_checkgate(struct ifnet *ifp, struct r KASSERT(rt != NULL); - if ((rt->rt_flags & RTF_UP) == 0) { - rt = rtalloc(dst, RT_REPORT|RT_RESOLVE, rtableid); - if (rt == NULL) - return (EHOSTUNREACH); - rt->rt_refcnt--; - if (rt->rt_ifp != ifp) - return (EHOSTUNREACH); - } + if ((rt->rt_flags & RTF_UP) == 0) + return (EHOSTUNREACH); rt0 = rt; if (rt->rt_flags & RTF_GATEWAY) { - if (rt->rt_gwroute && !(rt->rt_gwroute->rt_flags & RTF_UP)) { + if (rt->rt_gwroute == NULL) + return (EHOSTUNREACH); + + if ((rt->rt_gwroute->rt_flags & RTF_UP) == 0) { rtfree(rt->rt_gwroute); rt->rt_gwroute = NULL; + return (EHOSTUNREACH); } - if (rt->rt_gwroute == NULL) { - rt->rt_gwroute = rtalloc(rt->rt_gateway, - RT_REPORT|RT_RESOLVE, rtableid); - if (rt->rt_gwroute == NULL) - return (EHOSTUNREACH); - } + /* * Next hop must be reachable, this also prevents rtentry * loops, for example when rt->rt_gwroute points to rt. Index: net/route.h =================================================================== RCS file: /cvs/src/sys/net/route.h,v retrieving revision 1.110 diff -u -p -r1.110 route.h --- net/route.h 20 Aug 2015 12:39:43 -0000 1.110 +++ net/route.h 31 Aug 2015 09:05:07 -0000 @@ -118,6 +118,8 @@ struct rtentry { }; #define rt_use rt_rmx.rmx_pksent #define rt_expire rt_rmx.rmx_expire +#define rt_locks rt_rmx.rmx_locks +#define rt_mtu rt_rmx.rmx_mtu #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ @@ -361,7 +363,7 @@ void rt_sendmsg(struct rtentry *, int, void rt_sendaddrmsg(struct rtentry *, int); void rt_missmsg(int, struct rt_addrinfo *, int, struct ifnet *, int, u_int); -int rt_setgate(struct rtentry *, struct sockaddr *, unsigned int); +int rt_setgate(struct rtentry *, struct sockaddr *); int rt_checkgate(struct ifnet *, struct rtentry *, struct sockaddr *, unsigned int, struct rtentry **); void rt_setmetrics(u_long, struct rt_metrics *, struct rt_kmetrics *); @@ -377,6 +379,7 @@ void rt_timer_queue_destroy(struct rt unsigned long rt_timer_queue_count(struct rttimer_queue *); void rt_timer_timer(void *); +int rtvalid(struct rtentry *); #ifdef SMALL_KERNEL #define rtalloc_mpath(dst, s, rid) rtalloc((dst), RT_REPORT|RT_RESOLVE, (rid)) #else Index: net/rtsock.c =================================================================== RCS file: /cvs/src/sys/net/rtsock.c,v retrieving revision 1.169 diff -u -p -r1.169 rtsock.c --- net/rtsock.c 24 Aug 2015 22:11:33 -0000 1.169 +++ net/rtsock.c 31 Aug 2015 09:05:07 -0000 @@ -744,9 +744,8 @@ report: info.rti_info[RTAX_GATEWAY]->sa_len)) { newgate = 1; } - if (info.rti_info[RTAX_GATEWAY] != NULL && - (error = rt_setgate(rt, info.rti_info[RTAX_GATEWAY], - tableid))) + if (info.rti_info[RTAX_GATEWAY] != NULL && (error = + rt_setgate(rt, info.rti_info[RTAX_GATEWAY]))) goto flush; /* * new gateway could require new ifaddr, ifp; Index: netinet/ip_output.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_output.c,v retrieving revision 1.287 diff -u -p -r1.287 ip_output.c --- netinet/ip_output.c 31 Aug 2015 07:17:12 -0000 1.287 +++ netinet/ip_output.c 31 Aug 2015 09:27:51 -0000 @@ -168,12 +168,13 @@ ip_output(struct mbuf *m0, struct mbuf * dst = satosin(&ro->ro_dst); /* - * If there is a cached route, check that it is to the same - * destination and is still up. If not, free it and try again. + * If there is a cached route, check that it is to the + * same destination and is still valid. If not, free + * it and try again. */ - if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + if (rtvalid(ro->ro_rt) == 0 || dst->sin_addr.s_addr != ip->ip_dst.s_addr || - ro->ro_tableid != m->m_pkthdr.ph_rtableid)) { + ro->ro_tableid != m->m_pkthdr.ph_rtableid) { rtfree(ro->ro_rt); ro->ro_rt = NULL; } @@ -296,12 +297,13 @@ reroute: dst = satosin(&ro->ro_dst); /* - * If there is a cached route, check that it is to the same - * destination and is still up. If not, free it and try again. + * If there is a cached route, check that it is to the + * same destination and is still valid. If not, free + * it and try again. */ - if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + if (rtvalid(ro->ro_rt) == 0 || dst->sin_addr.s_addr != ip->ip_dst.s_addr || - ro->ro_tableid != m->m_pkthdr.ph_rtableid)) { + ro->ro_tableid != m->m_pkthdr.ph_rtableid) { rtfree(ro->ro_rt); ro->ro_rt = NULL; } Index: netinet6/in6_src.c =================================================================== RCS file: /cvs/src/sys/netinet6/in6_src.c,v retrieving revision 1.51 diff -u -p -r1.51 in6_src.c --- netinet6/in6_src.c 8 Jun 2015 22:19:28 -0000 1.51 +++ netinet6/in6_src.c 31 Aug 2015 09:41:20 -0000 @@ -247,13 +247,12 @@ in6_selectsrc(struct in6_addr **in6src, * our src addr is taken from the i/f, else punt. */ if (ro) { - if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, dst))) { + if (rtvalid(ro->ro_rt) == 0 || + !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, dst)) { rtfree(ro->ro_rt); ro->ro_rt = NULL; } - if (ro->ro_rt == (struct rtentry *)0 || - ro->ro_rt->rt_ifp == (struct ifnet *)0) { + if (ro->ro_rt == NULL) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ @@ -419,10 +418,9 @@ selectroute(struct sockaddr_in6 *dstsock * cached destination, in case of sharing the cache with IPv4. */ if (ro) { - if (ro->ro_rt && - (!(ro->ro_rt->rt_flags & RTF_UP) || + if (rtvalid(ro->ro_rt) == 0 || sin6tosa(&ro->ro_dst)->sa_family != AF_INET6 || - !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, dst))) { + !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, dst)) { rtfree(ro->ro_rt); ro->ro_rt = NULL; }