I sent this diff out some time ago and would really like to get this in. This is one step on makeing rtsock.c less of a hornets nest. This reduces the side effects in route_output and simplifies some other bits as well. For example route_input is less variadic and simpler.
Anyone couragous enough to send me an OK? -- :wq Claudio Index: net/rtsock.c =================================================================== RCS file: /cvs/src/sys/net/rtsock.c,v retrieving revision 1.213 diff -u -p -r1.213 rtsock.c --- net/rtsock.c 19 Jan 2017 23:18:29 -0000 1.213 +++ net/rtsock.c 20 Jan 2017 01:37:33 -0000 @@ -92,7 +92,6 @@ struct sockaddr route_dst = { 2, PF_ROUTE, }; struct sockaddr route_src = { 2, PF_ROUTE, }; -struct sockproto route_proto = { PF_ROUTE, }; struct walkarg { int w_op, w_arg, w_given, w_needed, w_tmemsize; @@ -100,7 +99,7 @@ struct walkarg { }; int route_ctloutput(int, struct socket *, int, int, struct mbuf **); -void route_input(struct mbuf *m0, ...); +void route_input(struct mbuf *m0, unsigned short); int route_arp_conflict(struct rtentry *, struct rt_addrinfo *); int route_cleargateway(struct rtentry *, void *, unsigned int); @@ -331,7 +330,7 @@ rt_senddesync(void *data) } void -route_input(struct mbuf *m0, ...) +route_input(struct mbuf *m0, unsigned short sa_family) { struct rawcb *rp; struct routecb *rop; @@ -339,15 +338,10 @@ route_input(struct mbuf *m0, ...) struct mbuf *m = m0; int s, sockets = 0; struct socket *last = NULL; - va_list ap; - struct sockproto *proto; struct sockaddr *sosrc, *sodst; - va_start(ap, m0); - proto = va_arg(ap, struct sockproto *); - sosrc = va_arg(ap, struct sockaddr *); - sodst = va_arg(ap, struct sockaddr *); - va_end(ap); + sosrc = &route_src; + sodst = &route_src; /* ensure that we can access the rtm_type via mtod() */ if (m->m_len < offsetof(struct rt_msghdr, rtm_type) + 1) { @@ -358,10 +352,15 @@ route_input(struct mbuf *m0, ...) LIST_FOREACH(rp, &rawcb, rcb_list) { if (rp->rcb_socket->so_state & SS_CANTRCVMORE) continue; - if (rp->rcb_proto.sp_family != proto->sp_family) + if (rp->rcb_proto.sp_family != PF_ROUTE) continue; - if (rp->rcb_proto.sp_protocol && proto->sp_protocol && - rp->rcb_proto.sp_protocol != proto->sp_protocol) + /* + * If route socket is bound to an address family only send + * messages that match the address family. Address family + * agnostic messages are always send. + */ + if (rp->rcb_proto.sp_protocol != 0 && sa_family != 0 && + rp->rcb_proto.sp_protocol != sa_family) continue; /* * We assume the lower level routines have @@ -458,36 +457,99 @@ route_input(struct mbuf *m0, ...) m_freem(m); } +struct rt_msghdr * +rt_report(struct rtentry *rt, u_char type, int seq, int tableid) +{ + struct rt_msghdr *rtm; + struct rt_addrinfo info; + struct sockaddr_rtlabel sa_rl; + struct sockaddr_in6 sa_mask; +#ifdef BFD + struct sockaddr_bfd sa_bfd; +#endif +#ifdef MPLS + struct sockaddr_mpls sa_mpls; +#endif + struct ifnet *ifp = NULL; + int len; + + bzero(&info, sizeof(info)); + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask); + info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl); +#ifdef BFD + if (rt->rt_flags & RTF_BFD) + info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd); +#endif +#ifdef MPLS + if (rt->rt_flags & RTF_MPLS) { + bzero(&sa_mpls, sizeof(sa_mpls)); + sa_mpls.smpls_family = AF_MPLS; + sa_mpls.smpls_len = sizeof(sa_mpls); + sa_mpls.smpls_label = ((struct rt_mpls *) + rt->rt_llinfo)->mpls_label; + info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls; + info.rti_mpls = ((struct rt_mpls *) + rt->rt_llinfo)->mpls_operation; + } +#endif + ifp = if_get(rt->rt_ifidx); + if (ifp != NULL) { + info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl); + info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; + if (ifp->if_flags & IFF_POINTOPOINT) + info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; + } + if_put(ifp); + /* RTAX_GENMASK, RTAX_AUTHOR, RTAX_SRCMASK ignored */ + + /* build new route message */ + len = rt_msg2(type, RTM_VERSION, &info, NULL, NULL); + /* XXX why can't we wait? Should be process context... */ + rtm = malloc(len, M_RTABLE, M_NOWAIT | M_ZERO); + if (rtm == NULL) + return NULL; + + rt_msg2(type, RTM_VERSION, &info, (caddr_t)rtm, NULL); + rtm->rtm_type = type; + rtm->rtm_index = rt->rt_ifidx; + rtm->rtm_tableid = tableid; + rtm->rtm_priority = rt->rt_priority & RTP_MASK; + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_pid = curproc->p_p->ps_pid; + rtm->rtm_seq = seq; + rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); + rtm->rtm_addrs = info.rti_addrs; +#ifdef MPLS + rtm->rtm_mpls = info.rti_mpls; +#endif + return rtm; +} + int route_output(struct mbuf *m, ...) { struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; - struct rtentry *saved_nrt = NULL; struct rt_addrinfo info; - int plen, len, newgate = 0, error = 0; + int plen, len, seq, newgate = 0, error = 0; struct ifnet *ifp = NULL; struct ifaddr *ifa = NULL; struct socket *so; struct rawcb *rp = NULL; - struct sockaddr_rtlabel sa_rl; - struct sockaddr_in6 sa_mask; -#ifdef BFD - struct sockaddr_bfd sa_bfd; -#endif #ifdef MPLS - struct sockaddr_mpls sa_mpls, *psa_mpls; + struct sockaddr_mpls *psa_mpls; #endif va_list ap; u_int tableid; u_int8_t prio; - u_char vers; + u_char vers, type; va_start(ap, m); so = va_arg(ap, struct socket *); va_end(ap); - info.rti_info[RTAX_DST] = NULL; /* for error handling (goto flush) */ if (m == NULL || ((m->m_len < sizeof(int32_t)) && (m = m_pullup(m, sizeof(int32_t))) == 0)) return (ENOBUFS); @@ -554,10 +616,10 @@ route_output(struct mbuf *m, ...) if (!rtable_exists(tableid)) { if (rtm->rtm_type == RTM_ADD) { if ((error = rtable_add(tableid)) != 0) - goto flush; + goto fail; } else { error = EINVAL; - goto flush; + goto fail; } } @@ -597,7 +659,7 @@ route_output(struct mbuf *m, ...) info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX) || info.rti_info[RTAX_GENMASK] != NULL) { error = EINVAL; - goto flush; + goto fail; } #ifdef MPLS info.rti_mpls = rtm->rtm_mpls; @@ -609,6 +671,11 @@ route_output(struct mbuf *m, ...) info.rti_flags |= RTF_LLINFO; } + /* + * Do not use goto flush before this point since the message itself + * may be not consistent and could cause unexpected behaviour in other + * userland clients. + */ switch (rtm->rtm_type) { case RTM_ADD: if (info.rti_info[RTAX_GATEWAY] == NULL) { @@ -634,23 +701,14 @@ route_output(struct mbuf *m, ...) rtfree(rt); rt = NULL; - error = rtrequest(RTM_ADD, &info, prio, &saved_nrt, tableid); - if (error == 0) { + error = rtrequest(RTM_ADD, &info, prio, &rt, tableid); + if (error == 0) rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, - &saved_nrt->rt_rmx); - /* write back the priority the kernel used */ - rtm->rtm_priority = saved_nrt->rt_priority & RTP_MASK; - rtm->rtm_index = saved_nrt->rt_ifidx; - rtm->rtm_flags = saved_nrt->rt_flags; - rtfree(saved_nrt); - } + &rt->rt_rmx); + else + goto flush; break; case RTM_DELETE: - if (!rtable_exists(tableid)) { - error = EAFNOSUPPORT; - goto flush; - } - rt = rtable_lookup(tableid, info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY], prio); @@ -668,7 +726,7 @@ route_output(struct mbuf *m, ...) /* Reset the MTU of the gateway route. */ rtable_walk(tableid, rt_key(rt)->sa_family, route_cleargateway, rt); - goto report; + break; } /* @@ -678,96 +736,18 @@ route_output(struct mbuf *m, ...) if ((rt != NULL) && ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) { error = EINVAL; - goto report; + break; } rtfree(rt); rt = NULL; error = rtrequest(RTM_DELETE, &info, prio, &rt, tableid); - if (error == 0) - goto report; - break; - case RTM_GET: - if (!rtable_exists(tableid)) { - error = EAFNOSUPPORT; + if (error != 0) goto flush; - } - rt = rtable_lookup(tableid, info.rti_info[RTAX_DST], - info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY], - prio); - if (rt == NULL) { - error = ESRCH; - goto flush; - } - -report: - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = - rt_plen2mask(rt, &sa_mask); - info.rti_info[RTAX_LABEL] = - rtlabel_id2sa(rt->rt_labelid, &sa_rl); -#ifdef BFD - if (rt->rt_flags & RTF_BFD) - info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd); -#endif -#ifdef MPLS - if (rt->rt_flags & RTF_MPLS) { - bzero(&sa_mpls, sizeof(sa_mpls)); - sa_mpls.smpls_family = AF_MPLS; - sa_mpls.smpls_len = sizeof(sa_mpls); - sa_mpls.smpls_label = ((struct rt_mpls *) - rt->rt_llinfo)->mpls_label; - info.rti_info[RTAX_SRC] = - (struct sockaddr *)&sa_mpls; - info.rti_mpls = ((struct rt_mpls *) - rt->rt_llinfo)->mpls_operation; - rtm->rtm_mpls = info.rti_mpls; - } -#endif - info.rti_info[RTAX_IFP] = NULL; - info.rti_info[RTAX_IFA] = NULL; - ifp = if_get(rt->rt_ifidx); - if (ifp != NULL && rtm->rtm_addrs & (RTA_IFP|RTA_IFA)) { - info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl); - info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; - if (ifp->if_flags & IFF_POINTOPOINT) - info.rti_info[RTAX_BRD] = - rt->rt_ifa->ifa_dstaddr; - else - info.rti_info[RTAX_BRD] = NULL; - } - if_put(ifp); - len = rt_msg2(rtm->rtm_type, RTM_VERSION, &info, NULL, - NULL); - if (len > rtm->rtm_msglen) { - struct rt_msghdr *new_rtm; - new_rtm = malloc(len, M_RTABLE, M_NOWAIT); - if (new_rtm == NULL) { - error = ENOBUFS; - goto flush; - } - memcpy(new_rtm, rtm, rtm->rtm_msglen); - free(rtm, M_RTABLE, 0); - rtm = new_rtm; - } - rt_msg2(rtm->rtm_type, RTM_VERSION, &info, (caddr_t)rtm, - NULL); - rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = 0; - rtm->rtm_priority = rt->rt_priority & RTP_MASK; - rtm->rtm_index = rt->rt_ifidx; - rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); - rtm->rtm_addrs = info.rti_addrs; break; case RTM_CHANGE: case RTM_LOCK: - if (!rtable_exists(tableid)) { - error = EAFNOSUPPORT; - goto flush; - } - rt = rtable_lookup(tableid, info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY], prio); @@ -916,9 +896,6 @@ change: rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, &rt->rt_rmx); - rtm->rtm_index = rt->rt_ifidx; - rtm->rtm_priority = rt->rt_priority & RTP_MASK; - rtm->rtm_flags = rt->rt_flags; ifp = if_get(rt->rt_ifidx); KASSERT(ifp != NULL); @@ -938,13 +915,45 @@ change: rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); rt->rt_rmx.rmx_locks |= (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); - rtm->rtm_priority = rt->rt_priority & RTP_MASK; break; } break; + case RTM_GET: + rt = rtable_lookup(tableid, info.rti_info[RTAX_DST], + info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY], + prio); + if (rt == NULL) { + error = ESRCH; + goto flush; + } + break; + } + + /* CODE DOWN HERE NEEDS: + rt, rtm, error, so, m, tableid, sa_family of some sort + + OTHER NOTES: + - to end up here it seems all is OK, error is most probably 0 + - error cases take the flush route or in bad cases fail. + - fail does not report the message back but just fails the call. + if the message is not valid then fail should be used + - the dance in route_input and rp->rcb_proto.sp_family = 0 dance is + insane, better to pass the so we don't want to listen on to + route_input. + */ + + type = rtm->rtm_type; + seq = rtm->rtm_seq; + free(rtm, M_RTABLE, 0); + rtm = rt_report(rt, type, seq, tableid); + if (rtm == NULL) { + error = ENOBUFS; + goto fail; } flush: + if (rt) + rtfree(rt); if (rtm) { if (error) rtm->rtm_errno = error; @@ -952,16 +961,12 @@ flush: rtm->rtm_flags |= RTF_DONE; } } - if (info.rti_info[RTAX_DST]) - route_proto.sp_protocol = info.rti_info[RTAX_DST]->sa_family; - if (rt) - rtfree(rt); - /* * Check to see if we don't want our own messages. */ if (!(so->so_options & SO_USELOOPBACK)) { if (route_cb.any_count <= 1) { + /* no other listener and no loopback of messages */ fail: free(rtm, M_RTABLE, 0); m_freem(m); @@ -969,9 +974,8 @@ fail: } /* There is another listener, so construct message */ rp = sotorawcb(so); - } - if (rp) rp->rcb_proto.sp_family = 0; /* Avoid us */ + } if (rtm) { if (m_copyback(m, 0, rtm->rtm_msglen, rtm, M_NOWAIT)) { m_freem(m); @@ -981,9 +985,10 @@ fail: free(rtm, M_RTABLE, 0); } if (m) - route_input(m, &route_proto, &route_src, &route_dst); + route_input(m, info.rti_info[RTAX_DST] ? + info.rti_info[RTAX_DST]->sa_family : 0); if (rp) - rp->rcb_proto.sp_family = PF_ROUTE; + rp->rcb_proto.sp_family = PF_ROUTE; /* Readd us */ return (error); } @@ -1055,7 +1060,6 @@ rt_setmetrics(u_long which, const struct out->rmx_expire = expire; } - /* RTV_PRIORITY handled before */ } void @@ -1257,11 +1261,7 @@ rt_missmsg(int type, struct rt_addrinfo rtm->rtm_tableid = tableid; rtm->rtm_addrs = rtinfo->rti_addrs; rtm->rtm_index = ifidx; - if (sa == NULL) - route_proto.sp_protocol = 0; - else - route_proto.sp_protocol = sa->sa_family; - route_input(m, &route_proto, &route_src, &route_dst); + route_input(m, sa ? sa->sa_family : 0); } /* @@ -1286,8 +1286,7 @@ rt_ifmsg(struct ifnet *ifp) ifm->ifm_xflags = ifp->if_xflags; ifm->ifm_data = ifp->if_data; ifm->ifm_addrs = 0; - route_proto.sp_protocol = 0; - route_input(m, &route_proto, &route_src, &route_dst); + route_input(m, 0); } /* @@ -1323,11 +1322,7 @@ rt_sendaddrmsg(struct rtentry *rt, int c ifam->ifam_addrs = info.rti_addrs; ifam->ifam_tableid = ifp->if_rdomain; - if (ifa->ifa_addr == NULL) - route_proto.sp_protocol = 0; - else - route_proto.sp_protocol = ifa->ifa_addr->sa_family; - route_input(m, &route_proto, &route_src, &route_dst); + route_input(m, ifa->ifa_addr ? ifa->ifa_addr->sa_family : 0); } /* @@ -1349,8 +1344,7 @@ rt_ifannouncemsg(struct ifnet *ifp, int ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; - route_proto.sp_protocol = 0; - route_input(m, &route_proto, &route_src, &route_dst); + route_input(m, 0); } #ifdef BFD @@ -1381,8 +1375,7 @@ rt_bfdmsg(struct bfd_config *bfd) bfd2sa(bfd->bc_rt, &sa_bfd); memcpy(&bfdm->bm_sa, &sa_bfd, sizeof(sa_bfd)); - route_proto.sp_protocol = info.rti_info[RTAX_DST]->sa_family; - route_input(m, &route_proto, &route_src, &route_dst); + route_input(m, info.rti_info[RTAX_DST]->sa_family); } #endif /* BFD */ @@ -1647,7 +1640,7 @@ extern struct domain routedomain; /* or struct protosw routesw[] = { { SOCK_RAW, &routedomain, 0, PR_ATOMIC|PR_ADDR|PR_WANTRCVD, - route_input, route_output, 0, route_ctloutput, + 0, route_output, 0, route_ctloutput, route_usrreq, raw_init, 0, 0, 0, sysctl_rtable,