On Thu, Mar 04, 2021 at 03:36:19PM +1000, David Gwynne wrote: > as the subject says, this is a rewrite of vxlan(4). > > vxlan(4) relies on bridge(4) to implement learning, but i want to be > able to remove bridge(4) one day. while working on veb(4), i wrote > the guts of a learning bridge implementation that is now used by veb(4), > bpe(4), and nvgre(4). that learning bridge code is now also used by > vxlan(4). > > this means that a few of the modes that the manpage talks about are > different now. because vxlan doesn't need a bridge for learning, there's > no "multicast mode" anymore, it just does "dynamic mode" out of the box > when configured with a multicast destination address. there's no > multipoint mode now either. > > another thing that's always bothered me about vxlan(4) is how it occupies > the "udp namespace" and how it steals packets from the udp stack. > the new code actually creates and binds udp sockets to handle the > vxlan packets. this means userland can't collide with a vxlan interface, > and you get to see that the port is in use in things like netstat. 
e.g.: > > dlg@ikkaku ~$ ifconfig vxlan0 > vxlan0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> mtu 1500 > lladdr fe:e1:ba:d1:17:2a > index 11 llprio 3 > encap: vnetid none parent aggr0 txprio 0 rxprio outer > groups: vxlan > tunnel: inet 192.0.2.36 port 4789 --> 239.0.0.1 ttl 1 nodf > Addresses (max cache: 100, timeout: 240): > inet 100.64.1.36 netmask 0xffffff00 broadcast 100.64.1.255 > dlg@ikkaku ~$ netstat -na -f inet -p udp > Active Internet connections (including servers) > Proto Recv-Q Send-Q Local Address Foreign Address > udp 0 0 130.102.96.36.29742 129.250.35.250.123 > udp 0 0 130.102.96.36.8965 162.159.200.123.123 > udp 0 0 130.102.96.36.13189 162.159.200.1.123 > udp 0 0 130.102.96.36.46580 220.158.215.20.123 > udp 0 0 130.102.96.36.23109 103.38.121.36.123 > udp 0 0 239.0.0.1.4789 *.* > udp 0 0 192.0.2.36.4789 *.* > > i've also added loop prevention, i.e., sending an interface's vxlan > packets over itself should fail rather than panic now.
here's an updated diff with a few fixes. Index: netinet/udp_usrreq.c =================================================================== RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v retrieving revision 1.262 diff -u -p -r1.262 udp_usrreq.c --- netinet/udp_usrreq.c 22 Aug 2020 17:54:57 -0000 1.262 +++ netinet/udp_usrreq.c 5 Mar 2021 06:22:43 -0000 @@ -112,11 +112,6 @@ #include <net/pipex.h> #endif -#include "vxlan.h" -#if NVXLAN > 0 -#include <net/if_vxlan.h> -#endif - /* * UDP protocol implementation. * Per RFC 768, August, 1980. @@ -350,15 +345,6 @@ udp_input(struct mbuf **mp, int *offp, i break; #endif /* INET6 */ } - -#if NVXLAN > 0 - if (vxlan_enable > 0 && -#if NPF > 0 - !(m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) && -#endif - vxlan_lookup(m, uh, iphlen, &srcsa.sa, &dstsa.sa) != 0) - return IPPROTO_DONE; -#endif if (m->m_flags & (M_BCAST|M_MCAST)) { struct inpcb *last; Index: net/if_vxlan.c =================================================================== RCS file: /cvs/src/sys/net/if_vxlan.c,v retrieving revision 1.82 diff -u -p -r1.82 if_vxlan.c --- net/if_vxlan.c 25 Feb 2021 02:48:21 -0000 1.82 +++ net/if_vxlan.c 5 Mar 2021 06:22:43 -0000 @@ -1,7 +1,7 @@ -/* $OpenBSD: if_vxlan.c,v 1.82 2021/02/25 02:48:21 dlg Exp $ */ +/* $OpenBSD$ */ /* - * Copyright (c) 2013 Reyk Floeter <r...@openbsd.org> + * Copyright (c) 2021 David Gwynne <d...@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -17,493 +17,759 @@ */ #include "bpfilter.h" -#include "vxlan.h" -#include "vlan.h" #include "pf.h" -#include "bridge.h" #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/socket.h> -#include <sys/sockio.h> #include <sys/ioctl.h> +#include <sys/timeout.h> +#include <sys/pool.h> +#include <sys/tree.h> +#include <sys/refcnt.h> +#include <sys/smr.h> + +#include <sys/socket.h> +#include <sys/socketvar.h> #include 
<net/if.h> #include <net/if_var.h> +#include <net/if_dl.h> #include <net/if_media.h> +#include <net/if_types.h> #include <net/route.h> - -#if NBPFILTER > 0 -#include <net/bpf.h> -#endif +#include <net/rtable.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/if_ether.h> #include <netinet/ip.h> -#include <netinet/ip_var.h> #include <netinet/udp.h> -#include <netinet/udp_var.h> #include <netinet/in_pcb.h> +#include <netinet/ip_var.h> -#if NPF > 0 -#include <net/pfvar.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_var.h> #endif -#if NBRIDGE > 0 +/* for bridge stuff */ #include <net/if_bridge.h> +#include <net/if_etherbridge.h> + +#if NBPFILTER > 0 +#include <net/bpf.h> #endif -#include <net/if_vxlan.h> +/* + * The protocol. + */ + +#define VXLANMTU 1492 +#define VXLAN_PORT 4789 + +struct vxlan_header { + uint32_t vxlan_flags; +#define VXLAN_F_I (1U << 27) + uint32_t vxlan_id; +#define VXLAN_VNI_SHIFT 8 +#define VXLAN_VNI_MASK (0xffffff << VXLAN_VNI_SHIFT) +}; + +#define VXLAN_VNI_MAX 0x00ffffff +#define VXLAN_VNI_MIN 0x00000000 + +/* + * The driver. 
+ */ + +union vxlan_addr { + struct in_addr in4; + struct in6_addr in6; +}; + +struct vxlan_softc; + +struct vxlan_peer { + RBT_ENTRY(vxlan_peer) p_entry; + + unsigned int p_mask; /* do we use addr in the comparison */ + union vxlan_addr p_addr; + struct vxlan_header p_header; + + struct vxlan_softc *p_sc; +}; + +RBT_HEAD(vxlan_peers, vxlan_peer); + +struct vxlan_tep { + TAILQ_ENTRY(vxlan_tep) vt_entry; + + sa_family_t vt_af; + unsigned int vt_rdomain; + union vxlan_addr vt_addr; +#define vt_addr4 vt_addr.in4 +#define vt_addr6 vt_addr.in6 + in_port_t vt_port; + + struct socket *vt_so; + + struct mutex vt_mtx; + struct vxlan_peers vt_peers; +}; + +TAILQ_HEAD(vxlan_teps, vxlan_tep); + +enum vxlan_tunnel_mode { + VXLAN_TMODE_UNSET, + VXLAN_TMODE_P2P, /* unicast destination, no learning */ + VXLAN_TMODE_LEARNING, /* multicast destination, learning */ + VXLAN_TMODE_ENDPOINT, /* unset destination, no learning */ +}; struct vxlan_softc { struct arpcom sc_ac; - struct ifmedia sc_media; + struct etherbridge sc_eb; + + unsigned int sc_rdomain; + sa_family_t sc_af; + union vxlan_addr sc_src; + union vxlan_addr sc_dst; + in_port_t sc_port; + struct vxlan_header sc_header; + unsigned int sc_if_index0; - struct ip_moptions sc_imo; - struct task sc_atask; - struct task sc_ltask; struct task sc_dtask; + void *sc_inmulti; + + enum vxlan_tunnel_mode sc_mode; + struct vxlan_peer *sc_ucast_peer; + struct vxlan_peer *sc_mcast_peer; + struct refcnt sc_refs; - struct sockaddr_storage sc_src; - struct sockaddr_storage sc_dst; - in_port_t sc_dstport; - u_int sc_rdomain; - int64_t sc_vnetid; uint16_t sc_df; - u_int8_t sc_ttl; + int sc_ttl; int sc_txhprio; + int sc_rxhprio; - struct task sc_sendtask; - - LIST_ENTRY(vxlan_softc) sc_entry; + struct task sc_send_task; }; -void vxlanattach(int); -int vxlanioctl(struct ifnet *, u_long, caddr_t); -void vxlanstart(struct ifnet *); -int vxlan_clone_create(struct if_clone *, int); -int vxlan_clone_destroy(struct ifnet *); -void 
vxlan_multicast_cleanup(struct ifnet *); -int vxlan_multicast_join(struct ifnet *, struct sockaddr *, - struct sockaddr *); -int vxlan_media_change(struct ifnet *); -void vxlan_media_status(struct ifnet *, struct ifmediareq *); -int vxlan_config(struct ifnet *, struct sockaddr *, struct sockaddr *); -int vxlan_output(struct ifnet *, struct mbuf *); -void vxlan_addr_change(void *); -void vxlan_if_change(void *); -void vxlan_link_change(void *); -void vxlan_send_dispatch(void *); +void vxlanattach(int); + +static int vxlan_clone_create(struct if_clone *, int); +static int vxlan_clone_destroy(struct ifnet *); + +static int vxlan_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +static int vxlan_enqueue(struct ifnet *, struct mbuf *); +static void vxlan_start(struct ifqueue *); +static void vxlan_send(void *); + +static int vxlan_ioctl(struct ifnet *, u_long, caddr_t); +static int vxlan_up(struct vxlan_softc *); +static int vxlan_down(struct vxlan_softc *); +static int vxlan_addmulti(struct vxlan_softc *, struct ifnet *); +static void vxlan_delmulti(struct vxlan_softc *); + +static struct mbuf * + vxlan_input(void *, struct mbuf *, + struct ip *, struct ip6_hdr *, void *, int); + +static int vxlan_set_rdomain(struct vxlan_softc *, const struct ifreq *); +static int vxlan_get_rdomain(struct vxlan_softc *, struct ifreq *); +static int vxlan_set_tunnel(struct vxlan_softc *, + const struct if_laddrreq *); +static int vxlan_get_tunnel(struct vxlan_softc *, struct if_laddrreq *); +static int vxlan_del_tunnel(struct vxlan_softc *); +static int vxlan_set_vnetid(struct vxlan_softc *, const struct ifreq *); +static int vxlan_get_vnetid(struct vxlan_softc *, struct ifreq *); +static int vxlan_del_vnetid(struct vxlan_softc *); +static int vxlan_set_parent(struct vxlan_softc *, + const struct if_parent *); +static int vxlan_get_parent(struct vxlan_softc *, struct if_parent *); +static int vxlan_del_parent(struct vxlan_softc *); + +static int 
vxlan_add_addr(struct vxlan_softc *, const struct ifbareq *); +static int vxlan_del_addr(struct vxlan_softc *, const struct ifbareq *); -int vxlan_sockaddr_cmp(struct sockaddr *, struct sockaddr *); -uint16_t vxlan_sockaddr_port(struct sockaddr *); +static void vxlan_detach_hook(void *); -struct if_clone vxlan_cloner = +static struct if_clone vxlan_cloner = IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy); -int vxlan_enable = 0; -u_long vxlan_tagmask; +static int vxlan_eb_port_eq(void *, void *, void *); +static void *vxlan_eb_port_take(void *, void *); +static void vxlan_eb_port_rele(void *, void *); +static size_t vxlan_eb_port_ifname(void *, char *, size_t, void *); +static void vxlan_eb_port_sa(void *, struct sockaddr_storage *, void *); + +static const struct etherbridge_ops vxlan_etherbridge_ops = { + vxlan_eb_port_eq, + vxlan_eb_port_take, + vxlan_eb_port_rele, + vxlan_eb_port_ifname, + vxlan_eb_port_sa, +}; + +static struct rwlock vxlan_lock = RWLOCK_INITIALIZER("vteps"); +static struct vxlan_teps vxlan_teps = TAILQ_HEAD_INITIALIZER(vxlan_teps); +static struct pool vxlan_endpoint_pool; -#define VXLAN_TAGHASHSIZE 32 -#define VXLAN_TAGHASH(tag) ((unsigned int)tag & vxlan_tagmask) -LIST_HEAD(vxlan_taghash, vxlan_softc) *vxlan_tagh, vxlan_any; +static inline int vxlan_peer_cmp(const struct vxlan_peer *, + const struct vxlan_peer *); + +RBT_PROTOTYPE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp); void vxlanattach(int count) { - /* Regular vxlan interfaces with a VNI */ - if ((vxlan_tagh = hashinit(VXLAN_TAGHASHSIZE, M_DEVBUF, M_NOWAIT, - &vxlan_tagmask)) == NULL) - panic("vxlanattach: hashinit"); - - /* multipoint-to-multipoint interfaces that accept any VNI */ - LIST_INIT(&vxlan_any); - if_clone_attach(&vxlan_cloner); } -int +static int vxlan_clone_create(struct if_clone *ifc, int unit) { - struct ifnet *ifp; - struct vxlan_softc *sc; + struct vxlan_softc *sc; + struct ifnet *ifp; + int error; + + if (vxlan_endpoint_pool.pr_size == 0) 
{ + pool_init(&vxlan_endpoint_pool, sizeof(union vxlan_addr), + 0, IPL_SOFTNET, 0, "vxlanep", NULL); + } - sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); - sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS, - sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO); - sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; - sc->sc_dstport = htons(VXLAN_PORT); - sc->sc_vnetid = VXLAN_VNI_UNSET; - sc->sc_txhprio = IFQ_TOS2PRIO(IPTOS_PREC_ROUTINE); /* 0 */ - sc->sc_df = htons(0); - task_set(&sc->sc_atask, vxlan_addr_change, sc); - task_set(&sc->sc_ltask, vxlan_link_change, sc); - task_set(&sc->sc_dtask, vxlan_if_change, sc); - task_set(&sc->sc_sendtask, vxlan_send_dispatch, sc); + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL); + if (sc == NULL) + return (ENOMEM); ifp = &sc->sc_ac.ac_if; - snprintf(ifp->if_xname, sizeof ifp->if_xname, "vxlan%d", unit); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - ether_fakeaddr(ifp); - ifp->if_softc = sc; - ifp->if_ioctl = vxlanioctl; - ifp->if_start = vxlanstart; + snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d", + ifc->ifc_name, unit); - ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN; - ifp->if_capabilities = IFCAP_VLAN_MTU; - ifp->if_xflags = IFXF_CLONED; + error = etherbridge_init(&sc->sc_eb, ifp->if_xname, + &vxlan_etherbridge_ops, sc); + if (error == -1) { + free(sc, M_DEVBUF, sizeof(*sc)); + return (error); + } + + sc->sc_af = AF_UNSPEC; + sc->sc_txhprio = 0; + sc->sc_rxhprio = IF_HDRPRIO_OUTER; + sc->sc_df = 0; + sc->sc_ttl = IP_DEFAULT_MULTICAST_TTL; + + task_set(&sc->sc_dtask, vxlan_detach_hook, sc); + refcnt_init(&sc->sc_refs); + task_set(&sc->sc_send_task, vxlan_send, sc); - ifmedia_init(&sc->sc_media, 0, vxlan_media_change, - vxlan_media_status); - ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); - ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); + ifp->if_softc = sc; + ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN; + ifp->if_ioctl = vxlan_ioctl; + ifp->if_output = 
vxlan_output; + ifp->if_enqueue = vxlan_enqueue; + ifp->if_qstart = vxlan_start; + ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX; + ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE; + ether_fakeaddr(ifp); if_counters_alloc(ifp); if_attach(ifp); ether_ifattach(ifp); -#if 0 - /* - * Instead of using a decreased MTU of 1450 bytes, prefer - * to use the default Ethernet-size MTU of 1500 bytes and to - * increase the MTU of the outer transport interfaces to - * at least 1550 bytes. The following is disabled by default. - */ - ifp->if_mtu = ETHERMTU - sizeof(struct ether_header); - ifp->if_mtu -= sizeof(struct vxlanudphdr) + sizeof(struct ipovly); -#endif - - LIST_INSERT_HEAD(&vxlan_tagh[VXLAN_TAGHASH(0)], sc, sc_entry); - vxlan_enable++; - return (0); } -int +static int vxlan_clone_destroy(struct ifnet *ifp) { - struct vxlan_softc *sc = ifp->if_softc; + struct vxlan_softc *sc = ifp->if_softc; NET_LOCK(); - vxlan_multicast_cleanup(ifp); + if (ISSET(ifp->if_flags, IFF_RUNNING)) + vxlan_down(sc); NET_UNLOCK(); - vxlan_enable--; - LIST_REMOVE(sc, sc_entry); - - ifmedia_delete_instance(&sc->sc_media, IFM_INST_ANY); ether_ifdetach(ifp); if_detach(ifp); - if (!task_del(net_tq(ifp->if_index), &sc->sc_sendtask)) - taskq_barrier(net_tq(ifp->if_index)); + etherbridge_destroy(&sc->sc_eb); + + refcnt_finalize(&sc->sc_refs, "vxlanfini"); - free(sc->sc_imo.imo_membership, M_IPMOPTS, - sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *)); free(sc, M_DEVBUF, sizeof(*sc)); return (0); } -void -vxlan_multicast_cleanup(struct ifnet *ifp) +static struct vxlan_softc * +vxlan_take(struct vxlan_softc *sc) +{ + refcnt_take(&sc->sc_refs); + return (sc); +} + +static void +vxlan_rele(struct vxlan_softc *sc) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ip_moptions *imo = &sc->sc_imo; - struct ifnet *mifp; + refcnt_rele_wake(&sc->sc_refs); +} - mifp = if_get(imo->imo_ifidx); - if (mifp != NULL) { - if_addrhook_del(mifp, &sc->sc_atask); - 
if_linkstatehook_del(mifp, &sc->sc_ltask); - if_detachhook_del(mifp, &sc->sc_dtask); +static struct mbuf * +vxlan_encap(struct vxlan_softc *sc, struct mbuf *m, + struct mbuf *(ip_encap)(struct vxlan_softc *sc, struct mbuf *, + const union vxlan_addr *, uint8_t)) +{ + struct mbuf *m0; + union vxlan_addr gateway; + const union vxlan_addr *endpoint; + struct vxlan_header *vh; + struct udphdr *uh; + int prio; + uint8_t tos; - if_put(mifp); - } + if (sc->sc_mode == VXLAN_TMODE_UNSET) + goto drop; - if (imo->imo_num_memberships > 0) { - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - imo->imo_ifidx = 0; + if (sc->sc_mode == VXLAN_TMODE_P2P) + endpoint = &sc->sc_dst; + else { /* VXLAN_TMODE_LEARNING || VXLAN_TMODE_ENDPOINT */ + struct ether_header *eh = mtod(m, struct ether_header *); + + smr_read_enter(); + endpoint = etherbridge_resolve_ea(&sc->sc_eb, + (struct ether_addr *)eh->ether_dhost); + if (endpoint != NULL) { + gateway = *endpoint; + endpoint = &gateway; + } + smr_read_leave(); + + if (endpoint == NULL) { + if (sc->sc_mode == VXLAN_TMODE_ENDPOINT) + goto drop; + + /* "flood" to unknown destinations */ + endpoint = &sc->sc_dst; + } } + + /* force prepend mbuf because of payload alignment */ + m0 = m_get(M_DONTWAIT, m->m_type); + if (m0 == NULL) + goto drop; + + m_align(m0, 0); + m0->m_len = 0; + + M_MOVE_PKTHDR(m0, m); + m0->m_next = m; + + m = m_prepend(m0, sizeof(*vh), M_DONTWAIT); + if (m == NULL) + return (NULL); + + vh = mtod(m, struct vxlan_header *); + *vh = sc->sc_header; + + m = m_prepend(m, sizeof(*uh), M_DONTWAIT); + if (m == NULL) + return (NULL); + + uh = mtod(m, struct udphdr *); + uh->uh_sport = sc->sc_port; /* XXX */ + uh->uh_dport = sc->sc_port; + htobem16(&uh->uh_ulen, m->m_pkthdr.len); + uh->uh_sum = htons(0); + + SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT); + + prio = sc->sc_txhprio; + if (prio == IF_HDRPRIO_PACKET) + prio = m->m_pkthdr.pf.prio; + tos = IFQ_PRIO2TOS(prio); + + CLR(m->m_flags, M_BCAST|M_MCAST); + 
m->m_pkthdr.ph_rtableid = sc->sc_rdomain; + +#if NPF > 0 + pf_pkt_addr_changed(m); +#endif + + return ((*ip_encap)(sc, m, endpoint, tos)); +drop: + m_freem(m); + return (NULL); } -int -vxlan_multicast_join(struct ifnet *ifp, struct sockaddr *src, - struct sockaddr *dst) +static struct mbuf * +vxlan_encap_ipv4(struct vxlan_softc *sc, struct mbuf *m, + const union vxlan_addr *endpoint, uint8_t tos) { - struct vxlan_softc *sc = ifp->if_softc; - struct ip_moptions *imo = &sc->sc_imo; - struct sockaddr_in *src4, *dst4; -#ifdef INET6 - struct sockaddr_in6 *dst6; -#endif /* INET6 */ - struct ifaddr *ifa; - struct ifnet *mifp; + struct ip *ip; + + m = m_prepend(m, sizeof(*ip), M_DONTWAIT); + if (m == NULL) + return (NULL); + + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_off = sc->sc_df; + ip->ip_tos = tos; + ip->ip_len = htons(m->m_pkthdr.len); + ip->ip_ttl = sc->sc_ttl; + ip->ip_p = IPPROTO_UDP; + ip->ip_src = sc->sc_src.in4; + ip->ip_dst = endpoint->in4; + + return (m); +} - switch (dst->sa_family) { - case AF_INET: - dst4 = satosin(dst); - if (!IN_MULTICAST(dst4->sin_addr.s_addr)) - return (0); - break; #ifdef INET6 - case AF_INET6: - dst6 = satosin6(dst); - if (!IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr)) - return (0); +static struct mbuf * +vxlan_encap_ipv6(struct vxlan_softc *sc, struct mbuf *m, + const union vxlan_addr *endpoint, uint8_t tos) +{ + struct ip6_hdr *ip6; + int len = m->m_pkthdr.len; - /* Multicast mode is currently not supported for IPv6 */ - return (EAFNOSUPPORT); + m = m_prepend(m, sizeof(*ip6), M_DONTWAIT); + if (m == NULL) + return (NULL); + + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_flow = ISSET(m->m_pkthdr.csum_flags, M_FLOWID) ? 
+ htonl(m->m_pkthdr.ph_flowid) : 0; + ip6->ip6_vfc |= IPV6_VERSION; + ip6->ip6_flow |= htonl((uint32_t)tos << 20); + ip6->ip6_plen = htons(len); + ip6->ip6_nxt = IPPROTO_UDP; + ip6->ip6_hlim = sc->sc_ttl; + ip6->ip6_src = sc->sc_src.in6; + ip6->ip6_dst = endpoint->in6; + + if (sc->sc_df) + SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); + + return (m); +} #endif /* INET6 */ - default: - return (EAFNOSUPPORT); + +static int +vxlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) +{ + struct m_tag *mtag; + int error = 0; + + mtag = NULL; + while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) { + if (memcmp((caddr_t)(mtag + 1), &ifp->if_index, + sizeof(ifp->if_index)) == 0) { + error = EIO; + goto drop; + } } - src4 = satosin(src); - dst4 = satosin(dst); + mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT); + if (mtag == NULL) { + error = ENOBUFS; + goto drop; + } + memcpy((caddr_t)(mtag + 1), &ifp->if_index, sizeof(ifp->if_index)); + m_tag_prepend(m, mtag); - if (src4->sin_addr.s_addr == INADDR_ANY || - IN_MULTICAST(src4->sin_addr.s_addr)) - return (EINVAL); - if ((ifa = ifa_ifwithaddr(src, sc->sc_rdomain)) == NULL || - (mifp = ifa->ifa_ifp) == NULL || - (mifp->if_flags & IFF_MULTICAST) == 0) - return (EADDRNOTAVAIL); + return (ether_output(ifp, m, dst, rt)); - if ((imo->imo_membership[0] = - in_addmulti(&dst4->sin_addr, mifp)) == NULL) - return (ENOBUFS); +drop: + m_freem(m); + return (error); +} - imo->imo_num_memberships++; - imo->imo_ifidx = mifp->if_index; - if (sc->sc_ttl > 0) - imo->imo_ttl = sc->sc_ttl; - else - imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_loop = 0; +static int +vxlan_enqueue(struct ifnet *ifp, struct mbuf *m) +{ + struct vxlan_softc *sc = ifp->if_softc; + struct ifqueue *ifq = &ifp->if_snd; + + if (ifq_enqueue(ifq, m) != 0) + return (ENOBUFS); - /* - * Use interface hooks to track any changes on the interface - * that is used to send out the tunnel traffic as multicast. 
- */ - if_addrhook_add(mifp, &sc->sc_atask); - if_linkstatehook_add(mifp, &sc->sc_ltask); - if_detachhook_add(mifp, &sc->sc_dtask); + task_add(ifq->ifq_softnet, &sc->sc_send_task); return (0); } -void -vxlanstart(struct ifnet *ifp) +static void +vxlan_start(struct ifqueue *ifq) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; + struct ifnet *ifp = ifq->ifq_if; + struct vxlan_softc *sc = ifp->if_softc; - task_add(net_tq(ifp->if_index), &sc->sc_sendtask); + task_add(ifq->ifq_softnet, &sc->sc_send_task); } -void -vxlan_send_dispatch(void *xsc) +static uint64_t +vxlan_send_ipv4(struct vxlan_softc *sc, struct mbuf_list *ml) { - struct vxlan_softc *sc = xsc; - struct ifnet *ifp = &sc->sc_ac.ac_if; - struct mbuf *m; - struct mbuf_list ml; - - ml_init(&ml); - for (;;) { - m = ifq_dequeue(&ifp->if_snd); - if (m == NULL) - break; - -#if NBPFILTER > 0 - if (ifp->if_bpf) - bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); -#endif + struct ip_moptions imo; + struct mbuf *m; + uint64_t oerrors = 0; + + imo.imo_ifidx = sc->sc_if_index0; + imo.imo_ttl = sc->sc_ttl; + imo.imo_loop = 0; - ml_enqueue(&ml, m); + NET_LOCK(); + while ((m = ml_dequeue(ml)) != NULL) { + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) != 0) + oerrors++; } + NET_UNLOCK(); - if (ml_empty(&ml)) - return; + return (oerrors); +} + +#ifdef INET6 +static uint64_t +vxlan_send_ipv6(struct vxlan_softc *sc, struct mbuf_list *ml) +{ + struct ip6_moptions im6o; + struct mbuf *m; + uint64_t oerrors = 0; + + im6o.im6o_ifidx = sc->sc_if_index0; + im6o.im6o_hlim = sc->sc_ttl; + im6o.im6o_loop = 0; NET_LOCK(); - while ((m = ml_dequeue(&ml)) != NULL) { - vxlan_output(ifp, m); + while ((m = ml_dequeue(ml)) != NULL) { + if (ip6_output(m, NULL, NULL, 0, &im6o, NULL) != 0) + oerrors++; } NET_UNLOCK(); -} + return (oerrors); +} +#endif /* INET6 */ -int -vxlan_config(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) +static void +vxlan_send(void *arg) { - struct vxlan_softc *sc = (struct 
vxlan_softc *)ifp->if_softc; - int reset = 0, error, af; - socklen_t slen; - in_port_t port; - struct vxlan_taghash *tagh; - - if (src != NULL && dst != NULL) { - if ((af = src->sa_family) != dst->sa_family) - return (EAFNOSUPPORT); - } else { - /* Reset current configuration */ - af = sc->sc_src.ss_family; - src = sstosa(&sc->sc_src); - dst = sstosa(&sc->sc_dst); - reset = 1; - } + struct vxlan_softc *sc = arg; + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct mbuf *(*ip_encap)(struct vxlan_softc *, struct mbuf *, + const union vxlan_addr *, uint8_t); + uint64_t (*ip_send)(struct vxlan_softc *, struct mbuf_list *); + struct mbuf_list ml = MBUF_LIST_INITIALIZER(); + struct mbuf *m; + uint64_t oerrors; + + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + return; - switch (af) { + switch (sc->sc_af) { case AF_INET: - slen = sizeof(struct sockaddr_in); + ip_encap = vxlan_encap_ipv4; + ip_send = vxlan_send_ipv4; break; #ifdef INET6 case AF_INET6: - slen = sizeof(struct sockaddr_in6); + ip_encap = vxlan_encap_ipv6; + ip_send = vxlan_send_ipv6; break; -#endif /* INET6 */ +#endif default: - return (EAFNOSUPPORT); + unhandled_af(sc->sc_af); + /* NOTREACHED */ } - if (src->sa_len != slen || dst->sa_len != slen) - return (EINVAL); + while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) { +#if NBPFILTER > 0 + caddr_t if_bpf = READ_ONCE(ifp->if_bpf); + if (if_bpf != NULL) + bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT); +#endif + m = vxlan_encap(sc, m, ip_encap); + if (m == NULL) + continue; + + ml_enqueue(&ml, m); + } - vxlan_multicast_cleanup(ifp); + oerrors = (*ip_send)(sc, &ml); - /* returns without error if multicast is not configured */ - if ((error = vxlan_multicast_join(ifp, src, dst)) != 0) - return (error); + counters_add(ifp->if_counters, ifc_oerrors, oerrors); +} + +static struct mbuf * +vxlan_input(void *arg, struct mbuf *m, struct ip *ip, struct ip6_hdr *ip6, + void *uhp, int hlen) +{ + struct vxlan_tep *vt = arg; + struct vxlan_peer key, *p; + struct udphdr *uh; + struct 
vxlan_header *vh; + struct ether_header *eh; + int vhlen = hlen + sizeof(*vh); + struct mbuf *n; + int off; + in_port_t port; + struct vxlan_softc *sc = NULL; + struct ifnet *ifp; + + if (m->m_pkthdr.len < vhlen) + goto drop; + + uh = uhp; + port = uh->uh_sport; - if ((port = vxlan_sockaddr_port(dst)) != 0) - sc->sc_dstport = port; + memset(&key, 0, sizeof(key)); + key.p_mask = 0; - if (!reset) { - bzero(&sc->sc_src, sizeof(sc->sc_src)); - bzero(&sc->sc_dst, sizeof(sc->sc_dst)); - memcpy(&sc->sc_src, src, src->sa_len); - memcpy(&sc->sc_dst, dst, dst->sa_len); + if (ip != NULL) + key.p_addr.in4 = ip->ip_src; +#ifdef INET6 + else + key.p_addr.in6 = ip6->ip6_src; +#endif + + if (m->m_len < vhlen) { + m = m_pullup(m, vhlen); + if (m == NULL) + return (NULL); } - if (sc->sc_vnetid == VXLAN_VNI_ANY) { - /* - * If the interface accepts any VNI, put it into a separate - * list that is not part of the main hash. - */ - tagh = &vxlan_any; - } else - tagh = &vxlan_tagh[VXLAN_TAGHASH(sc->sc_vnetid)]; + vh = (struct vxlan_header *)(mtod(m, caddr_t) + hlen); + key.p_header.vxlan_flags = vh->vxlan_flags & htonl(VXLAN_F_I); + key.p_header.vxlan_id = vh->vxlan_id & htonl(VXLAN_VNI_MASK); - LIST_REMOVE(sc, sc_entry); - LIST_INSERT_HEAD(tagh, sc, sc_entry); + mtx_enter(&vt->vt_mtx); + p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key); + if (p != NULL) + sc = vxlan_take(p->p_sc); + mtx_leave(&vt->vt_mtx); - return (0); + if (sc == NULL) + goto drop; + + ifp = &sc->sc_ac.ac_if; + if (ISSET(ifp->if_flags, IFF_LINK0) && port != sc->sc_port) + goto rele_drop; + + m_adj(m, vhlen); + + if (m->m_pkthdr.len < sizeof(*eh)) + goto rele_drop; + + if (m->m_len < sizeof(*eh)) { + m = m_pullup(m, sizeof(*eh)); + if (m == NULL) + goto rele; + } + + n = m_getptr(m, sizeof(*eh), &off); + if (n == NULL) + goto rele_drop; + + if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) { + n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT); + m_freem(m); + if (n == NULL) + goto rele; + m = n; + } + + if (sc->sc_mode == 
VXLAN_TMODE_LEARNING) { + eh = mtod(m, struct ether_header *); + etherbridge_map_ea(&sc->sc_eb, &key.p_addr, + (struct ether_addr *)eh->ether_shost); + } + + /* XXX prio */ + + if_vinput(ifp, m); +rele: + vxlan_rele(sc); + return (NULL); + +rele_drop: + vxlan_rele(sc); +drop: + m_freem(m); + return (NULL); } -int -vxlanioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +static int +vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ifreq *ifr = (struct ifreq *)data; - struct if_laddrreq *lifr = (struct if_laddrreq *)data; - int error = 0; + struct vxlan_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + struct ifbrparam *bparam = (struct ifbrparam *)data; + int error = 0; switch (cmd) { case SIOCSIFADDR: - ifp->if_flags |= IFF_UP; - /* FALLTHROUGH */ - + break; case SIOCSIFFLAGS: - if (ifp->if_flags & IFF_UP) { - ifp->if_flags |= IFF_RUNNING; + if (ISSET(ifp->if_flags, IFF_UP)) { + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + error = vxlan_up(sc); + else + error = 0; } else { - ifp->if_flags &= ~IFF_RUNNING; + if (ISSET(ifp->if_flags, IFF_RUNNING)) + error = vxlan_down(sc); } break; - case SIOCADDMULTI: - case SIOCDELMULTI: + case SIOCSLIFPHYRTABLE: + error = vxlan_set_rdomain(sc, ifr); break; - - case SIOCGIFMEDIA: - case SIOCSIFMEDIA: - error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); + case SIOCGLIFPHYRTABLE: + error = vxlan_get_rdomain(sc, ifr); break; case SIOCSLIFPHYADDR: - error = vxlan_config(ifp, - sstosa(&lifr->addr), - sstosa(&lifr->dstaddr)); + error = vxlan_set_tunnel(sc, (const struct if_laddrreq *)data); + break; + case SIOCGLIFPHYADDR: + error = vxlan_get_tunnel(sc, (struct if_laddrreq *)data); break; - case SIOCDIFPHYADDR: - vxlan_multicast_cleanup(ifp); - bzero(&sc->sc_src, sizeof(sc->sc_src)); - bzero(&sc->sc_dst, sizeof(sc->sc_dst)); - sc->sc_dstport = htons(VXLAN_PORT); + error = vxlan_del_tunnel(sc); break; - case SIOCGLIFPHYADDR: - if 
(sc->sc_dst.ss_family == AF_UNSPEC) { - error = EADDRNOTAVAIL; - break; - } - bzero(&lifr->addr, sizeof(lifr->addr)); - bzero(&lifr->dstaddr, sizeof(lifr->dstaddr)); - memcpy(&lifr->addr, &sc->sc_src, sc->sc_src.ss_len); - memcpy(&lifr->dstaddr, &sc->sc_dst, sc->sc_dst.ss_len); + case SIOCSVNETID: + error = vxlan_set_vnetid(sc, ifr); break; - - case SIOCSLIFPHYRTABLE: - if (ifr->ifr_rdomainid < 0 || - ifr->ifr_rdomainid > RT_TABLEID_MAX || - !rtable_exists(ifr->ifr_rdomainid)) { - error = EINVAL; - break; - } - sc->sc_rdomain = ifr->ifr_rdomainid; - (void)vxlan_config(ifp, NULL, NULL); + case SIOCGVNETID: + error = vxlan_get_vnetid(sc, ifr); break; - - case SIOCGLIFPHYRTABLE: - ifr->ifr_rdomainid = sc->sc_rdomain; + case SIOCDVNETID: + error = vxlan_del_vnetid(sc); break; - case SIOCSLIFPHYTTL: - if (ifr->ifr_ttl < 0 || ifr->ifr_ttl > 0xff) { - error = EINVAL; - break; - } - if (sc->sc_ttl == (u_int8_t)ifr->ifr_ttl) - break; - sc->sc_ttl = (u_int8_t)(ifr->ifr_ttl); - (void)vxlan_config(ifp, NULL, NULL); + case SIOCSIFPARENT: + error = vxlan_set_parent(sc, (struct if_parent *)data); break; - - case SIOCGLIFPHYTTL: - ifr->ifr_ttl = (int)sc->sc_ttl; - break; - - case SIOCSLIFPHYDF: - /* commit */ - sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0); - break; - case SIOCGLIFPHYDF: - ifr->ifr_df = sc->sc_df ? 
1 : 0; + case SIOCGIFPARENT: + error = vxlan_get_parent(sc, (struct if_parent *)data); + break; + case SIOCDIFPARENT: + error = vxlan_del_parent(sc); break; case SIOCSTXHPRIO: - if (ifr->ifr_hdrprio == IF_HDRPRIO_PACKET) - ; /* fall through */ - else if (ifr->ifr_hdrprio < IF_HDRPRIO_MIN || - ifr->ifr_hdrprio > IF_HDRPRIO_MAX) { - error = EINVAL; + error = if_txhprio_l2_check(ifr->ifr_hdrprio); + if (error != 0) break; - } sc->sc_txhprio = ifr->ifr_hdrprio; break; @@ -511,35 +777,64 @@ vxlanioctl(struct ifnet *ifp, u_long cmd ifr->ifr_hdrprio = sc->sc_txhprio; break; - case SIOCSVNETID: - if (sc->sc_vnetid == ifr->ifr_vnetid) + case SIOCSRXHPRIO: + error = if_rxhprio_l2_check(ifr->ifr_hdrprio); + if (error != 0) break; - if ((ifr->ifr_vnetid != VXLAN_VNI_ANY) && - (ifr->ifr_vnetid > VXLAN_VNI_MAX || - ifr->ifr_vnetid < VXLAN_VNI_MIN)) { - error = EINVAL; - break; - } + sc->sc_rxhprio = ifr->ifr_hdrprio; + break; + case SIOCGRXHPRIO: + ifr->ifr_hdrprio = sc->sc_rxhprio; + break; - sc->sc_vnetid = (int)ifr->ifr_vnetid; - (void)vxlan_config(ifp, NULL, NULL); + case SIOCSLIFPHYDF: + /* commit */ + sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0); + break; + case SIOCGLIFPHYDF: + ifr->ifr_df = sc->sc_df ? 
1 : 0; break; - case SIOCGVNETID: - if ((sc->sc_vnetid != VXLAN_VNI_ANY) && - (sc->sc_vnetid > VXLAN_VNI_MAX || - sc->sc_vnetid < VXLAN_VNI_MIN)) { - error = EADDRNOTAVAIL; + case SIOCSLIFPHYTTL: + if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) { + error = EINVAL; break; } - ifr->ifr_vnetid = sc->sc_vnetid; + /* commit */ + sc->sc_ttl = (uint8_t)ifr->ifr_ttl; + break; + case SIOCGLIFPHYTTL: + ifr->ifr_ttl = (int)sc->sc_ttl; + break; + + case SIOCBRDGSCACHE: + error = etherbridge_set_max(&sc->sc_eb, bparam); + break; + case SIOCBRDGGCACHE: + error = etherbridge_get_max(&sc->sc_eb, bparam); + break; + case SIOCBRDGSTO: + error = etherbridge_set_tmo(&sc->sc_eb, bparam); + break; + case SIOCBRDGGTO: + error = etherbridge_get_tmo(&sc->sc_eb, bparam); break; - case SIOCDVNETID: - sc->sc_vnetid = VXLAN_VNI_UNSET; - (void)vxlan_config(ifp, NULL, NULL); + case SIOCBRDGRTS: + error = etherbridge_rtfind(&sc->sc_eb, + (struct ifbaconf *)data); + break; + case SIOCBRDGFLUSH: + etherbridge_flush(&sc->sc_eb, + ((struct ifbreq *)data)->ifbr_ifsflags); + break; + case SIOCBRDGSADDR: + error = vxlan_add_addr(sc, (struct ifbareq *)data); + break; + case SIOCBRDGDADDR: + error = vxlan_del_addr(sc, (struct ifbareq *)data); break; default: @@ -550,465 +845,964 @@ vxlanioctl(struct ifnet *ifp, u_long cmd return (error); } -int -vxlan_media_change(struct ifnet *ifp) +static struct vxlan_tep * +vxlan_tep_get(struct vxlan_softc *sc, const union vxlan_addr *addr) { - return (0); -} + struct vxlan_tep *vt; -void -vxlan_media_status(struct ifnet *ifp, struct ifmediareq *imr) -{ - imr->ifm_status = IFM_AVALID | IFM_ACTIVE; + TAILQ_FOREACH(vt, &vxlan_teps, vt_entry) { + if (sc->sc_af == vt->vt_af && + sc->sc_rdomain == vt->vt_rdomain && + memcmp(addr, &vt->vt_addr, sizeof(*addr)) == 0 && + sc->sc_port == vt->vt_port) + return (vt); + } + + return (NULL); } -int -vxlan_sockaddr_cmp(struct sockaddr *srcsa, struct sockaddr *dstsa) +static int +vxlan_tep_add_addr(struct vxlan_softc *sc, const union 
vxlan_addr *addr, + struct vxlan_peer *p) { - struct sockaddr_in *src4, *dst4; + struct mbuf m; + struct vxlan_tep *vt; + struct socket *so; + struct sockaddr_in *sin; #ifdef INET6 - struct sockaddr_in6 *src6, *dst6; -#endif /* INET6 */ + struct sockaddr_in6 *sin6; +#endif + int error; + int s; - if (srcsa->sa_family != dstsa->sa_family) - return (1); + vt = vxlan_tep_get(sc, addr); + if (vt != NULL) { + struct vxlan_peer *op; + + mtx_enter(&vt->vt_mtx); + op = RBT_INSERT(vxlan_peers, &vt->vt_peers, p); + mtx_leave(&vt->vt_mtx); + + if (op != NULL) + return (EADDRINUSE); + + return (0); + } - switch (dstsa->sa_family) { + vt = malloc(sizeof(*vt), M_DEVBUF, M_NOWAIT|M_ZERO); + if (vt == NULL) + return (ENOMEM); + + vt->vt_af = sc->sc_af; + vt->vt_rdomain = sc->sc_rdomain; + vt->vt_addr = *addr; + vt->vt_port = sc->sc_port; + + mtx_init(&vt->vt_mtx, IPL_SOFTNET); + RBT_INIT(vxlan_peers, &vt->vt_peers); + RBT_INSERT(vxlan_peers, &vt->vt_peers, p); + + error = socreate(vt->vt_af, &so, SOCK_DGRAM, IPPROTO_UDP); + if (error != 0) + goto free; + + s = solock(so); + + sotoinpcb(so)->inp_upcall = vxlan_input; + sotoinpcb(so)->inp_upcall_arg = vt; + + m_inithdr(&m); + m.m_len = sizeof(vt->vt_rdomain); + *mtod(&m, unsigned int *) = vt->vt_rdomain; + error = sosetopt(so, SOL_SOCKET, SO_RTABLE, &m); + if (error != 0) + goto close; + + m_inithdr(&m); + switch (vt->vt_af) { case AF_INET: - src4 = satosin(srcsa); - dst4 = satosin(dstsa); - if (src4->sin_addr.s_addr == dst4->sin_addr.s_addr) - return (0); + sin = mtod(&m, struct sockaddr_in *); + memset(sin, 0, sizeof(*sin)); + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = addr->in4; + sin->sin_port = vt->vt_port; + + m.m_len = sizeof(*sin); break; + #ifdef INET6 case AF_INET6: - src6 = satosin6(srcsa); - dst6 = satosin6(dstsa); - if (IN6_ARE_ADDR_EQUAL(&src6->sin6_addr, &dst6->sin6_addr) && - src6->sin6_scope_id == dst6->sin6_scope_id) - return (0); + sin6 = mtod(&m, struct sockaddr_in6 *); + 
sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &addr->in6); + sin6->sin6_port = sc->sc_port; + + m.m_len = sizeof(*sin6); break; -#endif /* INET6 */ +#endif + default: + unhandled_af(vt->vt_af); } - return (1); + error = sobind(so, &m, curproc); + if (error != 0) + goto close; + + sounlock(so, s); + + rw_assert_wrlock(&vxlan_lock); + TAILQ_INSERT_TAIL(&vxlan_teps, vt, vt_entry); + + vt->vt_so = so; + + return (0); + +close: + sounlock(so, s); + soclose(so, MSG_DONTWAIT); +free: + free(vt, M_DEVBUF, sizeof(*vt)); + return (error); } -uint16_t -vxlan_sockaddr_port(struct sockaddr *sa) +static void +vxlan_tep_del_addr(struct vxlan_softc *sc, const union vxlan_addr *addr, + struct vxlan_peer *p) { - struct sockaddr_in *sin4; -#ifdef INET6 - struct sockaddr_in6 *sin6; -#endif /* INET6 */ + struct vxlan_tep *vt; + int empty; - switch (sa->sa_family) { - case AF_INET: - sin4 = satosin(sa); - return (sin4->sin_port); -#ifdef INET6 - case AF_INET6: - sin6 = satosin6(sa); - return (sin6->sin6_port); -#endif /* INET6 */ - default: - break; - } + vt = vxlan_tep_get(sc, addr); + if (vt == NULL) + panic("unable to find vxlan_tep for peer %p (sc %p)", p, sc); + + mtx_enter(&vt->vt_mtx); + RBT_REMOVE(vxlan_peers, &vt->vt_peers, p); + empty = RBT_EMPTY(vxlan_peers, &vt->vt_peers); + mtx_leave(&vt->vt_mtx); + + if (!empty) + return; + + rw_assert_wrlock(&vxlan_lock); + TAILQ_REMOVE(&vxlan_teps, vt, vt_entry); + + soclose(vt->vt_so, MSG_DONTWAIT); + free(vt, M_DEVBUF, sizeof(*vt)); +} + +static int +vxlan_tep_up(struct vxlan_softc *sc) +{ + struct vxlan_peer *up, *mp; + int error; + + up = malloc(sizeof(*up), M_DEVBUF, M_NOWAIT|M_ZERO); + if (up == NULL) + return (ENOMEM); + + up->p_mask = (sc->sc_mode != VXLAN_TMODE_P2P); + up->p_addr = sc->sc_dst; + up->p_header = sc->sc_header; + up->p_sc = vxlan_take(sc); + + error = vxlan_tep_add_addr(sc, &sc->sc_src, up); + if (error != 0) + goto freeup; + + sc->sc_ucast_peer = up; + + if (sc->sc_mode 
!= VXLAN_TMODE_LEARNING) + return (0); + + mp = malloc(sizeof(*mp), M_DEVBUF, M_NOWAIT|M_ZERO); + if (mp == NULL) { + error = ENOMEM; + goto delup; + } + + mp->p_mask = 1; + /* addr is masked, leave it as 0s */ + mp->p_header = sc->sc_header; + mp->p_sc = vxlan_take(sc); + + /* destination address is a multicast group we want to join */ + error = vxlan_tep_add_addr(sc, &sc->sc_dst, mp); + if (error != 0) + goto freemp; + + sc->sc_mcast_peer = mp; return (0); + +freemp: + vxlan_rele(mp->p_sc); + free(mp, M_DEVBUF, sizeof(*mp)); +delup: + vxlan_tep_del_addr(sc, &sc->sc_src, up); +freeup: + vxlan_rele(up->p_sc); + free(up, M_DEVBUF, sizeof(*up)); + return (error); } -int -vxlan_lookup(struct mbuf *m, struct udphdr *uh, int iphlen, - struct sockaddr *srcsa, struct sockaddr *dstsa) -{ - struct vxlan_softc *sc = NULL, *sc_cand = NULL; - struct vxlan_header v; - int vni; - struct ifnet *ifp; - int skip; -#if NBRIDGE > 0 - struct bridge_tunneltag *brtag; -#endif - struct mbuf *n; - int off; - - /* XXX Should verify the UDP port first before copying the packet */ - skip = iphlen + sizeof(*uh); - if (m->m_pkthdr.len - skip < sizeof(v)) - return (0); - m_copydata(m, skip, sizeof(v), &v); - skip += sizeof(v); - - if (v.vxlan_flags & htonl(VXLAN_RESERVED1) || - v.vxlan_id & htonl(VXLAN_RESERVED2)) - return (0); - - vni = ntohl(v.vxlan_id) >> VXLAN_VNI_S; - if ((v.vxlan_flags & htonl(VXLAN_FLAGS_VNI)) == 0) { - if (vni != 0) - return (0); +static void +vxlan_tep_down(struct vxlan_softc *sc) +{ + struct vxlan_peer *up = sc->sc_ucast_peer; - vni = VXLAN_VNI_UNSET; + if (sc->sc_mode == VXLAN_TMODE_LEARNING) { + struct vxlan_peer *mp = sc->sc_mcast_peer; + vxlan_tep_del_addr(sc, &sc->sc_dst, mp); + vxlan_rele(mp->p_sc); + free(mp, M_DEVBUF, sizeof(*mp)); } + vxlan_tep_del_addr(sc, &sc->sc_src, up); + vxlan_rele(up->p_sc); + free(up, M_DEVBUF, sizeof(*up)); +} + +static int +vxlan_up(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct ifnet *ifp0 = NULL; + int 
error; + + KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING)); NET_ASSERT_LOCKED(); - /* First search for a vxlan(4) interface with the packet's VNI */ - LIST_FOREACH(sc, &vxlan_tagh[VXLAN_TAGHASH(vni)], sc_entry) { - if ((uh->uh_dport == sc->sc_dstport) && - vni == sc->sc_vnetid && - sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid)) { - sc_cand = sc; - if (vxlan_sockaddr_cmp(srcsa, sstosa(&sc->sc_dst)) == 0) - goto found; - } + + if (sc->sc_af == AF_UNSPEC) + return (EDESTADDRREQ); + KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET); + + NET_UNLOCK(); + + error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR); + if (error != 0) + goto netlock; + + NET_LOCK(); + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + /* something else beat us */ + rw_exit(&vxlan_lock); + return (0); } + NET_UNLOCK(); - /* - * Now loop through all the vxlan(4) interfaces that are configured - * to accept any VNI and operating in multipoint-to-multipoint mode - * that is used in combination with bridge(4) or switch(4). - * If a vxlan(4) interface has been found for the packet's VNI, this - * code is not reached as the other interface is more specific. 
- */ - LIST_FOREACH(sc, &vxlan_any, sc_entry) { - if ((uh->uh_dport == sc->sc_dstport) && - (sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid))) { - sc_cand = sc; - goto found; - } + if (sc->sc_mode != VXLAN_TMODE_P2P) { + error = etherbridge_up(&sc->sc_eb); + if (error != 0) + goto unlock; } - if (sc_cand) { - sc = sc_cand; - goto found; + if (sc->sc_mode == VXLAN_TMODE_LEARNING) { + ifp0 = if_get(sc->sc_if_index0); + if (ifp0 == NULL) { + error = ENXIO; + goto down; + } + + /* check again if multicast will work on top of the parent */ + if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) { + error = EPROTONOSUPPORT; + goto put; + } + + error = vxlan_addmulti(sc, ifp0); + if (error != 0) + goto put; + + /* Register callback if parent wants to unregister */ + if_detachhook_add(ifp0, &sc->sc_dtask); + } else { + if (sc->sc_if_index0 != 0) { + error = EPROTONOSUPPORT; + goto down; + } } - /* not found */ + error = vxlan_tep_up(sc); + if (error != 0) + goto del; + + if_put(ifp0); + + NET_LOCK(); + SET(ifp->if_flags, IFF_RUNNING); + rw_exit(&vxlan_lock); + return (0); - found: - if (m->m_pkthdr.len < skip + sizeof(struct ether_header)) { - m_freem(m); - return (EINVAL); +del: + if (ifp0 != NULL) + if_detachhook_del(ifp0, &sc->sc_dtask); + vxlan_delmulti(sc); +put: + if_put(ifp0); +down: + if (sc->sc_mode != VXLAN_TMODE_P2P) + etherbridge_down(&sc->sc_eb); +unlock: + rw_exit(&vxlan_lock); +netlock: + NET_LOCK(); + + return (error); +} + +static int +vxlan_down(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct ifnet *ifp0; + int error; + + KASSERT(ISSET(ifp->if_flags, IFF_RUNNING)); + NET_UNLOCK(); + + error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR); + if (error != 0) { + NET_LOCK(); + return (error); } - m_adj(m, skip); - ifp = &sc->sc_ac.ac_if; + NET_LOCK(); + if (!ISSET(ifp->if_flags, IFF_RUNNING)) { + /* something else beat us */ + rw_exit(&vxlan_lock); + return (0); + } + NET_UNLOCK(); + + vxlan_tep_down(sc); -#if NBRIDGE > 0 - /* Store the 
tunnel src/dst IP and vni for the bridge or switch */ - if ((ifp->if_bridgeidx != 0 || ifp->if_switchport != NULL) && - srcsa->sa_family != AF_UNSPEC && - ((brtag = bridge_tunneltag(m)) != NULL)) { - memcpy(&brtag->brtag_peer.sa, srcsa, srcsa->sa_len); - memcpy(&brtag->brtag_local.sa, dstsa, dstsa->sa_len); - brtag->brtag_id = vni; + if (sc->sc_mode == VXLAN_TMODE_LEARNING) { + vxlan_delmulti(sc); + ifp0 = if_get(sc->sc_if_index0); + if (ifp0 != NULL) { + if_detachhook_del(ifp0, &sc->sc_dtask); + } + if_put(ifp0); } -#endif - m->m_flags &= ~(M_BCAST|M_MCAST); + if (sc->sc_mode != VXLAN_TMODE_P2P) + etherbridge_down(&sc->sc_eb); -#if NPF > 0 - pf_pkt_addr_changed(m); -#endif - if ((m->m_len < sizeof(struct ether_header)) && - (m = m_pullup(m, sizeof(struct ether_header))) == NULL) - return (ENOBUFS); + taskq_del_barrier(ifp->if_snd.ifq_softnet, &sc->sc_send_task); + NET_LOCK(); + CLR(ifp->if_flags, IFF_RUNNING); + rw_exit(&vxlan_lock); - n = m_getptr(m, sizeof(struct ether_header), &off); - if (n == NULL) { - m_freem(m); - return (EINVAL); + return (0); +} + +static int +vxlan_addmulti(struct vxlan_softc *sc, struct ifnet *ifp0) +{ + int error = 0; + + NET_LOCK(); + + switch (sc->sc_af) { + case AF_INET: + sc->sc_inmulti = in_addmulti(&sc->sc_dst.in4, ifp0); + if (sc->sc_inmulti == NULL) + error = EADDRNOTAVAIL; + break; +#ifdef INET6 + case AF_INET6: + sc->sc_inmulti = in6_addmulti(&sc->sc_dst.in6, ifp0, &error); + break; +#endif + default: + unhandled_af(sc->sc_af); } - if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) { - n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT); - /* Dispose of the original mbuf chain */ - m_freem(m); - if (n == NULL) - return (ENOBUFS); - m = n; + + NET_UNLOCK(); + + return (error); +} + +static void +vxlan_delmulti(struct vxlan_softc *sc) +{ + NET_LOCK(); + + switch (sc->sc_af) { + case AF_INET: + in_delmulti(sc->sc_inmulti); + break; +#ifdef INET6 + case AF_INET6: + in6_delmulti(sc->sc_inmulti); + break; +#endif + default: + 
unhandled_af(sc->sc_af); } - if_vinput(ifp, m); + sc->sc_inmulti = NULL; /* keep it tidy */ - /* success */ - return (1); + NET_UNLOCK(); } -struct mbuf * -vxlan_encap4(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *src, struct sockaddr *dst) -{ - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ip *ip; - - /* - * Remove multicast and broadcast flags or encapsulated packet - * ends up as multicast or broadcast packet. - */ - m->m_flags &= ~(M_BCAST|M_MCAST); +static int +vxlan_set_rdomain(struct vxlan_softc *sc, const struct ifreq *ifr) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; - M_PREPEND(m, sizeof(*ip), M_DONTWAIT); - if (m == NULL) - return (NULL); + if (ifr->ifr_rdomainid < 0 || + ifr->ifr_rdomainid > RT_TABLEID_MAX) + return (EINVAL); + if (!rtable_exists(ifr->ifr_rdomainid)) + return (EADDRNOTAVAIL); - ip = mtod(m, struct ip *); - ip->ip_v = IPVERSION; - ip->ip_hl = sizeof(struct ip) >> 2; - ip->ip_id = htons(ip_randomid()); - ip->ip_off = sc->sc_df; - ip->ip_p = IPPROTO_UDP; - ip->ip_tos = IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ? 
- m->m_pkthdr.pf.prio : sc->sc_txhprio); - ip->ip_len = htons(m->m_pkthdr.len); + if (sc->sc_rdomain == ifr->ifr_rdomainid) + return (0); - ip->ip_src = satosin(src)->sin_addr; - ip->ip_dst = satosin(dst)->sin_addr; + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); - if (sc->sc_ttl > 0) - ip->ip_ttl = sc->sc_ttl; - else - ip->ip_ttl = IPDEFTTL; + /* commit */ + sc->sc_rdomain = ifr->ifr_rdomainid; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); - return (m); + return (0); +} + +static int +vxlan_get_rdomain(struct vxlan_softc *sc, struct ifreq *ifr) +{ + ifr->ifr_rdomainid = sc->sc_rdomain; + + return (0); } +static int +vxlan_set_tunnel(struct vxlan_softc *sc, const struct if_laddrreq *req) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct sockaddr *src = (struct sockaddr *)&req->addr; + struct sockaddr *dst = (struct sockaddr *)&req->dstaddr; + struct sockaddr_in *src4, *dst4; #ifdef INET6 -struct mbuf * -vxlan_encap6(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *src, struct sockaddr *dst) -{ - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ip6_hdr *ip6; - struct in6_addr *in6a; - uint32_t flow; - - /* - * Remove multicast and broadcast flags or encapsulated packet - * ends up as multicast or broadcast packet. - */ - m->m_flags &= ~(M_BCAST|M_MCAST); + struct sockaddr_in6 *src6, *dst6; + int error; +#endif + union vxlan_addr saddr, daddr; + unsigned int mode = VXLAN_TMODE_ENDPOINT; + in_port_t port = htons(VXLAN_PORT); - M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT); - if (m == NULL) - return (NULL); + memset(&saddr, 0, sizeof(saddr)); + memset(&daddr, 0, sizeof(daddr)); - flow = (uint32_t)IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ? 
- m->m_pkthdr.pf.prio : sc->sc_txhprio) << 20; + /* validate */ + switch (src->sa_family) { + case AF_INET: + src4 = (struct sockaddr_in *)src; + if (in_nullhost(src4->sin_addr) || + IN_MULTICAST(src4->sin_addr.s_addr)) + return (EINVAL); - ip6 = mtod(m, struct ip6_hdr *); - ip6->ip6_flow = htonl(flow); - ip6->ip6_vfc &= ~IPV6_VERSION_MASK; - ip6->ip6_vfc |= IPV6_VERSION; - ip6->ip6_nxt = IPPROTO_UDP; - ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); - if (in6_embedscope(&ip6->ip6_src, satosin6(src), NULL) != 0) - goto drop; - if (in6_embedscope(&ip6->ip6_dst, satosin6(dst), NULL) != 0) - goto drop; + if (src4->sin_port != htons(0)) + port = src4->sin_port; - if (sc->sc_ttl > 0) - ip6->ip6_hlim = sc->sc_ttl; - else - ip6->ip6_hlim = ip6_defhlim; + if (dst->sa_family != AF_UNSPEC) { + if (dst->sa_family != AF_INET) + return (EINVAL); + + dst4 = (struct sockaddr_in *)dst; + if (in_nullhost(dst4->sin_addr)) + return (EINVAL); + + /* all good */ + mode = IN_MULTICAST(dst4->sin_addr.s_addr) ? + VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P; + daddr.in4 = dst4->sin_addr; + } - if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)) { - if (in6_selectsrc(&in6a, satosin6(dst), NULL, - sc->sc_rdomain) != 0) - goto drop; + saddr.in4 = src4->sin_addr; + break; + +#ifdef INET6 + case AF_INET6: + src6 = (struct sockaddr_in6 *)src; + if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) || + IN6_IS_ADDR_MULTICAST(&src6->sin6_addr)) + return (EINVAL); + + if (src6->sin6_port != htons(0)) + port = src6->sin6_port; - ip6->ip6_src = *in6a; + if (dst->sa_family != AF_UNSPEC) { + if (dst->sa_family != AF_INET6) + return (EINVAL); + + dst6 = (struct sockaddr_in6 *)dst; + if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr)) + return (EINVAL); + + if (src6->sin6_scope_id != dst6->sin6_scope_id) + return (EINVAL); + + /* all good */ + mode = IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr) ? 
+ VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P; + error = in6_embedscope(&daddr.in6, dst6, NULL); + if (error != 0) + return (error); + } + + error = in6_embedscope(&saddr.in6, src6, NULL); + if (error != 0) + return (error); + + break; +#endif + default: + return (EAFNOSUPPORT); } - if (sc->sc_df) - SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); + if (memcmp(&sc->sc_src, &saddr, sizeof(sc->sc_src)) == 0 && + memcmp(&sc->sc_dst, &daddr, sizeof(sc->sc_dst)) == 0 && + sc->sc_port == port) + return (0); - /* - * The UDP checksum of VXLAN packets should be set to zero, - * but the IPv6 UDP checksum is not optional. There is an RFC 6539 - * to relax the IPv6 UDP checksum requirement for tunnels, but it - * is currently not supported by most implementations. - */ - m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT; + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); - return (m); + /* commit */ + sc->sc_af = src->sa_family; + sc->sc_src = saddr; + sc->sc_dst = daddr; + sc->sc_port = port; + sc->sc_mode = mode; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); -drop: - m_freem(m); - return (NULL); + return (0); } -#endif /* INET6 */ -int -vxlan_output(struct ifnet *ifp, struct mbuf *m) +static int +vxlan_get_tunnel(struct vxlan_softc *sc, struct if_laddrreq *req) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct vxlanudphdr *vu; - struct sockaddr *src, *dst; -#if NBRIDGE > 0 - struct bridge_tunneltag *brtag; -#endif - int error, af; - uint32_t tag; - struct mbuf *m0; - - /* VXLAN header, needs new mbuf because of alignment issues */ - MGET(m0, M_DONTWAIT, m->m_type); - if (m0 == NULL) { - ifp->if_oerrors++; - return (ENOBUFS); - } - M_MOVE_PKTHDR(m0, m); - m0->m_next = m; - m = m0; - m_align(m, sizeof(*vu)); - m->m_len = sizeof(*vu); - m->m_pkthdr.len += sizeof(*vu); - - src = sstosa(&sc->sc_src); - dst = sstosa(&sc->sc_dst); - af = src->sa_family; - - vu = mtod(m, struct vxlanudphdr *); - vu->vu_u.uh_sport = sc->sc_dstport; - vu->vu_u.uh_dport = 
sc->sc_dstport; - vu->vu_u.uh_ulen = htons(m->m_pkthdr.len); - vu->vu_u.uh_sum = 0; - tag = sc->sc_vnetid; - -#if NBRIDGE > 0 - if ((brtag = bridge_tunnel(m)) != NULL) { - dst = &brtag->brtag_peer.sa; - - /* If accepting any VNI, source ip address is from brtag */ - if (sc->sc_vnetid == VXLAN_VNI_ANY) { - src = &brtag->brtag_local.sa; - tag = (uint32_t)brtag->brtag_id; - af = src->sa_family; - } - - if (dst->sa_family != af) { - ifp->if_oerrors++; - m_freem(m); - return (EINVAL); - } - } else + struct sockaddr *dstaddr = (struct sockaddr *)&req->dstaddr; + struct sockaddr_in *sin; +#ifdef INET6 + struct sockaddr_in6 *sin6; #endif - if (sc->sc_vnetid == VXLAN_VNI_ANY) { - /* - * If accepting any VNI, build the vxlan header only by - * bridge_tunneltag or drop packet if the tag does not exist. - */ - ifp->if_oerrors++; - m_freem(m); - return (ENETUNREACH); - } - if (sc->sc_vnetid != VXLAN_VNI_UNSET) { - vu->vu_v.vxlan_flags = htonl(VXLAN_FLAGS_VNI); - vu->vu_v.vxlan_id = htonl(tag << VXLAN_VNI_S); - } else { - vu->vu_v.vxlan_flags = htonl(0); - vu->vu_v.vxlan_id = htonl(0); - } + if (sc->sc_af == AF_UNSPEC) + return (EADDRNOTAVAIL); + KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET); + + memset(&req->addr, 0, sizeof(req->addr)); + memset(&req->dstaddr, 0, sizeof(req->dstaddr)); - switch (af) { + /* default to endpoint */ + dstaddr->sa_len = 2; + dstaddr->sa_family = AF_UNSPEC; + + switch (sc->sc_af) { case AF_INET: - m = vxlan_encap4(ifp, m, src, dst); + sin = (struct sockaddr_in *)&req->addr; + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = sc->sc_src.in4; + sin->sin_port = sc->sc_port; + + if (sc->sc_mode == VXLAN_TMODE_ENDPOINT) + break; + + sin = (struct sockaddr_in *)&req->dstaddr; + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = sc->sc_dst.in4; break; + #ifdef INET6 case AF_INET6: - m = vxlan_encap6(ifp, m, src, dst); + sin6 = (struct sockaddr_in6 *)&req->addr; + sin6->sin6_len = sizeof(*sin6); + 
sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &sc->sc_src.in6); + sin6->sin6_port = sc->sc_port; + + if (sc->sc_mode == VXLAN_TMODE_ENDPOINT) + break; + + sin6 = (struct sockaddr_in6 *)&req->dstaddr; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &sc->sc_dst.in6); break; -#endif /* INET6 */ +#endif default: - m_freem(m); - m = NULL; + unhandled_af(sc->sc_af); } - if (m == NULL) { - ifp->if_oerrors++; - return (ENOBUFS); + return (0); +} + +static int +vxlan_del_tunnel(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + + if (sc->sc_af == AF_UNSPEC) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + sc->sc_af = AF_UNSPEC; + memset(&sc->sc_src, 0, sizeof(sc->sc_src)); + memset(&sc->sc_dst, 0, sizeof(sc->sc_dst)); + sc->sc_port = htons(0); + sc->sc_mode = VXLAN_TMODE_UNSET; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_set_vnetid(struct vxlan_softc *sc, const struct ifreq *ifr) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + uint32_t vni; + + if (ifr->ifr_vnetid < VXLAN_VNI_MIN || + ifr->ifr_vnetid > VXLAN_VNI_MAX) + return (EINVAL); + + vni = htonl(ifr->ifr_vnetid << VXLAN_VNI_SHIFT); + if (ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)) && + sc->sc_header.vxlan_id == vni) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + SET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)); + sc->sc_header.vxlan_id = vni; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_get_vnetid(struct vxlan_softc *sc, struct ifreq *ifr) +{ + uint32_t vni; + + if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I))) + return (EADDRNOTAVAIL); + + vni = ntohl(sc->sc_header.vxlan_id); + vni &= VXLAN_VNI_MASK; + vni >>= VXLAN_VNI_SHIFT; + + ifr->ifr_vnetid = vni; + + return (0); +} + +static int +vxlan_del_vnetid(struct vxlan_softc *sc) +{ + struct ifnet *ifp = 
&sc->sc_ac.ac_if; + + if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I))) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + CLR(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)); + sc->sc_header.vxlan_id = htonl(0 << VXLAN_VNI_SHIFT); + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_set_parent(struct vxlan_softc *sc, const struct if_parent *p) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct ifnet *ifp0; + int error = 0; + + ifp0 = if_unit(p->ifp_parent); + if (ifp0 == NULL) + return (ENXIO); + + if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) { + error = ENXIO; + goto put; } -#if NBRIDGE > 0 - if (brtag != NULL) - bridge_tunneluntag(m); -#endif + if (sc->sc_if_index0 == ifp0->if_index) + goto put; - m->m_pkthdr.ph_rtableid = sc->sc_rdomain; + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + error = EBUSY; + goto put; + } -#if NPF > 0 - pf_pkt_addr_changed(m); + /* commit */ + sc->sc_if_index0 = ifp0->if_index; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + +put: + if_put(ifp0); + return (error); +} + +static int +vxlan_get_parent(struct vxlan_softc *sc, struct if_parent *p) +{ + struct ifnet *ifp0; + int error = 0; + + ifp0 = if_get(sc->sc_if_index0); + if (ifp0 == NULL) + error = EADDRNOTAVAIL; + else + strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent)); + if_put(ifp0); + + return (error); +} + +static int +vxlan_del_parent(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + + if (sc->sc_if_index0 == 0) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + sc->sc_if_index0 = 0; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_add_addr(struct vxlan_softc *sc, const struct ifbareq *ifba) +{ + struct sockaddr_in *sin; +#ifdef INET6 + struct sockaddr_in6 *sin6; + struct sockaddr_in6 src6 = { + .sin6_len = sizeof(src6), + .sin6_family = AF_UNSPEC, + }; + int error; #endif + union 
vxlan_addr endpoint; + unsigned int type; + + switch (sc->sc_mode) { + case VXLAN_TMODE_UNSET: + return (ENOPROTOOPT); + case VXLAN_TMODE_P2P: + return (EPROTONOSUPPORT); + default: + break; + } + + /* ignore ifba_ifsname */ + + if (ISSET(ifba->ifba_flags, ~IFBAF_TYPEMASK)) + return (EINVAL); + switch (ifba->ifba_flags & IFBAF_TYPEMASK) { + case IFBAF_DYNAMIC: + type = EBE_DYNAMIC; + break; + case IFBAF_STATIC: + type = EBE_STATIC; + break; + default: + return (EINVAL); + } + + memset(&endpoint, 0, sizeof(endpoint)); - switch (af) { + if (ifba->ifba_dstsa.ss_family != sc->sc_af) + return (EAFNOSUPPORT); + switch (ifba->ifba_dstsa.ss_family) { case AF_INET: - error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, - &sc->sc_imo, NULL, 0); + sin = (struct sockaddr_in *)&ifba->ifba_dstsa; + if (in_nullhost(sin->sin_addr) || + IN_MULTICAST(sin->sin_addr.s_addr)) + return (EADDRNOTAVAIL); + + if (sin->sin_port != htons(0)) + return (EADDRNOTAVAIL); + + endpoint.in4 = sin->sin_addr; break; + #ifdef INET6 case AF_INET6: - error = ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL); + sin6 = (struct sockaddr_in6 *)&ifba->ifba_dstsa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || + IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return (EADDRNOTAVAIL); + + in6_recoverscope(&src6, &sc->sc_src.in6); + if (src6.sin6_scope_id != sin6->sin6_scope_id) + return (EADDRNOTAVAIL); + + if (sin6->sin6_port != htons(0)) + return (EADDRNOTAVAIL); + + error = in6_embedscope(&endpoint.in6, sin6, NULL); + if (error != 0) + return (error); + break; -#endif /* INET6 */ - default: - m_freem(m); - error = EAFNOSUPPORT; +#endif + default: /* AF_UNSPEC */ + return (EADDRNOTAVAIL); } - if (error) - ifp->if_oerrors++; + return (etherbridge_add_addr(&sc->sc_eb, &endpoint, + &ifba->ifba_dst, type)); +} - return (error); +static int +vxlan_del_addr(struct vxlan_softc *sc, const struct ifbareq *ifba) +{ + return (etherbridge_del_addr(&sc->sc_eb, &ifba->ifba_dst)); } void -vxlan_addr_change(void *arg) 
+vxlan_detach_hook(void *arg) { - struct vxlan_softc *sc = arg; - struct ifnet *ifp = &sc->sc_ac.ac_if; - int error; - - /* - * Reset the configuration after resume or any possible address - * configuration changes. - */ - if ((error = vxlan_config(ifp, NULL, NULL))) { - /* - * The source address of the tunnel can temporarily disappear, - * after a link state change when running the DHCP client, - * so keep it configured. - */ + struct vxlan_softc *sc = arg; + struct ifnet *ifp = &sc->sc_ac.ac_if; + + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + vxlan_down(sc); + CLR(ifp->if_flags, IFF_UP); } + + sc->sc_if_index0 = 0; } -void -vxlan_if_change(void *arg) +static int +vxlan_eb_port_eq(void *arg, void *a, void *b) { - struct vxlan_softc *sc = arg; - struct ifnet *ifp = &sc->sc_ac.ac_if; + const union vxlan_addr *va = a, *vb = b; + size_t i; - /* - * Reset the configuration after the parent interface disappeared. - */ - vxlan_multicast_cleanup(ifp); - memset(&sc->sc_src, 0, sizeof(sc->sc_src)); - memset(&sc->sc_dst, 0, sizeof(sc->sc_dst)); - sc->sc_dstport = htons(VXLAN_PORT); + for (i = 0; i < nitems(va->in6.s6_addr32); i++) { + if (va->in6.s6_addr32[i] != vb->in6.s6_addr32[i]) + return (0); + } + + return (1); } -void -vxlan_link_change(void *arg) +static void * +vxlan_eb_port_take(void *arg, void *port) { - struct vxlan_softc *sc = arg; - struct ifnet *ifp = &sc->sc_ac.ac_if; + union vxlan_addr *endpoint; + + endpoint = pool_get(&vxlan_endpoint_pool, PR_NOWAIT); + if (endpoint == NULL) + return (NULL); + + *endpoint = *(union vxlan_addr *)port; - /* - * The machine might have lost its multicast associations after - * link state changes. This fixes a problem with VMware after - * suspend/resume of the host or guest. 
- */ - (void)vxlan_config(ifp, NULL, NULL); + return (endpoint); } + +static void +vxlan_eb_port_rele(void *arg, void *port) +{ + union vxlan_addr *endpoint = port; + + pool_put(&vxlan_endpoint_pool, endpoint); +} + +static size_t +vxlan_eb_port_ifname(void *arg, char *dst, size_t len, void *port) +{ + struct vxlan_softc *sc = arg; + + return (strlcpy(dst, sc->sc_ac.ac_if.if_xname, len)); +} + +static void +vxlan_eb_port_sa(void *arg, struct sockaddr_storage *ss, void *port) +{ + struct vxlan_softc *sc = arg; + union vxlan_addr *endpoint = port; + + switch (sc->sc_af) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)ss; + + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = endpoint->in4; + break; + } +#ifdef INET6 + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss; + + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &endpoint->in6); + break; + } +#endif /* INET6 */ + default: + unhandled_af(sc->sc_af); + } +} + +static inline int +vxlan_peer_cmp(const struct vxlan_peer *ap, const struct vxlan_peer *bp) +{ + size_t i; + + if (ap->p_header.vxlan_id > bp->p_header.vxlan_id) + return (1); + if (ap->p_header.vxlan_id < bp->p_header.vxlan_id) + return (-1); + if (ap->p_header.vxlan_flags > bp->p_header.vxlan_flags) + return (1); + if (ap->p_header.vxlan_flags < bp->p_header.vxlan_flags) + return (-1); + + if (ap->p_mask || bp->p_mask) + return (0); + + for (i = 0; i < nitems(ap->p_addr.in6.s6_addr32); i++) { + if (ap->p_addr.in6.s6_addr32[i] > + bp->p_addr.in6.s6_addr32[i]) + return (1); + if (ap->p_addr.in6.s6_addr32[i] < + bp->p_addr.in6.s6_addr32[i]) + return (-1); + } + + return (0); +} + +RBT_GENERATE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp); Index: conf/files =================================================================== RCS file: /cvs/src/sys/conf/files,v retrieving revision 1.698 diff -u -p -r1.698 files --- conf/files 23 Feb 2021 03:30:04 
-0000 1.698 +++ conf/files 5 Mar 2021 06:22:43 -0000 @@ -573,7 +573,7 @@ pseudo-device mpip: ifnet, mpls pseudo-device bpe: ifnet, ether, ifmedia, etherbridge pseudo-device vether: ifnet, ether pseudo-device pppx: ifnet -pseudo-device vxlan: ifnet, ether, ifmedia +pseudo-device vxlan: ifnet, ether, etherbridge pseudo-device switch: ifnet, ether pseudo-device wg: ifnet @@ -840,7 +840,7 @@ file net/if_bpe.c bpe needs-count file net/if_vether.c vether file net/if_pair.c pair file net/if_pppx.c pppx needs-count -file net/if_vxlan.c vxlan needs-count +file net/if_vxlan.c vxlan file net/if_wg.c wg file net/wg_noise.c wg file net/wg_cookie.c wg