On Thu, Mar 04, 2021 at 03:36:19PM +1000, David Gwynne wrote:
> as the subject says, this is a rewrite of vxlan(4).
> 
> vxlan(4) relies on bridge(4) to implement learning, but i want to be
> able to remove bridge(4) one day. while working on veb(4), i wrote
> the guts of a learning bridge implementation that is now used by veb(4),
> bpe(4), and nvgre(4). that learning bridge code is now also used by
> vxlan(4).
> 
> this means that a few of the modes that the manpage talks about are
> different now. because vxlan doesn't need a bridge for learning, there's
> no "multicast mode" anymore; it just does "dynamic mode" out of the box
> when configured with a multicast destination address. there's no
> multipoint mode anymore either.
> 
> another thing that's always bothered me about vxlan(4) is how it occupies
> the "udp namespace" and how it steals packets from the udp stack.
> the new code actually creates and binds udp sockets to handle the
> vxlan packets. this means userland can't collide with a vxlan interface,
> and you get to see that the port is in use in things like netstat. e.g.:
> 
> dlg@ikkaku ~$ ifconfig vxlan0
> vxlan0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> mtu 1500
>       lladdr fe:e1:ba:d1:17:2a
>       index 11 llprio 3
>       encap: vnetid none parent aggr0 txprio 0 rxprio outer
>       groups: vxlan
>       tunnel: inet 192.0.2.36 port 4789 --> 239.0.0.1 ttl 1 nodf
>       Addresses (max cache: 100, timeout: 240):
>       inet 100.64.1.36 netmask 0xffffff00 broadcast 100.64.1.255
> dlg@ikkaku ~$ netstat -na -f inet -p udp
> Active Internet connections (including servers)
> Proto   Recv-Q Send-Q  Local Address          Foreign Address       
> udp          0      0  130.102.96.36.29742    129.250.35.250.123    
> udp          0      0  130.102.96.36.8965     162.159.200.123.123   
> udp          0      0  130.102.96.36.13189    162.159.200.1.123     
> udp          0      0  130.102.96.36.46580    220.158.215.20.123    
> udp          0      0  130.102.96.36.23109    103.38.121.36.123     
> udp          0      0  239.0.0.1.4789         *.*                   
> udp          0      0  192.0.2.36.4789        *.*                   
> 
> i've also added loop prevention, ie, sending an interface's vxlan
> packets over itself should fail rather than panic now.

here's an updated diff with a few fixes.

Index: netinet/udp_usrreq.c
===================================================================
RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v
retrieving revision 1.262
diff -u -p -r1.262 udp_usrreq.c
--- netinet/udp_usrreq.c        22 Aug 2020 17:54:57 -0000      1.262
+++ netinet/udp_usrreq.c        5 Mar 2021 06:22:43 -0000
@@ -112,11 +112,6 @@
 #include <net/pipex.h>
 #endif
 
-#include "vxlan.h"
-#if NVXLAN > 0
-#include <net/if_vxlan.h>
-#endif
-
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
@@ -350,15 +345,6 @@ udp_input(struct mbuf **mp, int *offp, i
                break;
 #endif /* INET6 */
        }
-
-#if NVXLAN > 0
-       if (vxlan_enable > 0 &&
-#if NPF > 0
-           !(m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) &&
-#endif
-           vxlan_lookup(m, uh, iphlen, &srcsa.sa, &dstsa.sa) != 0)
-               return IPPROTO_DONE;
-#endif
 
        if (m->m_flags & (M_BCAST|M_MCAST)) {
                struct inpcb *last;
Index: net/if_vxlan.c
===================================================================
RCS file: /cvs/src/sys/net/if_vxlan.c,v
retrieving revision 1.82
diff -u -p -r1.82 if_vxlan.c
--- net/if_vxlan.c      25 Feb 2021 02:48:21 -0000      1.82
+++ net/if_vxlan.c      5 Mar 2021 06:22:43 -0000
@@ -1,7 +1,7 @@
-/*     $OpenBSD: if_vxlan.c,v 1.82 2021/02/25 02:48:21 dlg Exp $       */
+/*     $OpenBSD$ */
 
 /*
- * Copyright (c) 2013 Reyk Floeter <r...@openbsd.org>
+ * Copyright (c) 2021 David Gwynne <d...@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -17,493 +17,759 @@
  */
 
 #include "bpfilter.h"
-#include "vxlan.h"
-#include "vlan.h"
 #include "pf.h"
-#include "bridge.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
-#include <sys/sockio.h>
 #include <sys/ioctl.h>
+#include <sys/timeout.h>
+#include <sys/pool.h>
+#include <sys/tree.h>
+#include <sys/refcnt.h>
+#include <sys/smr.h>
+
+#include <sys/socket.h>
+#include <sys/socketvar.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_dl.h>
 #include <net/if_media.h>
+#include <net/if_types.h>
 #include <net/route.h>
-
-#if NBPFILTER > 0
-#include <net/bpf.h>
-#endif
+#include <net/rtable.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
-#include <netinet/ip_var.h>
 #include <netinet/udp.h>
-#include <netinet/udp_var.h>
 #include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
 
-#if NPF > 0
-#include <net/pfvar.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_var.h>
 #endif
 
-#if NBRIDGE > 0
+/* for bridge stuff */
 #include <net/if_bridge.h>
+#include <net/if_etherbridge.h>
+
+#if NBPFILTER > 0
+#include <net/bpf.h>
 #endif
 
-#include <net/if_vxlan.h>
+/*
+ * The protocol.
+ */
+
+#define VXLANMTU               1492
+#define VXLAN_PORT             4789
+
+struct vxlan_header {                  /* 8 bytes, placed right after UDP */
+       uint32_t                vxlan_flags;    /* masked with htonl() */
+#define VXLAN_F_I                      (1U << 27)      /* RFC 7348 I flag */
+       uint32_t                vxlan_id;       /* VNI above the low 8 bits */
+#define VXLAN_VNI_SHIFT                        8
+#define        VXLAN_VNI_MASK                  (0xffffffU << VXLAN_VNI_SHIFT)
+};
+
+#define VXLAN_VNI_MAX                  0x00ffffff
+#define VXLAN_VNI_MIN                  0x00000000
+
+/*
+ * The driver.
+ */
+
+union vxlan_addr {                     /* outer tunnel address, v4 or v6 */
+       struct in_addr          in4;    /* read by vxlan_encap_ipv4() */
+       struct in6_addr         in6;    /* read by vxlan_encap_ipv6() */
+};
+
+struct vxlan_softc;
+
+struct vxlan_peer {                    /* lookup key -> owning interface */
+       RBT_ENTRY(vxlan_peer)    p_entry;       /* node in a vt_peers tree */
+
+       unsigned int             p_mask; /* do we use addr in the comparison */
+       union vxlan_addr         p_addr;        /* outer source address */
+       struct vxlan_header      p_header;      /* masked flags and VNI */
+
+       struct vxlan_softc      *p_sc;  /* interface vxlan_input() delivers to */
+};
+
+RBT_HEAD(vxlan_peers, vxlan_peer);
+
+struct vxlan_tep {                     /* arg passed to vxlan_input() */
+       TAILQ_ENTRY(vxlan_tep)   vt_entry;      /* presumably on vxlan_teps */
+
+       sa_family_t              vt_af;
+       unsigned int             vt_rdomain;
+       union vxlan_addr         vt_addr;
+#define vt_addr4 vt_addr.in4
+#define vt_addr6 vt_addr.in6
+       in_port_t                vt_port;
+
+       struct socket           *vt_so; /* presumably the bound UDP socket */
+
+       struct mutex             vt_mtx;        /* held around vt_peers lookups */
+       struct vxlan_peers       vt_peers;      /* peers searched on input */
+};
+
+TAILQ_HEAD(vxlan_teps, vxlan_tep);
+
+enum vxlan_tunnel_mode {               /* selects endpoint choice in encap */
+       VXLAN_TMODE_UNSET,       /* vxlan_encap() drops every packet */
+       VXLAN_TMODE_P2P,         /* unicast destination, no learning */
+       VXLAN_TMODE_LEARNING,    /* multicast destination, learning */
+       VXLAN_TMODE_ENDPOINT,    /* unset destination, no learning */
+};
 
 struct vxlan_softc {
        struct arpcom            sc_ac;
-       struct ifmedia           sc_media;
+       struct etherbridge       sc_eb;         /* MAC -> endpoint table */
+
+       unsigned int             sc_rdomain;    /* outer ph_rtableid */
+       sa_family_t              sc_af;         /* AF_UNSPEC until set */
+       union vxlan_addr         sc_src;        /* outer source address */
+       union vxlan_addr         sc_dst;        /* p2p or flood destination */
+       in_port_t                sc_port;       /* copied as-is into udphdr */
+       struct vxlan_header      sc_header;     /* copied into each packet */
+       unsigned int             sc_if_index0;  /* multicast output ifidx */
 
-       struct ip_moptions       sc_imo;
-       struct task              sc_atask;
-       struct task              sc_ltask;
        struct task              sc_dtask;
+       void                    *sc_inmulti;
+
+       enum vxlan_tunnel_mode   sc_mode;
+       struct vxlan_peer       *sc_ucast_peer;
+       struct vxlan_peer       *sc_mcast_peer;
+       struct refcnt            sc_refs;       /* held during vxlan_input() */
 
-       struct sockaddr_storage  sc_src;
-       struct sockaddr_storage  sc_dst;
-       in_port_t                sc_dstport;
-       u_int                    sc_rdomain;
-       int64_t                  sc_vnetid;
        uint16_t                 sc_df;
-       u_int8_t                 sc_ttl;
+       int                      sc_ttl;        /* outer ttl / hop limit */
        int                      sc_txhprio;
+       int                      sc_rxhprio;
 
-       struct task              sc_sendtask;
-
-       LIST_ENTRY(vxlan_softc)  sc_entry;
+       struct task              sc_send_task;  /* runs vxlan_send() */
 };
 
-void    vxlanattach(int);
-int     vxlanioctl(struct ifnet *, u_long, caddr_t);
-void    vxlanstart(struct ifnet *);
-int     vxlan_clone_create(struct if_clone *, int);
-int     vxlan_clone_destroy(struct ifnet *);
-void    vxlan_multicast_cleanup(struct ifnet *);
-int     vxlan_multicast_join(struct ifnet *, struct sockaddr *,
-           struct sockaddr *);
-int     vxlan_media_change(struct ifnet *);
-void    vxlan_media_status(struct ifnet *, struct ifmediareq *);
-int     vxlan_config(struct ifnet *, struct sockaddr *, struct sockaddr *);
-int     vxlan_output(struct ifnet *, struct mbuf *);
-void    vxlan_addr_change(void *);
-void    vxlan_if_change(void *);
-void    vxlan_link_change(void *);
-void    vxlan_send_dispatch(void *);
+void           vxlanattach(int);
+
+static int     vxlan_clone_create(struct if_clone *, int);
+static int     vxlan_clone_destroy(struct ifnet *);
+
+static int     vxlan_output(struct ifnet *, struct mbuf *,
+                   struct sockaddr *, struct rtentry *);
+static int     vxlan_enqueue(struct ifnet *, struct mbuf *);
+static void    vxlan_start(struct ifqueue *);
+static void    vxlan_send(void *);
+
+static int     vxlan_ioctl(struct ifnet *, u_long, caddr_t);
+static int     vxlan_up(struct vxlan_softc *);
+static int     vxlan_down(struct vxlan_softc *);
+static int     vxlan_addmulti(struct vxlan_softc *, struct ifnet *);
+static void    vxlan_delmulti(struct vxlan_softc *);
+
+static struct mbuf *
+               vxlan_input(void *, struct mbuf *,
+                   struct ip *, struct ip6_hdr *, void *, int);
+
+static int     vxlan_set_rdomain(struct vxlan_softc *, const struct ifreq *);
+static int     vxlan_get_rdomain(struct vxlan_softc *, struct ifreq *);
+static int     vxlan_set_tunnel(struct vxlan_softc *,
+                   const struct if_laddrreq *);
+static int     vxlan_get_tunnel(struct vxlan_softc *, struct if_laddrreq *);
+static int     vxlan_del_tunnel(struct vxlan_softc *);
+static int     vxlan_set_vnetid(struct vxlan_softc *, const struct ifreq *);
+static int     vxlan_get_vnetid(struct vxlan_softc *, struct ifreq *);
+static int     vxlan_del_vnetid(struct vxlan_softc *);
+static int     vxlan_set_parent(struct vxlan_softc *,
+                   const struct if_parent *);
+static int     vxlan_get_parent(struct vxlan_softc *, struct if_parent *);
+static int     vxlan_del_parent(struct vxlan_softc *);
+
+static int     vxlan_add_addr(struct vxlan_softc *, const struct ifbareq *);
+static int     vxlan_del_addr(struct vxlan_softc *, const struct ifbareq *);
 
-int     vxlan_sockaddr_cmp(struct sockaddr *, struct sockaddr *);
-uint16_t vxlan_sockaddr_port(struct sockaddr *);
+static void    vxlan_detach_hook(void *);
 
-struct if_clone        vxlan_cloner =
+static struct if_clone vxlan_cloner =
     IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy);
 
-int     vxlan_enable = 0;
-u_long  vxlan_tagmask;
+static int      vxlan_eb_port_eq(void *, void *, void *);
+static void    *vxlan_eb_port_take(void *, void *);
+static void     vxlan_eb_port_rele(void *, void *);
+static size_t   vxlan_eb_port_ifname(void *, char *, size_t, void *);
+static void     vxlan_eb_port_sa(void *, struct sockaddr_storage *, void *);
+
+static const struct etherbridge_ops vxlan_etherbridge_ops = { /* for sc_eb */
+       vxlan_eb_port_eq,
+       vxlan_eb_port_take,
+       vxlan_eb_port_rele,
+       vxlan_eb_port_ifname,
+       vxlan_eb_port_sa,
+};
+
+static struct rwlock vxlan_lock = RWLOCK_INITIALIZER("vteps");
+static struct vxlan_teps vxlan_teps = TAILQ_HEAD_INITIALIZER(vxlan_teps);
+static struct pool vxlan_endpoint_pool;
 
-#define VXLAN_TAGHASHSIZE               32
-#define VXLAN_TAGHASH(tag)              ((unsigned int)tag & vxlan_tagmask)
-LIST_HEAD(vxlan_taghash, vxlan_softc)  *vxlan_tagh, vxlan_any;
+static inline int      vxlan_peer_cmp(const struct vxlan_peer *,
+                           const struct vxlan_peer *);
+
+RBT_PROTOTYPE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
 
 void
 vxlanattach(int count)
 {
-       /* Regular vxlan interfaces with a VNI */
-       if ((vxlan_tagh = hashinit(VXLAN_TAGHASHSIZE, M_DEVBUF, M_NOWAIT,
-           &vxlan_tagmask)) == NULL)
-               panic("vxlanattach: hashinit");
-
-       /* multipoint-to-multipoint interfaces that accept any VNI */
-       LIST_INIT(&vxlan_any);
-
        if_clone_attach(&vxlan_cloner);
 }
 
-int
+static int     /* clone hook: allocate, initialise and attach vxlan<unit> */
 vxlan_clone_create(struct if_clone *ifc, int unit)
 {
-       struct ifnet            *ifp;
-       struct vxlan_softc      *sc;
+       struct vxlan_softc *sc;
+       struct ifnet *ifp;
+       int error;
+
+       if (vxlan_endpoint_pool.pr_size == 0) { /* first clone inits the pool */
+               pool_init(&vxlan_endpoint_pool, sizeof(union vxlan_addr),
+                   0, IPL_SOFTNET, 0, "vxlanep", NULL);
+       }
 
-       sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
-       sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
-           sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
-       sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
-       sc->sc_dstport = htons(VXLAN_PORT);
-       sc->sc_vnetid = VXLAN_VNI_UNSET;
-       sc->sc_txhprio = IFQ_TOS2PRIO(IPTOS_PREC_ROUTINE); /* 0 */
-       sc->sc_df = htons(0);
-       task_set(&sc->sc_atask, vxlan_addr_change, sc);
-       task_set(&sc->sc_ltask, vxlan_link_change, sc);
-       task_set(&sc->sc_dtask, vxlan_if_change, sc);
-       task_set(&sc->sc_sendtask, vxlan_send_dispatch, sc);
+       sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
+       if (sc == NULL)                 /* M_CANFAIL: allocation may fail */
+               return (ENOMEM);
 
        ifp = &sc->sc_ac.ac_if;
-       snprintf(ifp->if_xname, sizeof ifp->if_xname, "vxlan%d", unit);
-       ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
-       ether_fakeaddr(ifp);
 
-       ifp->if_softc = sc;
-       ifp->if_ioctl = vxlanioctl;
-       ifp->if_start = vxlanstart;
+       snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d",
+           ifc->ifc_name, unit);
 
-       ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
-       ifp->if_capabilities = IFCAP_VLAN_MTU;
-       ifp->if_xflags = IFXF_CLONED;
+       error = etherbridge_init(&sc->sc_eb, ifp->if_xname,
+           &vxlan_etherbridge_ops, sc);
+       if (error != 0) {               /* any failure; returned as an errno */
+               free(sc, M_DEVBUF, sizeof(*sc));
+               return (error);
+       }
+
+       sc->sc_af = AF_UNSPEC;          /* no tunnel addresses configured yet */
+       sc->sc_txhprio = 0;
+       sc->sc_rxhprio = IF_HDRPRIO_OUTER;
+       sc->sc_df = 0;
+       sc->sc_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+       task_set(&sc->sc_dtask, vxlan_detach_hook, sc);
+       refcnt_init(&sc->sc_refs);      /* finalized in vxlan_clone_destroy() */
+       task_set(&sc->sc_send_task, vxlan_send, sc);
 
-       ifmedia_init(&sc->sc_media, 0, vxlan_media_change,
-           vxlan_media_status);
-       ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
-       ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
+       ifp->if_softc = sc;
+       ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
+       ifp->if_ioctl = vxlan_ioctl;
+       ifp->if_output = vxlan_output;  /* tags packets for loop detection */
+       ifp->if_enqueue = vxlan_enqueue;
+       ifp->if_qstart = vxlan_start;
+       ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
+       ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
+       ether_fakeaddr(ifp);
 
        if_counters_alloc(ifp);
        if_attach(ifp);
        ether_ifattach(ifp);
 
-#if 0
-       /*
-        * Instead of using a decreased MTU of 1450 bytes, prefer
-        * to use the default Ethernet-size MTU of 1500 bytes and to
-        * increase the MTU of the outer transport interfaces to
-        * at least 1550 bytes. The following is disabled by default.
-        */
-       ifp->if_mtu = ETHERMTU - sizeof(struct ether_header);
-       ifp->if_mtu -= sizeof(struct vxlanudphdr) + sizeof(struct ipovly);
-#endif
-
-       LIST_INSERT_HEAD(&vxlan_tagh[VXLAN_TAGHASH(0)], sc, sc_entry);
-       vxlan_enable++;
-
        return (0);
 }
 
-int
+static int     /* clone hook: stop, detach and free the interface */
 vxlan_clone_destroy(struct ifnet *ifp)
 {
-       struct vxlan_softc      *sc = ifp->if_softc;
+       struct vxlan_softc *sc = ifp->if_softc;
 
        NET_LOCK();
-       vxlan_multicast_cleanup(ifp);
+       if (ISSET(ifp->if_flags, IFF_RUNNING))  /* only stop if it was up */
+               vxlan_down(sc);
        NET_UNLOCK();
 
-       vxlan_enable--;
-       LIST_REMOVE(sc, sc_entry);
-
-       ifmedia_delete_instance(&sc->sc_media, IFM_INST_ANY);
        ether_ifdetach(ifp);
        if_detach(ifp);
 
-       if (!task_del(net_tq(ifp->if_index), &sc->sc_sendtask))
-               taskq_barrier(net_tq(ifp->if_index));
+       etherbridge_destroy(&sc->sc_eb); /* NOTE(review): sc_send_task is not cancelled here; confirm it is quiesced elsewhere */
+
+       refcnt_finalize(&sc->sc_refs, "vxlanfini"); /* pairs with vxlan_take() */
 
-       free(sc->sc_imo.imo_membership, M_IPMOPTS,
-           sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
        free(sc, M_DEVBUF, sizeof(*sc));
 
        return (0);
 }
 
-void
-vxlan_multicast_cleanup(struct ifnet *ifp)
+static struct vxlan_softc *    /* take a reference on sc and return it */
+vxlan_take(struct vxlan_softc *sc)
+{
+       refcnt_take(&sc->sc_refs);      /* dropped again by vxlan_rele() */
+       return (sc);
+}
+
+static void    /* drop a reference obtained with vxlan_take() */
+vxlan_rele(struct vxlan_softc *sc)
 {
-       struct vxlan_softc      *sc = (struct vxlan_softc *)ifp->if_softc;
-       struct ip_moptions      *imo = &sc->sc_imo;
-       struct ifnet            *mifp;
+       refcnt_rele_wake(&sc->sc_refs); /* clone_destroy blocks in refcnt_finalize() */
+}
 
-       mifp = if_get(imo->imo_ifidx);
-       if (mifp != NULL) {
-               if_addrhook_del(mifp, &sc->sc_atask);
-               if_linkstatehook_del(mifp, &sc->sc_ltask);
-               if_detachhook_del(mifp, &sc->sc_dtask);
+static struct mbuf *   /* returns the encapsulated packet, or NULL (m freed) */
+vxlan_encap(struct vxlan_softc *sc, struct mbuf *m,
+    struct mbuf *(ip_encap)(struct vxlan_softc *sc, struct mbuf *,
+    const union vxlan_addr *, uint8_t))
+{
+       struct mbuf *m0;
+       union vxlan_addr gateway;       /* local copy of a learned endpoint */
+       const union vxlan_addr *endpoint;
+       struct vxlan_header *vh;
+       struct udphdr *uh;
+       int prio;
+       uint8_t tos;
 
-               if_put(mifp);
-       }
+       if (sc->sc_mode == VXLAN_TMODE_UNSET)   /* no tunnel configured */
+               goto drop;
 
-       if (imo->imo_num_memberships > 0) {
-               in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
-               imo->imo_ifidx = 0;
+       if (sc->sc_mode == VXLAN_TMODE_P2P)
+               endpoint = &sc->sc_dst;
+       else { /* VXLAN_TMODE_LEARNING || VXLAN_TMODE_ENDPOINT */
+               struct ether_header *eh = mtod(m, struct ether_header *);
+
+               smr_read_enter();
+               endpoint = etherbridge_resolve_ea(&sc->sc_eb,
+                   (struct ether_addr *)eh->ether_dhost);
+               if (endpoint != NULL) {
+                       gateway = *endpoint;    /* copy before smr_read_leave() */
+                       endpoint = &gateway;
+               }
+               smr_read_leave();
+
+               if (endpoint == NULL) {
+                       if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
+                               goto drop;
+
+                       /* "flood" to unknown destinations */
+                       endpoint = &sc->sc_dst;
+               }
        }
+
+       /* force prepend mbuf because of payload alignment */
+       m0 = m_get(M_DONTWAIT, m->m_type);
+       if (m0 == NULL)
+               goto drop;
+
+       m_align(m0, 0);
+       m0->m_len = 0;
+
+       M_MOVE_PKTHDR(m0, m);           /* m0 becomes the head of the chain */
+       m0->m_next = m;
+
+       m = m_prepend(m0, sizeof(*vh), M_DONTWAIT);
+       if (m == NULL)
+               return (NULL);
+
+       vh = mtod(m, struct vxlan_header *);
+       *vh = sc->sc_header;
+
+       m = m_prepend(m, sizeof(*uh), M_DONTWAIT);
+       if (m == NULL)
+               return (NULL);
+
+       uh = mtod(m, struct udphdr *);
+       uh->uh_sport = sc->sc_port; /* XXX */
+       uh->uh_dport = sc->sc_port;
+       htobem16(&uh->uh_ulen, m->m_pkthdr.len); /* udp hdr + vxlan + frame */
+       uh->uh_sum = htons(0);
+
+       SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT);
+
+       prio = sc->sc_txhprio;
+       if (prio == IF_HDRPRIO_PACKET)  /* use the packet's own priority */
+               prio = m->m_pkthdr.pf.prio;
+       tos = IFQ_PRIO2TOS(prio);
+
+       CLR(m->m_flags, M_BCAST|M_MCAST);
+       m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
+
+#if NPF > 0
+       pf_pkt_addr_changed(m);
+#endif
+
+       return ((*ip_encap)(sc, m, endpoint, tos)); /* add the outer IP header */
+drop:
+       m_freem(m);
+       return (NULL);
 }
 
-int
-vxlan_multicast_join(struct ifnet *ifp, struct sockaddr *src,
-    struct sockaddr *dst)
+static struct mbuf *   /* prepend the outer IPv4 header; NULL on failure */
+vxlan_encap_ipv4(struct vxlan_softc *sc, struct mbuf *m,
+    const union vxlan_addr *endpoint, uint8_t tos)
 {
-       struct vxlan_softc      *sc = ifp->if_softc;
-       struct ip_moptions      *imo = &sc->sc_imo;
-       struct sockaddr_in      *src4, *dst4;
-#ifdef INET6
-       struct sockaddr_in6     *dst6;
-#endif /* INET6 */
-       struct ifaddr           *ifa;
-       struct ifnet            *mifp;
+       struct ip *ip;
+
+       m = m_prepend(m, sizeof(*ip), M_DONTWAIT);
+       if (m == NULL)
+               return (NULL);
+
+       ip = mtod(m, struct ip *);      /* ip_id and ip_sum are not set here */
+       ip->ip_v = IPVERSION;
+       ip->ip_hl = sizeof(*ip) >> 2;
+       ip->ip_off = sc->sc_df;
+       ip->ip_tos = tos;
+       ip->ip_len = htons(m->m_pkthdr.len);    /* includes this IP header */
+       ip->ip_ttl = sc->sc_ttl;
+       ip->ip_p = IPPROTO_UDP;
+       ip->ip_src = sc->sc_src.in4;
+       ip->ip_dst = endpoint->in4;
+
+       return (m);
+}
 
-       switch (dst->sa_family) {
-       case AF_INET:
-               dst4 = satosin(dst);
-               if (!IN_MULTICAST(dst4->sin_addr.s_addr))
-                       return (0);
-               break;
 #ifdef INET6
-       case AF_INET6:
-               dst6 = satosin6(dst);
-               if (!IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr))
-                       return (0);
+static struct mbuf *   /* prepend the outer IPv6 header; NULL on failure */
+vxlan_encap_ipv6(struct vxlan_softc *sc, struct mbuf *m,
+    const union vxlan_addr *endpoint, uint8_t tos)
+{
+       struct ip6_hdr *ip6;
+       int len = m->m_pkthdr.len;      /* payload length before the prepend */
 
-               /* Multicast mode is currently not supported for IPv6 */
-               return (EAFNOSUPPORT);
+       m = m_prepend(m, sizeof(*ip6), M_DONTWAIT);
+       if (m == NULL)
+               return (NULL);
+
+       ip6 = mtod(m, struct ip6_hdr *);
+       ip6->ip6_flow = ISSET(m->m_pkthdr.csum_flags, M_FLOWID) ?
+           htonl(m->m_pkthdr.ph_flowid) : 0;
+       ip6->ip6_vfc |= IPV6_VERSION;
+       ip6->ip6_flow |= htonl((uint32_t)tos << 20);    /* traffic class bits */
+       ip6->ip6_plen = htons(len);
+       ip6->ip6_nxt = IPPROTO_UDP;
+       ip6->ip6_hlim = sc->sc_ttl;
+       ip6->ip6_src = sc->sc_src.in6;
+       ip6->ip6_dst = endpoint->in6;
+
+       if (sc->sc_df)
+               SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
+
+       return (m);
+}
 #endif /* INET6 */
-       default:
-               return (EAFNOSUPPORT);
+
+static int     /* if_output: reject looped packets, then ether_output() */
+vxlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+    struct rtentry *rt)
+{
+       struct m_tag *mtag;
+       int error = 0;
+
+       mtag = NULL;
+       while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) {
+               if (memcmp((caddr_t)(mtag + 1), &ifp->if_index,
+                   sizeof(ifp->if_index)) == 0) {
+                       error = EIO;    /* already went out this interface */
+                       goto drop;
+               }
        }
 
-       src4 = satosin(src);
-       dst4 = satosin(dst);
+       mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT);
+       if (mtag == NULL) {
+               error = ENOBUFS;
+               goto drop;
+       }
+       memcpy((caddr_t)(mtag + 1), &ifp->if_index, sizeof(ifp->if_index));
+       m_tag_prepend(m, mtag);         /* record our index for later checks */
 
-       if (src4->sin_addr.s_addr == INADDR_ANY ||
-           IN_MULTICAST(src4->sin_addr.s_addr))
-               return (EINVAL);
-       if ((ifa = ifa_ifwithaddr(src, sc->sc_rdomain)) == NULL ||
-           (mifp = ifa->ifa_ifp) == NULL ||
-           (mifp->if_flags & IFF_MULTICAST) == 0)
-               return (EADDRNOTAVAIL);
+       return (ether_output(ifp, m, dst, rt));
 
-       if ((imo->imo_membership[0] =
-           in_addmulti(&dst4->sin_addr, mifp)) == NULL)
-               return (ENOBUFS);
+drop:
+       m_freem(m);
+       return (error);
+}
 
-       imo->imo_num_memberships++;
-       imo->imo_ifidx = mifp->if_index;
-       if (sc->sc_ttl > 0)
-               imo->imo_ttl = sc->sc_ttl;
-       else
-               imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL;
-       imo->imo_loop = 0;
+static int     /* if_enqueue: queue on if_snd and schedule vxlan_send() */
+vxlan_enqueue(struct ifnet *ifp, struct mbuf *m)
+{
+       struct vxlan_softc *sc = ifp->if_softc;
+       struct ifqueue *ifq = &ifp->if_snd;
+
+       if (ifq_enqueue(ifq, m) != 0)   /* any enqueue failure -> ENOBUFS */
+               return (ENOBUFS);
 
-       /*
-        * Use interface hooks to track any changes on the interface
-        * that is used to send out the tunnel traffic as multicast.
-        */
-       if_addrhook_add(mifp, &sc->sc_atask);
-       if_linkstatehook_add(mifp, &sc->sc_ltask);
-       if_detachhook_add(mifp, &sc->sc_dtask);
+       task_add(ifq->ifq_softnet, &sc->sc_send_task); /* same task as vxlan_start() */
 
        return (0);
 }
 
-void
-vxlanstart(struct ifnet *ifp)
+static void    /* if_qstart: defer transmission to sc_send_task */
+vxlan_start(struct ifqueue *ifq)
 {
-       struct vxlan_softc      *sc = (struct vxlan_softc *)ifp->if_softc;
+       struct ifnet *ifp = ifq->ifq_if;
+       struct vxlan_softc *sc = ifp->if_softc;
 
-       task_add(net_tq(ifp->if_index), &sc->sc_sendtask);
+       task_add(ifq->ifq_softnet, &sc->sc_send_task); /* task runs vxlan_send() */
 }
 
-void
-vxlan_send_dispatch(void *xsc)
+static uint64_t        /* ip_output() every packet in ml; returns failure count */
+vxlan_send_ipv4(struct vxlan_softc *sc, struct mbuf_list *ml)
 {
-       struct vxlan_softc      *sc = xsc;
-       struct ifnet            *ifp = &sc->sc_ac.ac_if;
-       struct mbuf             *m;
-       struct mbuf_list         ml;
-
-       ml_init(&ml);
-       for (;;) {
-               m = ifq_dequeue(&ifp->if_snd);
-               if (m == NULL)
-                       break;
-
-#if NBPFILTER > 0
-               if (ifp->if_bpf)
-                       bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
-#endif
+       struct ip_moptions imo; /* NOTE(review): other fields stay uninitialised; confirm ip_output() ignores them */
+       struct mbuf *m;
+       uint64_t oerrors = 0;
+
+       imo.imo_ifidx = sc->sc_if_index0;
+       imo.imo_ttl = sc->sc_ttl;
+       imo.imo_loop = 0;
 
-               ml_enqueue(&ml, m);
+       NET_LOCK();                     /* one lock hold for the whole batch */
+       while ((m = ml_dequeue(ml)) != NULL) {
+               if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) != 0)
+                       oerrors++;
        }
+       NET_UNLOCK();
 
-       if (ml_empty(&ml))
-               return;
+       return (oerrors);
+}
+
+#ifdef INET6
+static uint64_t        /* ip6_output() every packet in ml; returns failure count */
+vxlan_send_ipv6(struct vxlan_softc *sc, struct mbuf_list *ml)
+{
+       struct ip6_moptions im6o; /* NOTE(review): only three fields are set; confirm the rest are unused */
+       struct mbuf *m;
+       uint64_t oerrors = 0;
+
+       im6o.im6o_ifidx = sc->sc_if_index0;
+       im6o.im6o_hlim = sc->sc_ttl;
+       im6o.im6o_loop = 0;
 
        NET_LOCK();
-       while ((m = ml_dequeue(&ml)) != NULL) {
-               vxlan_output(ifp, m);
+       while ((m = ml_dequeue(ml)) != NULL) {
+               if (ip6_output(m, NULL, NULL, 0, &im6o, NULL) != 0)
+                       oerrors++;
        }
        NET_UNLOCK();
-}
 
+       return (oerrors);
+}
+#endif /* INET6 */
 
-int
-vxlan_config(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst)
+static void    /* sc_send_task body: drain if_snd, encapsulate, transmit */
+vxlan_send(void *arg)
 {
-       struct vxlan_softc      *sc = (struct vxlan_softc *)ifp->if_softc;
-       int                      reset = 0, error, af;
-       socklen_t                slen;
-       in_port_t                port;
-       struct vxlan_taghash    *tagh;
-
-       if (src != NULL && dst != NULL) {
-               if ((af = src->sa_family) != dst->sa_family)
-                       return (EAFNOSUPPORT);
-       } else {
-               /* Reset current configuration */
-               af = sc->sc_src.ss_family;
-               src = sstosa(&sc->sc_src);
-               dst = sstosa(&sc->sc_dst);
-               reset = 1;
-       }
+       struct vxlan_softc *sc = arg;
+       struct ifnet *ifp = &sc->sc_ac.ac_if;
+       struct mbuf *(*ip_encap)(struct vxlan_softc *, struct mbuf *,
+           const union vxlan_addr *, uint8_t);
+       uint64_t (*ip_send)(struct vxlan_softc *, struct mbuf_list *);
+       struct mbuf_list ml = MBUF_LIST_INITIALIZER();
+       struct mbuf *m;
+       uint64_t oerrors;
+
+       if (!ISSET(ifp->if_flags, IFF_RUNNING)) /* queue is left untouched */
+               return;
 
-       switch (af) {
+       switch (sc->sc_af) {            /* pick the per-family helpers once */
        case AF_INET:
-               slen = sizeof(struct sockaddr_in);
+               ip_encap = vxlan_encap_ipv4;
+               ip_send = vxlan_send_ipv4;
                break;
 #ifdef INET6
        case AF_INET6:
-               slen = sizeof(struct sockaddr_in6);
+               ip_encap = vxlan_encap_ipv6;
+               ip_send = vxlan_send_ipv6;
                break;
-#endif /* INET6 */
+#endif
        default:
-               return (EAFNOSUPPORT);
+               unhandled_af(sc->sc_af);
+               /* NOTREACHED */
        }
 
-       if (src->sa_len != slen || dst->sa_len != slen)
-               return (EINVAL);
+       while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) {
+#if NBPFILTER > 0
+               caddr_t if_bpf = READ_ONCE(ifp->if_bpf);
+               if (if_bpf != NULL)     /* tap the inner frame before encap */
+                       bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT);
+#endif
+               m = vxlan_encap(sc, m, ip_encap);
+               if (m == NULL)          /* dropped in encap; not counted below */
+                       continue;
+
+               ml_enqueue(&ml, m);
+       }
 
-       vxlan_multicast_cleanup(ifp);
+       oerrors = (*ip_send)(sc, &ml);  /* sends the batch under NET_LOCK */
 
-       /* returns without error if multicast is not configured */
-       if ((error = vxlan_multicast_join(ifp, src, dst)) != 0)
-               return (error);
+       counters_add(ifp->if_counters, ifc_oerrors, oerrors);
+}
+
+static struct mbuf *   /* always consumes m and returns NULL */
+vxlan_input(void *arg, struct mbuf *m, struct ip *ip, struct ip6_hdr *ip6,
+    void *uhp, int hlen)
+{
+       struct vxlan_tep *vt = arg;
+       struct vxlan_peer key, *p;
+       struct udphdr *uh;
+       struct vxlan_header *vh;
+       struct ether_header *eh;
+       int vhlen = hlen + sizeof(*vh); /* bytes up to the inner frame */
+       struct mbuf *n;
+       int off;
+       in_port_t port;
+       struct vxlan_softc *sc = NULL;
+       struct ifnet *ifp;
+
+       if (m->m_pkthdr.len < vhlen)    /* too short for a vxlan header */
+               goto drop;
+
+       uh = uhp;
+       port = uh->uh_sport;            /* saved before m_pullup() below */
 
-       if ((port = vxlan_sockaddr_port(dst)) != 0)
-               sc->sc_dstport = port;
+       memset(&key, 0, sizeof(key));
+       key.p_mask = 0;
 
-       if (!reset) {
-               bzero(&sc->sc_src, sizeof(sc->sc_src));
-               bzero(&sc->sc_dst, sizeof(sc->sc_dst));
-               memcpy(&sc->sc_src, src, src->sa_len);
-               memcpy(&sc->sc_dst, dst, dst->sa_len);
+       if (ip != NULL)                 /* key on the outer source address */
+               key.p_addr.in4 = ip->ip_src;
+#ifdef INET6
+       else
+               key.p_addr.in6 = ip6->ip6_src;
+#endif
+
+       if (m->m_len < vhlen) {
+               m = m_pullup(m, vhlen);
+               if (m == NULL)
+                       return (NULL);
        }
 
-       if (sc->sc_vnetid == VXLAN_VNI_ANY) {
-               /*
-                * If the interface accepts any VNI, put it into a separate
-                * list that is not part of the main hash.
-                */
-               tagh = &vxlan_any;
-       } else
-               tagh = &vxlan_tagh[VXLAN_TAGHASH(sc->sc_vnetid)];
+       vh = (struct vxlan_header *)(mtod(m, caddr_t) + hlen);
+       key.p_header.vxlan_flags = vh->vxlan_flags & htonl(VXLAN_F_I);
+       key.p_header.vxlan_id = vh->vxlan_id & htonl(VXLAN_VNI_MASK);
 
-       LIST_REMOVE(sc, sc_entry);
-       LIST_INSERT_HEAD(tagh, sc, sc_entry);
+       mtx_enter(&vt->vt_mtx);
+       p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
+       if (p != NULL)
+               sc = vxlan_take(p->p_sc);       /* ref taken under vt_mtx */
+       mtx_leave(&vt->vt_mtx);
 
-       return (0);
+       if (sc == NULL)                 /* no interface matches this key */
+               goto drop;
+
+       ifp = &sc->sc_ac.ac_if;
+       if (ISSET(ifp->if_flags, IFF_LINK0) && port != sc->sc_port)
+               goto rele_drop;         /* link0: source port must match too */
+
+       m_adj(m, vhlen);                /* strip everything before the frame */
+
+       if (m->m_pkthdr.len < sizeof(*eh))
+               goto rele_drop;
+
+       if (m->m_len < sizeof(*eh)) {
+               m = m_pullup(m, sizeof(*eh));
+               if (m == NULL)
+                       goto rele;
+       }
+
+       n = m_getptr(m, sizeof(*eh), &off);
+       if (n == NULL)
+               goto rele_drop;
+
+       if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
+               n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT); /* realign payload */
+               m_freem(m);
+               if (n == NULL)
+                       goto rele;
+               m = n;
+       }
+
+       if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
+               eh = mtod(m, struct ether_header *);
+               etherbridge_map_ea(&sc->sc_eb, &key.p_addr,
+                   (struct ether_addr *)eh->ether_shost); /* learn src MAC */
+       }
+
+       /* XXX prio */
+
+       if_vinput(ifp, m);
+rele:
+       vxlan_rele(sc);
+       return (NULL);
+
+rele_drop:
+       vxlan_rele(sc);
+drop:
+       m_freem(m);
+       return (NULL);
 }
 
-int
-vxlanioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+static int
+vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
-       struct vxlan_softc      *sc = (struct vxlan_softc *)ifp->if_softc;
-       struct ifreq            *ifr = (struct ifreq *)data;
-       struct if_laddrreq      *lifr = (struct if_laddrreq *)data;
-       int                      error = 0;
+       struct vxlan_softc *sc = ifp->if_softc;
+       struct ifreq *ifr = (struct ifreq *)data;
+       struct ifbrparam *bparam = (struct ifbrparam *)data;
+       int error = 0;
 
        switch (cmd) {
        case SIOCSIFADDR:
-               ifp->if_flags |= IFF_UP;
-               /* FALLTHROUGH */
-
+               break;
        case SIOCSIFFLAGS:
-               if (ifp->if_flags & IFF_UP) {
-                       ifp->if_flags |= IFF_RUNNING;
+               if (ISSET(ifp->if_flags, IFF_UP)) {
+                       if (!ISSET(ifp->if_flags, IFF_RUNNING))
+                               error = vxlan_up(sc);
+                       else
+                               error = 0;
                } else {
-                       ifp->if_flags &= ~IFF_RUNNING;
+                       if (ISSET(ifp->if_flags, IFF_RUNNING))
+                               error = vxlan_down(sc);
                }
                break;
 
-       case SIOCADDMULTI:
-       case SIOCDELMULTI:
+       case SIOCSLIFPHYRTABLE:
+               error = vxlan_set_rdomain(sc, ifr);
                break;
-
-       case SIOCGIFMEDIA:
-       case SIOCSIFMEDIA:
-               error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
+       case SIOCGLIFPHYRTABLE:
+               error = vxlan_get_rdomain(sc, ifr);
                break;
 
        case SIOCSLIFPHYADDR:
-               error = vxlan_config(ifp,
-                   sstosa(&lifr->addr),
-                   sstosa(&lifr->dstaddr));
+               error = vxlan_set_tunnel(sc, (const struct if_laddrreq *)data);
+               break;
+       case SIOCGLIFPHYADDR:
+               error = vxlan_get_tunnel(sc, (struct if_laddrreq *)data);
                break;
-
        case SIOCDIFPHYADDR:
-               vxlan_multicast_cleanup(ifp);
-               bzero(&sc->sc_src, sizeof(sc->sc_src));
-               bzero(&sc->sc_dst, sizeof(sc->sc_dst));
-               sc->sc_dstport = htons(VXLAN_PORT);
+               error = vxlan_del_tunnel(sc);
                break;
 
-       case SIOCGLIFPHYADDR:
-               if (sc->sc_dst.ss_family == AF_UNSPEC) {
-                       error = EADDRNOTAVAIL;
-                       break;
-               }
-               bzero(&lifr->addr, sizeof(lifr->addr));
-               bzero(&lifr->dstaddr, sizeof(lifr->dstaddr));
-               memcpy(&lifr->addr, &sc->sc_src, sc->sc_src.ss_len);
-               memcpy(&lifr->dstaddr, &sc->sc_dst, sc->sc_dst.ss_len);
+       case SIOCSVNETID:
+               error = vxlan_set_vnetid(sc, ifr);
                break;
-
-       case SIOCSLIFPHYRTABLE:
-               if (ifr->ifr_rdomainid < 0 ||
-                   ifr->ifr_rdomainid > RT_TABLEID_MAX ||
-                   !rtable_exists(ifr->ifr_rdomainid)) {
-                       error = EINVAL;
-                       break;
-               }
-               sc->sc_rdomain = ifr->ifr_rdomainid;
-               (void)vxlan_config(ifp, NULL, NULL);
+       case SIOCGVNETID:
+               error = vxlan_get_vnetid(sc, ifr);
                break;
-
-       case SIOCGLIFPHYRTABLE:
-               ifr->ifr_rdomainid = sc->sc_rdomain;
+       case SIOCDVNETID:
+               error = vxlan_del_vnetid(sc);
                break;
 
-       case SIOCSLIFPHYTTL:
-               if (ifr->ifr_ttl < 0 || ifr->ifr_ttl > 0xff) {
-                       error = EINVAL;
-                       break;
-               }
-               if (sc->sc_ttl == (u_int8_t)ifr->ifr_ttl)
-                       break;
-               sc->sc_ttl = (u_int8_t)(ifr->ifr_ttl);
-               (void)vxlan_config(ifp, NULL, NULL);
+       case SIOCSIFPARENT:
+               error = vxlan_set_parent(sc, (struct if_parent *)data);
                break;
-
-       case SIOCGLIFPHYTTL:
-               ifr->ifr_ttl = (int)sc->sc_ttl;
-               break;
-
-       case SIOCSLIFPHYDF:
-               /* commit */
-               sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
-               break;
-       case SIOCGLIFPHYDF:
-               ifr->ifr_df = sc->sc_df ? 1 : 0;
+       case SIOCGIFPARENT:
+               error = vxlan_get_parent(sc, (struct if_parent *)data);
+               break;
+       case SIOCDIFPARENT:
+               error = vxlan_del_parent(sc);
                break;
 
        case SIOCSTXHPRIO:
-               if (ifr->ifr_hdrprio == IF_HDRPRIO_PACKET)
-                       ; /* fall through */
-               else if (ifr->ifr_hdrprio < IF_HDRPRIO_MIN ||
-                   ifr->ifr_hdrprio > IF_HDRPRIO_MAX) {
-                       error = EINVAL;
+               error = if_txhprio_l2_check(ifr->ifr_hdrprio);
+               if (error != 0)
                        break;
-               }
 
                sc->sc_txhprio = ifr->ifr_hdrprio;
                break;
@@ -511,35 +777,64 @@ vxlanioctl(struct ifnet *ifp, u_long cmd
                ifr->ifr_hdrprio = sc->sc_txhprio;
                break;
 
-       case SIOCSVNETID:
-               if (sc->sc_vnetid == ifr->ifr_vnetid)
+       case SIOCSRXHPRIO:
+               error = if_rxhprio_l2_check(ifr->ifr_hdrprio);
+               if (error != 0)
                        break;
 
-               if ((ifr->ifr_vnetid != VXLAN_VNI_ANY) &&
-                   (ifr->ifr_vnetid > VXLAN_VNI_MAX ||
-                    ifr->ifr_vnetid < VXLAN_VNI_MIN)) {
-                       error = EINVAL;
-                       break;
-               }
+               sc->sc_rxhprio = ifr->ifr_hdrprio;
+               break;
+       case SIOCGRXHPRIO:
+               ifr->ifr_hdrprio = sc->sc_rxhprio;
+               break;
 
-               sc->sc_vnetid = (int)ifr->ifr_vnetid;
-               (void)vxlan_config(ifp, NULL, NULL);
+       case SIOCSLIFPHYDF:
+               /* commit */
+               sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
+               break;
+       case SIOCGLIFPHYDF:
+               ifr->ifr_df = sc->sc_df ? 1 : 0;
                break;
 
-       case SIOCGVNETID:
-               if ((sc->sc_vnetid != VXLAN_VNI_ANY) &&
-                   (sc->sc_vnetid > VXLAN_VNI_MAX ||
-                    sc->sc_vnetid < VXLAN_VNI_MIN)) {
-                       error = EADDRNOTAVAIL;
+       case SIOCSLIFPHYTTL:
+               if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) {
+                       error = EINVAL;
                        break;
                }
 
-               ifr->ifr_vnetid = sc->sc_vnetid;
+               /* commit */
+               sc->sc_ttl = (uint8_t)ifr->ifr_ttl;
+               break;
+       case SIOCGLIFPHYTTL:
+               ifr->ifr_ttl = (int)sc->sc_ttl;
+               break;
+
+       case SIOCBRDGSCACHE:
+               error = etherbridge_set_max(&sc->sc_eb, bparam);
+               break;
+       case SIOCBRDGGCACHE:
+               error = etherbridge_get_max(&sc->sc_eb, bparam);
+               break;
+       case SIOCBRDGSTO:
+               error = etherbridge_set_tmo(&sc->sc_eb, bparam);
+               break;
+       case SIOCBRDGGTO:
+               error = etherbridge_get_tmo(&sc->sc_eb, bparam);
                break;
 
-       case SIOCDVNETID:
-               sc->sc_vnetid = VXLAN_VNI_UNSET;
-               (void)vxlan_config(ifp, NULL, NULL);
+       case SIOCBRDGRTS:
+               error = etherbridge_rtfind(&sc->sc_eb,
+                   (struct ifbaconf *)data);
+               break;
+       case SIOCBRDGFLUSH:
+               etherbridge_flush(&sc->sc_eb,
+                   ((struct ifbreq *)data)->ifbr_ifsflags);
+               break;
+       case SIOCBRDGSADDR:
+               error = vxlan_add_addr(sc, (struct ifbareq *)data);
+               break;
+       case SIOCBRDGDADDR:
+               error = vxlan_del_addr(sc, (struct ifbareq *)data);
                break;
 
        default:
@@ -550,465 +845,964 @@ vxlanioctl(struct ifnet *ifp, u_long cmd
        return (error);
 }
 
-int
-vxlan_media_change(struct ifnet *ifp)
+static struct vxlan_tep *
+vxlan_tep_get(struct vxlan_softc *sc, const union vxlan_addr *addr)
 {
-       return (0);
-}
+       struct vxlan_tep *vt;
 
-void
-vxlan_media_status(struct ifnet *ifp, struct ifmediareq *imr)
-{
-       imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
+       TAILQ_FOREACH(vt, &vxlan_teps, vt_entry) {      /* linear scan of the tep list */
+               if (sc->sc_af == vt->vt_af &&   /* match on af, rdomain, addr and port */
+                   sc->sc_rdomain == vt->vt_rdomain &&
+                   memcmp(addr, &vt->vt_addr, sizeof(*addr)) == 0 &&
+                   sc->sc_port == vt->vt_port)
+                       return (vt);
+       }
+
+       return (NULL);  /* no tep matches sc's af/rdomain/port with this addr */
 }
 
-int
-vxlan_sockaddr_cmp(struct sockaddr *srcsa, struct sockaddr *dstsa)
+static int
+vxlan_tep_add_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
+    struct vxlan_peer *p)
 {
-       struct sockaddr_in      *src4, *dst4;
+       struct mbuf m;          /* on-stack mbuf used to pass option/address data */
+       struct vxlan_tep *vt;
+       struct socket *so;
+       struct sockaddr_in *sin;
 #ifdef INET6
-       struct sockaddr_in6     *src6, *dst6;
-#endif /* INET6 */
+       struct sockaddr_in6 *sin6;
+#endif
+       int error;
+       int s;
 
-       if (srcsa->sa_family != dstsa->sa_family)
-               return (1);
+       vt = vxlan_tep_get(sc, addr);
+       if (vt != NULL) {       /* a tep already exists: just add the peer to it */
+               struct vxlan_peer *op;
+
+               mtx_enter(&vt->vt_mtx);
+               op = RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
+               mtx_leave(&vt->vt_mtx);
+
+               if (op != NULL) /* an equal peer is already in the tree */
+                       return (EADDRINUSE);
+
+               return (0);
+       }
 
-       switch (dstsa->sa_family) {
+       vt = malloc(sizeof(*vt), M_DEVBUF, M_NOWAIT|M_ZERO);
+       if (vt == NULL)
+               return (ENOMEM);
+
+       vt->vt_af = sc->sc_af;
+       vt->vt_rdomain = sc->sc_rdomain;
+       vt->vt_addr = *addr;
+       vt->vt_port = sc->sc_port;
+
+       mtx_init(&vt->vt_mtx, IPL_SOFTNET);
+       RBT_INIT(vxlan_peers, &vt->vt_peers);
+       RBT_INSERT(vxlan_peers, &vt->vt_peers, p);      /* vt is not on vxlan_teps yet */
+
+       error = socreate(vt->vt_af, &so, SOCK_DGRAM, IPPROTO_UDP);
+       if (error != 0)
+               goto free;
+
+       s = solock(so);
+
+       sotoinpcb(so)->inp_upcall = vxlan_input;        /* vxlan_input gets vt as its arg */
+       sotoinpcb(so)->inp_upcall_arg = vt;
+
+       m_inithdr(&m);
+       m.m_len = sizeof(vt->vt_rdomain);
+       *mtod(&m, unsigned int *) = vt->vt_rdomain;
+       error = sosetopt(so, SOL_SOCKET, SO_RTABLE, &m);
+       if (error != 0)
+               goto close;
+
+       m_inithdr(&m);  /* reuse m to carry the sockaddr for sobind() */
+       switch (vt->vt_af) {
        case AF_INET:
-               src4 = satosin(srcsa);
-               dst4 = satosin(dstsa);
-               if (src4->sin_addr.s_addr == dst4->sin_addr.s_addr)
-                       return (0);
+               sin = mtod(&m, struct sockaddr_in *);
+               memset(sin, 0, sizeof(*sin));
+               sin->sin_len = sizeof(*sin);
+               sin->sin_family = AF_INET;
+               sin->sin_addr = addr->in4;
+               sin->sin_port = vt->vt_port;
+
+               m.m_len = sizeof(*sin);
                break;
+
 #ifdef INET6
        case AF_INET6:
-               src6 = satosin6(srcsa);
-               dst6 = satosin6(dstsa);
-               if (IN6_ARE_ADDR_EQUAL(&src6->sin6_addr, &dst6->sin6_addr) &&
-                   src6->sin6_scope_id == dst6->sin6_scope_id)
-                       return (0);
+               sin6 = mtod(&m, struct sockaddr_in6 *);
+               memset(sin6, 0, sizeof(*sin6)); /* as for AF_INET: no stack garbage in unset fields */
+               sin6->sin6_len = sizeof(*sin6);
+               sin6->sin6_family = AF_INET6;
+               in6_recoverscope(sin6, &addr->in6);
+               sin6->sin6_port = vt->vt_port;  /* same value as sc->sc_port, copied above */
+               m.m_len = sizeof(*sin6);
                break;
-#endif /* INET6 */
+#endif
+       default:
+               unhandled_af(vt->vt_af);
        }
 
-       return (1);
+       error = sobind(so, &m, curproc);
+       if (error != 0)
+               goto close;
+
+       sounlock(so, s);
+
+       rw_assert_wrlock(&vxlan_lock);  /* vxlan_teps is modified under vxlan_lock */
+       TAILQ_INSERT_TAIL(&vxlan_teps, vt, vt_entry);
+
+       vt->vt_so = so;
+
+       return (0);
+
+close:
+       sounlock(so, s);
+       soclose(so, MSG_DONTWAIT);
+free:
+       free(vt, M_DEVBUF, sizeof(*vt));
+       return (error);
 }
 
-uint16_t
-vxlan_sockaddr_port(struct sockaddr *sa)
+static void
+vxlan_tep_del_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
+    struct vxlan_peer *p)
 {
-       struct sockaddr_in      *sin4;
-#ifdef INET6
-       struct sockaddr_in6     *sin6;
-#endif /* INET6 */
+       struct vxlan_tep *vt;
+       int empty;      /* set if p was the last peer on the tep */
 
-       switch (sa->sa_family) {
-       case AF_INET:
-               sin4 = satosin(sa);
-               return (sin4->sin_port);
-#ifdef INET6
-       case AF_INET6:
-               sin6 = satosin6(sa);
-               return (sin6->sin6_port);
-#endif /* INET6 */
-       default:
-               break;
-       }
+       vt = vxlan_tep_get(sc, addr);
+       if (vt == NULL) /* callers must only remove peers they added */
+               panic("unable to find vxlan_tep for peer %p (sc %p)", p, sc);
+
+       mtx_enter(&vt->vt_mtx);
+       RBT_REMOVE(vxlan_peers, &vt->vt_peers, p);
+       empty = RBT_EMPTY(vxlan_peers, &vt->vt_peers);
+       mtx_leave(&vt->vt_mtx);
+
+       if (!empty)     /* other peers still use this tep: keep it and its socket */
+               return;
+
+       rw_assert_wrlock(&vxlan_lock);  /* vxlan_teps is modified under vxlan_lock */
+       TAILQ_REMOVE(&vxlan_teps, vt, vt_entry);
+
+       soclose(vt->vt_so, MSG_DONTWAIT);
+       free(vt, M_DEVBUF, sizeof(*vt));
+}
+
+static int
+vxlan_tep_up(struct vxlan_softc *sc)
+{
+       struct vxlan_peer *up, *mp;     /* unicast and multicast peer entries */
+       int error;
+
+       up = malloc(sizeof(*up), M_DEVBUF, M_NOWAIT|M_ZERO);
+       if (up == NULL)
+               return (ENOMEM);
+
+       up->p_mask = (sc->sc_mode != VXLAN_TMODE_P2P);
+       up->p_addr = sc->sc_dst;
+       up->p_header = sc->sc_header;
+       up->p_sc = vxlan_take(sc);      /* released again on every error path */
+
+       error = vxlan_tep_add_addr(sc, &sc->sc_src, up);       /* tep on the local address */
+       if (error != 0)
+               goto freeup;
+
+       sc->sc_ucast_peer = up;
+
+       if (sc->sc_mode != VXLAN_TMODE_LEARNING)        /* only learning mode needs mp */
+               return (0);
+
+       mp = malloc(sizeof(*mp), M_DEVBUF, M_NOWAIT|M_ZERO);
+       if (mp == NULL) {
+               error = ENOMEM;
+               goto delup;
+       }
+
+       mp->p_mask = 1;
+       /* addr is masked, leave it as 0s */
+       mp->p_header = sc->sc_header;
+       mp->p_sc = vxlan_take(sc);
+
+       /* sc_dst is the multicast group to join; its peer entry is mp, not up */
+       error = vxlan_tep_add_addr(sc, &sc->sc_dst, mp);
+       if (error != 0)
+               goto freemp;
+
+       sc->sc_mcast_peer = mp;
 
        return (0);
+
+freemp:
+       vxlan_rele(mp->p_sc);
+       free(mp, M_DEVBUF, sizeof(*mp));
+delup:
+       vxlan_tep_del_addr(sc, &sc->sc_src, up);
+freeup:
+       vxlan_rele(up->p_sc);
+       free(up, M_DEVBUF, sizeof(*up));
+       return (error);
 }
 
-int
-vxlan_lookup(struct mbuf *m, struct udphdr *uh, int iphlen,
-    struct sockaddr *srcsa, struct sockaddr *dstsa)
-{
-       struct vxlan_softc      *sc = NULL, *sc_cand = NULL;
-       struct vxlan_header      v;
-       int                      vni;
-       struct ifnet            *ifp;
-       int                      skip;
-#if NBRIDGE > 0
-       struct bridge_tunneltag *brtag;
-#endif
-       struct mbuf             *n;
-       int                      off;
-
-       /* XXX Should verify the UDP port first before copying the packet */
-       skip = iphlen + sizeof(*uh);
-       if (m->m_pkthdr.len - skip < sizeof(v))
-               return (0);
-       m_copydata(m, skip, sizeof(v), &v);
-       skip += sizeof(v);
-
-       if (v.vxlan_flags & htonl(VXLAN_RESERVED1) ||
-           v.vxlan_id & htonl(VXLAN_RESERVED2))
-               return (0);
-
-       vni = ntohl(v.vxlan_id) >> VXLAN_VNI_S;
-       if ((v.vxlan_flags & htonl(VXLAN_FLAGS_VNI)) == 0) {
-               if (vni != 0)
-                       return (0);
+static void
+vxlan_tep_down(struct vxlan_softc *sc)
+{
+       struct vxlan_peer *up = sc->sc_ucast_peer;      /* peer on the sc_src tep */
 
-               vni = VXLAN_VNI_UNSET;
+       if (sc->sc_mode == VXLAN_TMODE_LEARNING) {      /* learning mode also has a peer on sc_dst */
+               struct vxlan_peer *mp = sc->sc_mcast_peer;
+               vxlan_tep_del_addr(sc, &sc->sc_dst, mp);
+               vxlan_rele(mp->p_sc);
+               free(mp, M_DEVBUF, sizeof(*mp));
        }
 
+       vxlan_tep_del_addr(sc, &sc->sc_src, up);
+       vxlan_rele(up->p_sc);   /* drop the softc reference the peer held */
+       free(up, M_DEVBUF, sizeof(*up));
+}
+
+static int
+vxlan_up(struct vxlan_softc *sc)
+{
+       struct ifnet *ifp = &sc->sc_ac.ac_if;
+       struct ifnet *ifp0 = NULL;      /* parent interface; only taken in learning mode */
+       int error;
+
+       KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
        NET_ASSERT_LOCKED();
-       /* First search for a vxlan(4) interface with the packet's VNI */
-       LIST_FOREACH(sc, &vxlan_tagh[VXLAN_TAGHASH(vni)], sc_entry) {
-               if ((uh->uh_dport == sc->sc_dstport) &&
-                   vni == sc->sc_vnetid &&
-                   sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid)) {
-                       sc_cand = sc;
-                       if (vxlan_sockaddr_cmp(srcsa, sstosa(&sc->sc_dst)) == 0)
-                               goto found;
-               }
+
+       if (sc->sc_af == AF_UNSPEC)     /* no tunnel addresses configured yet */
+               return (EDESTADDRREQ);
+       KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
+
+       NET_UNLOCK();   /* the net lock is released while sleeping on vxlan_lock */
+
+       error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
+       if (error != 0)
+               goto netlock;
+
+       NET_LOCK();
+       if (ISSET(ifp->if_flags, IFF_RUNNING)) {
+               /* something else beat us */
+               rw_exit(&vxlan_lock);
+               return (0);
        }
+       NET_UNLOCK();
 
-       /*
-        * Now loop through all the vxlan(4) interfaces that are configured
-        * to accept any VNI and operating in multipoint-to-multipoint mode
-        * that is used in combination with bridge(4) or switch(4).
-        * If a vxlan(4) interface has been found for the packet's VNI, this
-        * code is not reached as the other interface is more specific.
-        */
-       LIST_FOREACH(sc, &vxlan_any, sc_entry) {
-               if ((uh->uh_dport == sc->sc_dstport) &&
-                   (sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid))) {
-                       sc_cand = sc;
-                       goto found;
-               }
+       if (sc->sc_mode != VXLAN_TMODE_P2P) {
+               error = etherbridge_up(&sc->sc_eb);
+               if (error != 0)
+                       goto unlock;
        }
 
-       if (sc_cand) {
-               sc = sc_cand;
-               goto found;
+       if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
+               ifp0 = if_get(sc->sc_if_index0);
+               if (ifp0 == NULL) {
+                       error = ENXIO;
+                       goto down;
+               }
+
+               /* check again if multicast will work on top of the parent */
+               if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
+                       error = EPROTONOSUPPORT;
+                       goto put;
+               }
+
+               error = vxlan_addmulti(sc, ifp0);
+               if (error != 0)
+                       goto put;
+
+               /* Register callback if parent wants to unregister */
+               if_detachhook_add(ifp0, &sc->sc_dtask);
+       } else if (sc->sc_if_index0 != 0) {     /* a parent is only valid in learning mode */
+               error = EPROTONOSUPPORT;
+               goto down;
        }
 
-       /* not found */
+       error = vxlan_tep_up(sc);
+       if (error != 0)
+               goto del;
+
+       if_put(ifp0);
+
+       NET_LOCK();
+       SET(ifp->if_flags, IFF_RUNNING);
+       rw_exit(&vxlan_lock);
+
        return (0);
 
- found:
-       if (m->m_pkthdr.len < skip + sizeof(struct ether_header)) {
-               m_freem(m);
-               return (EINVAL);
+del:
+       /* only learning mode joined a group and hooked the parent above */
+       if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
+               if_detachhook_del(ifp0, &sc->sc_dtask);
+               vxlan_delmulti(sc);
+       }
+put:
+       if_put(ifp0);
+down:
+       if (sc->sc_mode != VXLAN_TMODE_P2P)
+               etherbridge_down(&sc->sc_eb);
+unlock:
+       rw_exit(&vxlan_lock);
+netlock:
+       NET_LOCK();     /* callers expect the net lock to be held on return */
+
+       return (error);
+}
+
+static int
+vxlan_down(struct vxlan_softc *sc)
+{
+       struct ifnet *ifp = &sc->sc_ac.ac_if;
+       struct ifnet *ifp0;     /* parent interface, looked up in learning mode */
+       int error;
+
+       KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
+       NET_UNLOCK();   /* the net lock is released while sleeping on vxlan_lock */
+
+       error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
+       if (error != 0) {
+               NET_LOCK();
+               return (error);
        }
 
-       m_adj(m, skip);
-       ifp = &sc->sc_ac.ac_if;
+       NET_LOCK();
+       if (!ISSET(ifp->if_flags, IFF_RUNNING)) {
+               /* something else beat us */
+               rw_exit(&vxlan_lock);
+               return (0);
+       }
+       NET_UNLOCK();
+
+       vxlan_tep_down(sc);     /* remove this interface's peer entries */
 
-#if NBRIDGE > 0
-       /* Store the tunnel src/dst IP and vni for the bridge or switch */
-       if ((ifp->if_bridgeidx != 0 || ifp->if_switchport != NULL) &&
-           srcsa->sa_family != AF_UNSPEC &&
-           ((brtag = bridge_tunneltag(m)) != NULL)) {
-               memcpy(&brtag->brtag_peer.sa, srcsa, srcsa->sa_len);
-               memcpy(&brtag->brtag_local.sa, dstsa, dstsa->sa_len);
-               brtag->brtag_id = vni;
+       if (sc->sc_mode == VXLAN_TMODE_LEARNING) {      /* undo the group join and detach hook */
+               vxlan_delmulti(sc);
+               ifp0 = if_get(sc->sc_if_index0);
+               if (ifp0 != NULL) {
+                       if_detachhook_del(ifp0, &sc->sc_dtask);
+               }
+               if_put(ifp0);
        }
-#endif
 
-       m->m_flags &= ~(M_BCAST|M_MCAST);
+       if (sc->sc_mode != VXLAN_TMODE_P2P)
+               etherbridge_down(&sc->sc_eb);
 
-#if NPF > 0
-       pf_pkt_addr_changed(m);
-#endif
-       if ((m->m_len < sizeof(struct ether_header)) &&
-           (m = m_pullup(m, sizeof(struct ether_header))) == NULL)
-               return (ENOBUFS);
+       taskq_del_barrier(ifp->if_snd.ifq_softnet, &sc->sc_send_task);
+       NET_LOCK();     /* reacquired before touching if_flags and returning */
+       CLR(ifp->if_flags, IFF_RUNNING);
+       rw_exit(&vxlan_lock);
 
-       n = m_getptr(m, sizeof(struct ether_header), &off);
-       if (n == NULL) {
-               m_freem(m);
-               return (EINVAL);
+       return (0);
+}
+
+static int
+vxlan_addmulti(struct vxlan_softc *sc, struct ifnet *ifp0)
+{
+       int error = 0;
+
+       NET_LOCK();     /* taken here: vxlan_up() calls this with the net lock released */
+
+       switch (sc->sc_af) {
+       case AF_INET:
+               sc->sc_inmulti = in_addmulti(&sc->sc_dst.in4, ifp0);    /* join sc_dst on ifp0 */
+               if (sc->sc_inmulti == NULL)
+                       error = EADDRNOTAVAIL;
+               break;
+#ifdef INET6
+       case AF_INET6:
+               sc->sc_inmulti = in6_addmulti(&sc->sc_dst.in6, ifp0, &error);   /* error is reported through the last argument */
+               break;
+#endif
+       default:
+               unhandled_af(sc->sc_af);
        }
-       if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
-               n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT);
-               /* Dispose of the original mbuf chain */
-               m_freem(m);
-               if (n == NULL)
-                       return (ENOBUFS);
-               m = n;
+
+       NET_UNLOCK();
+
+       return (error);
+}
+
+static void
+vxlan_delmulti(struct vxlan_softc *sc)
+{
+       NET_LOCK();     /* taken here: callers run with the net lock released */
+
+       switch (sc->sc_af) {
+       case AF_INET:
+               in_delmulti(sc->sc_inmulti);    /* sc_inmulti was stored by vxlan_addmulti() */
+               break;
+#ifdef INET6
+       case AF_INET6:
+               in6_delmulti(sc->sc_inmulti);
+               break;
+#endif
+       default:
+               unhandled_af(sc->sc_af);
        }
 
-       if_vinput(ifp, m);
+       sc->sc_inmulti = NULL; /* keep it tidy */
 
-       /* success */
-       return (1);
+       NET_UNLOCK();
 }
 
-struct mbuf *
-vxlan_encap4(struct ifnet *ifp, struct mbuf *m,
-    struct sockaddr *src, struct sockaddr *dst)
-{
-       struct vxlan_softc      *sc = (struct vxlan_softc *)ifp->if_softc;
-       struct ip               *ip;
-
-       /*
-        * Remove multicast and broadcast flags or encapsulated packet
-        * ends up as multicast or broadcast packet.
-        */
-       m->m_flags &= ~(M_BCAST|M_MCAST);
+static int
+vxlan_set_rdomain(struct vxlan_softc *sc, const struct ifreq *ifr)
+{
+       struct ifnet *ifp = &sc->sc_ac.ac_if;
 
-       M_PREPEND(m, sizeof(*ip), M_DONTWAIT);
-       if (m == NULL)
-               return (NULL);
+       if (ifr->ifr_rdomainid < 0 ||
+           ifr->ifr_rdomainid > RT_TABLEID_MAX)
+               return (EINVAL);
+       if (!rtable_exists(ifr->ifr_rdomainid))
+               return (EADDRNOTAVAIL);
 
-       ip = mtod(m, struct ip *);
-       ip->ip_v = IPVERSION;
-       ip->ip_hl = sizeof(struct ip) >> 2;
-       ip->ip_id = htons(ip_randomid());
-       ip->ip_off = sc->sc_df;
-       ip->ip_p = IPPROTO_UDP;
-       ip->ip_tos = IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ?
-           m->m_pkthdr.pf.prio : sc->sc_txhprio);
-       ip->ip_len = htons(m->m_pkthdr.len);
+       if (sc->sc_rdomain == ifr->ifr_rdomainid)       /* unchanged: nothing to do */
+               return (0);
 
-       ip->ip_src = satosin(src)->sin_addr;
-       ip->ip_dst = satosin(dst)->sin_addr;
+       if (ISSET(ifp->if_flags, IFF_RUNNING))  /* the tep is keyed on the rdomain while up */
+               return (EBUSY);
 
-       if (sc->sc_ttl > 0)
-               ip->ip_ttl = sc->sc_ttl;
-       else
-               ip->ip_ttl = IPDEFTTL;
+       /* commit */
+       sc->sc_rdomain = ifr->ifr_rdomainid;
+       etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);   /* flushed together with the rdomain change */
 
-       return (m);
+       return (0);
+}
+
+static int
+vxlan_get_rdomain(struct vxlan_softc *sc, struct ifreq *ifr)
+{
+       ifr->ifr_rdomainid = sc->sc_rdomain;    /* report the configured rdomain; always succeeds */
+
+       return (0);
 }
 
+static int
+vxlan_set_tunnel(struct vxlan_softc *sc, const struct if_laddrreq *req)
+{
+       struct ifnet *ifp = &sc->sc_ac.ac_if;
+       struct sockaddr *src = (struct sockaddr *)&req->addr;
+       struct sockaddr *dst = (struct sockaddr *)&req->dstaddr;
+       struct sockaddr_in *src4, *dst4;
 #ifdef INET6
-struct mbuf *
-vxlan_encap6(struct ifnet *ifp, struct mbuf *m,
-    struct sockaddr *src, struct sockaddr *dst)
-{
-       struct vxlan_softc      *sc = (struct vxlan_softc *)ifp->if_softc;
-       struct ip6_hdr          *ip6;
-       struct in6_addr         *in6a;
-       uint32_t                 flow;
-
-       /*
-        * Remove multicast and broadcast flags or encapsulated packet
-        * ends up as multicast or broadcast packet.
-        */
-       m->m_flags &= ~(M_BCAST|M_MCAST);
+       struct sockaddr_in6 *src6, *dst6;
+       int error;
+#endif
+       union vxlan_addr saddr, daddr;
+       unsigned int mode = VXLAN_TMODE_ENDPOINT;       /* stays ENDPOINT when dst is AF_UNSPEC */
+       in_port_t port = htons(VXLAN_PORT);     /* default unless src carries a port */
 
-       M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT);
-       if (m == NULL)
-               return (NULL);
+       memset(&saddr, 0, sizeof(saddr));       /* zeroed so the memcmp() below is well defined */
+       memset(&daddr, 0, sizeof(daddr));
 
-       flow = (uint32_t)IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ?
-           m->m_pkthdr.pf.prio : sc->sc_txhprio) << 20;
+       /* validate */
+       switch (src->sa_family) {
+       case AF_INET:
+               src4 = (struct sockaddr_in *)src;
+               if (in_nullhost(src4->sin_addr) ||
+                   IN_MULTICAST(src4->sin_addr.s_addr))
+                       return (EINVAL);
 
-       ip6 = mtod(m, struct ip6_hdr *);
-       ip6->ip6_flow = htonl(flow);
-       ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
-       ip6->ip6_vfc |= IPV6_VERSION;
-       ip6->ip6_nxt = IPPROTO_UDP;
-       ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
-       if (in6_embedscope(&ip6->ip6_src, satosin6(src), NULL) != 0)
-               goto drop;
-       if (in6_embedscope(&ip6->ip6_dst, satosin6(dst), NULL) != 0)
-               goto drop;
+               if (src4->sin_port != htons(0))
+                       port = src4->sin_port;
 
-       if (sc->sc_ttl > 0)
-               ip6->ip6_hlim = sc->sc_ttl;
-       else
-               ip6->ip6_hlim = ip6_defhlim;
+               if (dst->sa_family != AF_UNSPEC) {
+                       if (dst->sa_family != AF_INET)  /* src and dst families must match */
+                               return (EINVAL);
+
+                       dst4 = (struct sockaddr_in *)dst;
+                       if (in_nullhost(dst4->sin_addr))
+                               return (EINVAL);
+
+                       /* all good */
+                       mode = IN_MULTICAST(dst4->sin_addr.s_addr) ?    /* multicast dst selects learning mode */
+                           VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
+                       daddr.in4 = dst4->sin_addr;
+               }
 
-       if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)) {
-               if (in6_selectsrc(&in6a, satosin6(dst), NULL,
-                   sc->sc_rdomain) != 0)
-                       goto drop;
+               saddr.in4 = src4->sin_addr;
+               break;
+
+#ifdef INET6
+       case AF_INET6:
+               src6 = (struct sockaddr_in6 *)src;
+               if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) ||
+                   IN6_IS_ADDR_MULTICAST(&src6->sin6_addr))
+                       return (EINVAL);
+
+               if (src6->sin6_port != htons(0))
+                       port = src6->sin6_port;
 
-               ip6->ip6_src = *in6a;
+               if (dst->sa_family != AF_UNSPEC) {
+                       if (dst->sa_family != AF_INET6)
+                               return (EINVAL);
+
+                       dst6 = (struct sockaddr_in6 *)dst;
+                       if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr))
+                               return (EINVAL);
+
+                       if (src6->sin6_scope_id != dst6->sin6_scope_id)
+                               return (EINVAL);
+
+                       /* all good */
+                       mode = IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr) ?
+                           VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
+                       error = in6_embedscope(&daddr.in6, dst6, NULL);
+                       if (error != 0)
+                               return (error);
+               }
+
+               error = in6_embedscope(&saddr.in6, src6, NULL);
+               if (error != 0)
+                       return (error);
+
+               break;
+#endif
+       default:
+               return (EAFNOSUPPORT);
        }
 
-       if (sc->sc_df)
-               SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
+       if (memcmp(&sc->sc_src, &saddr, sizeof(sc->sc_src)) == 0 &&    /* same config: succeed without changes */
+           memcmp(&sc->sc_dst, &daddr, sizeof(sc->sc_dst)) == 0 &&
+           sc->sc_port == port)
+               return (0);
 
-       /*
-        * The UDP checksum of VXLAN packets should be set to zero,
-        * but the IPv6 UDP checksum is not optional.  There is an RFC 6539
-        * to relax the IPv6 UDP checksum requirement for tunnels, but it
-        * is currently not supported by most implementations.
-        */
-       m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT;
+       if (ISSET(ifp->if_flags, IFF_RUNNING))  /* a real change is refused while running */
+               return (EBUSY);
 
-       return (m);
+       /* commit */
+       sc->sc_af = src->sa_family;
+       sc->sc_src = saddr;
+       sc->sc_dst = daddr;
+       sc->sc_port = port;
+       sc->sc_mode = mode;
+       etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);   /* flushed together with the tunnel change */
 
-drop:
-       m_freem(m);
-       return (NULL);
+       return (0);
 }
-#endif /* INET6 */
 
-int
-vxlan_output(struct ifnet *ifp, struct mbuf *m)
+static int     /* report the configured tunnel addresses through req */
+vxlan_get_tunnel(struct vxlan_softc *sc, struct if_laddrreq *req)
 {
-       struct vxlan_softc      *sc = (struct vxlan_softc *)ifp->if_softc;
-       struct vxlanudphdr      *vu;
-       struct sockaddr         *src, *dst;
-#if NBRIDGE > 0
-       struct bridge_tunneltag *brtag;
-#endif
-       int                      error, af;
-       uint32_t                 tag;
-       struct mbuf             *m0;
-
-       /* VXLAN header, needs new mbuf because of alignment issues */
-       MGET(m0, M_DONTWAIT, m->m_type);
-       if (m0 == NULL) {
-               ifp->if_oerrors++;
-               return (ENOBUFS);
-       }
-       M_MOVE_PKTHDR(m0, m);
-       m0->m_next = m;
-       m = m0;
-       m_align(m, sizeof(*vu));
-       m->m_len = sizeof(*vu);
-       m->m_pkthdr.len += sizeof(*vu);
-
-       src = sstosa(&sc->sc_src);
-       dst = sstosa(&sc->sc_dst);
-       af = src->sa_family;
-
-       vu = mtod(m, struct vxlanudphdr *);
-       vu->vu_u.uh_sport = sc->sc_dstport;
-       vu->vu_u.uh_dport = sc->sc_dstport;
-       vu->vu_u.uh_ulen = htons(m->m_pkthdr.len);
-       vu->vu_u.uh_sum = 0;
-       tag = sc->sc_vnetid;
-
-#if NBRIDGE > 0
-       if ((brtag = bridge_tunnel(m)) != NULL) {
-               dst = &brtag->brtag_peer.sa;
-
-               /* If accepting any VNI, source ip address is from brtag */
-               if (sc->sc_vnetid == VXLAN_VNI_ANY) {
-                       src = &brtag->brtag_local.sa;
-                       tag = (uint32_t)brtag->brtag_id;
-                       af = src->sa_family;
-               }
-
-               if (dst->sa_family != af) {
-                       ifp->if_oerrors++;
-                       m_freem(m);
-                       return (EINVAL);
-               }
-       } else
+       struct sockaddr *dstaddr = (struct sockaddr *)&req->dstaddr;
+       struct sockaddr_in *sin;
+#ifdef INET6
+       struct sockaddr_in6 *sin6;
 #endif
-       if (sc->sc_vnetid == VXLAN_VNI_ANY) {
-               /*
-                * If accepting any VNI, build the vxlan header only by
-                * bridge_tunneltag or drop packet if the tag does not exist.
-                */
-               ifp->if_oerrors++;
-               m_freem(m);
-               return (ENETUNREACH);
-       }
 
-       if (sc->sc_vnetid != VXLAN_VNI_UNSET) {
-               vu->vu_v.vxlan_flags = htonl(VXLAN_FLAGS_VNI);
-               vu->vu_v.vxlan_id = htonl(tag << VXLAN_VNI_S);
-       } else {
-               vu->vu_v.vxlan_flags = htonl(0);
-               vu->vu_v.vxlan_id = htonl(0);
-       }
+       if (sc->sc_af == AF_UNSPEC)     /* no tunnel has been configured */
+               return (EADDRNOTAVAIL);
+       KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
+
+       memset(&req->addr, 0, sizeof(req->addr));
+       memset(&req->dstaddr, 0, sizeof(req->dstaddr));
 
-       switch (af) {
+       /*
+        * default to endpoint: dstaddr stays a bare AF_UNSPEC sockaddr of
+        * length 2 unless one of the cases below fills in a real peer
+        */
+       dstaddr->sa_len = 2;
+       dstaddr->sa_family = AF_UNSPEC;
+
+       switch (sc->sc_af) {
        case AF_INET:
-               m = vxlan_encap4(ifp, m, src, dst);
+               sin = (struct sockaddr_in *)&req->addr;
+               sin->sin_len = sizeof(*sin);
+               sin->sin_family = AF_INET;
+               sin->sin_addr = sc->sc_src.in4;
+               sin->sin_port = sc->sc_port;    /* only set on the local side */
+
+               if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
+                       break;  /* no peer: keep the AF_UNSPEC dstaddr */
+
+               sin = (struct sockaddr_in *)&req->dstaddr;
+               sin->sin_len = sizeof(*sin);
+               sin->sin_family = AF_INET;
+               sin->sin_addr = sc->sc_dst.in4;
                break;
+
 #ifdef INET6
        case AF_INET6:
-               m = vxlan_encap6(ifp, m, src, dst);
+               sin6 = (struct sockaddr_in6 *)&req->addr;
+               sin6->sin6_len = sizeof(*sin6);
+               sin6->sin6_family = AF_INET6;
+               in6_recoverscope(sin6, &sc->sc_src.in6);
+               sin6->sin6_port = sc->sc_port;  /* only set on the local side */
+
+               if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
+                       break;  /* no peer: keep the AF_UNSPEC dstaddr */
+
+               sin6 = (struct sockaddr_in6 *)&req->dstaddr;
+               sin6->sin6_len = sizeof(*sin6);
+               sin6->sin6_family = AF_INET6;
+               in6_recoverscope(sin6, &sc->sc_dst.in6);
                break;
-#endif /* INET6 */
+#endif
        default:
-               m_freem(m);
-               m = NULL;
+               unhandled_af(sc->sc_af);
        }
 
-       if (m == NULL) {
-               ifp->if_oerrors++;
-               return (ENOBUFS);
+       return (0);
+}
+
+/*
+ * Remove the tunnel configuration.
+ *
+ * Returns 0 straight away when no address family is configured and EBUSY
+ * while the interface is running.  Otherwise the address family, both
+ * addresses, the port and the mode are reset and every etherbridge entry
+ * is flushed.
+ */
+static int
+vxlan_del_tunnel(struct vxlan_softc *sc)
+{
+       if (sc->sc_af != AF_UNSPEC) {
+               if (ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
+                       return (EBUSY);
+
+               /* commit: back to the unconfigured state */
+               sc->sc_af = AF_UNSPEC;
+               memset(&sc->sc_src, 0, sizeof(sc->sc_src));
+               memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
+               sc->sc_port = htons(0);
+               sc->sc_mode = VXLAN_TMODE_UNSET;
+               etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
+       }
+
+       return (0);
+}
+
+/*
+ * Configure the VNI placed in the vxlan header.
+ *
+ * Returns EINVAL when ifr->ifr_vnetid is outside
+ * [VXLAN_VNI_MIN, VXLAN_VNI_MAX], 0 without touching anything when the
+ * same id is already set, and EBUSY while the interface is running.
+ * A successful change also flushes the etherbridge.
+ */
+static int
+vxlan_set_vnetid(struct vxlan_softc *sc, const struct ifreq *ifr)
+{
+       const uint32_t iflag = htonl(VXLAN_F_I);
+       uint32_t id;
+
+       if (!(ifr->ifr_vnetid >= VXLAN_VNI_MIN &&
+           ifr->ifr_vnetid <= VXLAN_VNI_MAX))
+               return (EINVAL);
+
+       /* the header stores the id shifted and in network byte order */
+       id = htonl(ifr->ifr_vnetid << VXLAN_VNI_SHIFT);
+
+       /* already configured with exactly this id: nothing to change */
+       if (ISSET(sc->sc_header.vxlan_flags, iflag) &&
+           sc->sc_header.vxlan_id == id)
+               return (0);
+
+       if (ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
+               return (EBUSY);
+
+       /* commit */
+       SET(sc->sc_header.vxlan_flags, iflag);
+       sc->sc_header.vxlan_id = id;
+       etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
+
+       return (0);
+}
+
+/*
+ * Report the configured VNI in ifr->ifr_vnetid.
+ *
+ * Returns EADDRNOTAVAIL when the VXLAN_F_I flag is not set in the header,
+ * i.e. no VNI is configured.
+ */
+static int
+vxlan_get_vnetid(struct vxlan_softc *sc, struct ifreq *ifr)
+{
+       uint32_t id;
+
+       if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
+               return (EADDRNOTAVAIL);
+
+       /* undo the network-order, shifted encoding used in the header */
+       id = ntohl(sc->sc_header.vxlan_id) & VXLAN_VNI_MASK;
+       id >>= VXLAN_VNI_SHIFT;
+       ifr->ifr_vnetid = id;
+
+       return (0);
+}
+
+/*
+ * Remove the configured VNI.
+ *
+ * Returns 0 straight away when the VXLAN_F_I flag is not set and EBUSY
+ * while the interface is running.  Otherwise the flag and the id in the
+ * header are cleared and the etherbridge is flushed.
+ */
+static int
+vxlan_del_vnetid(struct vxlan_softc *sc)
+{
+       const uint32_t iflag = htonl(VXLAN_F_I);
+
+       if (ISSET(sc->sc_header.vxlan_flags, iflag)) {
+               if (ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
+                       return (EBUSY);
+
+               /* commit */
+               CLR(sc->sc_header.vxlan_flags, iflag);
+               sc->sc_header.vxlan_id = htonl(0 << VXLAN_VNI_SHIFT);
+               etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
+       }
+
+       return (0);
+}
+/*
+ * Select the interface named p->ifp_parent as the parent.
+ *
+ * Returns ENXIO if no such interface exists or it lacks IFF_MULTICAST,
+ * 0 without changes if it already is the parent, and EBUSY if the vxlan
+ * interface is running.  On success the parent's index is stored and the
+ * etherbridge is flushed.  Every path taken after if_unit() succeeded
+ * leaves through "put", so if_put() is always called on ifp0.
+ */
+static int
+vxlan_set_parent(struct vxlan_softc *sc, const struct if_parent *p)
+{
+       struct ifnet *ifp = &sc->sc_ac.ac_if;
+       struct ifnet *ifp0;
+       int error = 0;
+
+       ifp0 = if_unit(p->ifp_parent);
+       if (ifp0 == NULL)
+               return (ENXIO);
+
+       if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
+               error = ENXIO;
+               goto put;
        }
 
-#if NBRIDGE > 0
-       if (brtag != NULL)
-               bridge_tunneluntag(m);
-#endif
+       if (sc->sc_if_index0 == ifp0->if_index)
+               goto put;       /* unchanged; error is still 0 */
 
-       m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
+       if (ISSET(ifp->if_flags, IFF_RUNNING)) {
+               error = EBUSY;
+               goto put;
+       }
 
-#if NPF > 0
-       pf_pkt_addr_changed(m);
+       /* commit */
+       sc->sc_if_index0 = ifp0->if_index;
+       etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
+
+put:
+       if_put(ifp0);
+       return (error);
+}
+
+/*
+ * Copy the name of the parent interface into p->ifp_parent.
+ *
+ * Returns EADDRNOTAVAIL when if_get() finds no interface for the stored
+ * parent index.  if_put() is called on the lookup result either way.
+ */
+static int
+vxlan_get_parent(struct vxlan_softc *sc, struct if_parent *p)
+{
+       struct ifnet *parent = if_get(sc->sc_if_index0);
+       int rv = EADDRNOTAVAIL;
+
+       if (parent != NULL) {
+               strlcpy(p->ifp_parent, parent->if_xname,
+                   sizeof(p->ifp_parent));
+               rv = 0;
+       }
+       if_put(parent);
+
+       return (rv);
+}
+
+/*
+ * Forget the parent interface.
+ *
+ * Returns 0 straight away when no parent index is stored and EBUSY while
+ * the interface is running.  Otherwise the index is reset to 0 and the
+ * etherbridge is flushed.
+ */
+static int
+vxlan_del_parent(struct vxlan_softc *sc)
+{
+       if (sc->sc_if_index0 != 0) {
+               if (ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
+                       return (EBUSY);
+
+               /* commit */
+               sc->sc_if_index0 = 0;
+               etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
+       }
+
+       return (0);
+}
+/*
+ * Add an entry to the etherbridge (sc_eb) that maps the address in
+ * ifba->ifba_dst to the tunnel endpoint given in ifba->ifba_dstsa.
+ *
+ * Errors: ENOPROTOOPT with no tunnel mode set, EPROTONOSUPPORT in P2P
+ * mode, EINVAL for flags outside IFBAF_TYPEMASK or an unknown type,
+ * EAFNOSUPPORT if the endpoint family differs from sc_af, and
+ * EADDRNOTAVAIL for an unspecified or multicast endpoint, a non-zero
+ * port, or (IPv6) a scope id different from the tunnel source's.  A
+ * failure of in6_embedscope() is passed through.  Otherwise the result
+ * of etherbridge_add_addr() is returned.
+ */
+static int
+vxlan_add_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
+{
+       struct sockaddr_in *sin;
+#ifdef INET6
+       struct sockaddr_in6 *sin6;
+       struct sockaddr_in6 src6 = {
+               .sin6_len = sizeof(src6),
+               .sin6_family = AF_UNSPEC,
+       };
+       int error;
 #endif
+       union vxlan_addr endpoint;
+       unsigned int type;
+
+       switch (sc->sc_mode) {
+       case VXLAN_TMODE_UNSET:
+               return (ENOPROTOOPT);
+       case VXLAN_TMODE_P2P:
+               return (EPROTONOSUPPORT);
+       default:
+               break;
+       }
+
+       /* ignore ifba_ifsname */
+
+       if (ISSET(ifba->ifba_flags, ~IFBAF_TYPEMASK))
+               return (EINVAL);
+       switch (ifba->ifba_flags & IFBAF_TYPEMASK) {
+       case IFBAF_DYNAMIC:
+               type = EBE_DYNAMIC;
+               break;
+       case IFBAF_STATIC:
+               type = EBE_STATIC;
+               break;
+       default:
+               return (EINVAL);
+       }
+
+       memset(&endpoint, 0, sizeof(endpoint)); /* in4 fills only part */
 
-       switch (af) {
+       if (ifba->ifba_dstsa.ss_family != sc->sc_af)
+               return (EAFNOSUPPORT);
+       switch (ifba->ifba_dstsa.ss_family) {
        case AF_INET:
-               error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
-                   &sc->sc_imo, NULL, 0);
+               sin = (struct sockaddr_in *)&ifba->ifba_dstsa;
+               if (in_nullhost(sin->sin_addr) ||
+                   IN_MULTICAST(sin->sin_addr.s_addr))
+                       return (EADDRNOTAVAIL);
+
+               if (sin->sin_port != htons(0))
+                       return (EADDRNOTAVAIL);
+
+               endpoint.in4 = sin->sin_addr;
                break;
+
 #ifdef INET6
        case AF_INET6:
-               error = ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL);
+               sin6 = (struct sockaddr_in6 *)&ifba->ifba_dstsa;
+               if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
+                   IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
+                       return (EADDRNOTAVAIL);
+
+               in6_recoverscope(&src6, &sc->sc_src.in6);
+               if (src6.sin6_scope_id != sin6->sin6_scope_id)
+                       return (EADDRNOTAVAIL);
+
+               if (sin6->sin6_port != htons(0))
+                       return (EADDRNOTAVAIL);
+
+               error = in6_embedscope(&endpoint.in6, sin6, NULL);
+               if (error != 0)
+                       return (error);
+
                break;
-#endif /* INET6 */
-       default:
-               m_freem(m);
-               error = EAFNOSUPPORT;
+#endif
+       default: /* AF_UNSPEC */
+               return (EADDRNOTAVAIL);
        }
 
-       if (error)
-               ifp->if_oerrors++;
+       return (etherbridge_add_addr(&sc->sc_eb, &endpoint,
+           &ifba->ifba_dst, type));
+}
 
-       return (error);
+static int     /* remove ifba->ifba_dst from the etherbridge (sc_eb) */
+vxlan_del_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
+{
+       return (etherbridge_del_addr(&sc->sc_eb, &ifba->ifba_dst));
 }
 
 void
-vxlan_addr_change(void *arg)
+vxlan_detach_hook(void *arg)   /* arg is the vxlan_softc; presumably run when the parent detaches -- confirm */
 {
-       struct vxlan_softc      *sc = arg;
-       struct ifnet            *ifp = &sc->sc_ac.ac_if;
-       int                      error;
-
-       /*
-        * Reset the configuration after resume or any possible address
-        * configuration changes.
-        */
-       if ((error = vxlan_config(ifp, NULL, NULL))) {
-               /*
-                * The source address of the tunnel can temporarily disappear,
-                * after a link state change when running the DHCP client,
-                * so keep it configured.
-                */
+       struct vxlan_softc *sc = arg;
+       struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+       if (ISSET(ifp->if_flags, IFF_RUNNING)) {
+               vxlan_down(sc);
+               CLR(ifp->if_flags, IFF_UP);     /* a running interface is also marked !UP */
        }
+
+       sc->sc_if_index0 = 0;   /* 0 is what vxlan_del_parent() treats as "no parent" */
 }
 
-void
-vxlan_if_change(void *arg)
+static int     /*
+                * Endpoint equality test: returns 1 when a and b hold the
+                * same address and 0 otherwise.  All of the in6 member is
+                * compared whatever the address family is; vxlan_add_addr()
+                * zeroes the union before filling in4, so IPv4 endpoints
+                * built there compare on zeroed padding (other producers
+                * are assumed to do the same -- confirm).  arg is unused.
+                */
+vxlan_eb_port_eq(void *arg, void *a, void *b)
 {
-       struct vxlan_softc      *sc = arg;
-       struct ifnet            *ifp = &sc->sc_ac.ac_if;
+       const union vxlan_addr *va = a, *vb = b;
+       size_t i;
 
-       /*
-        * Reset the configuration after the parent interface disappeared.
-        */
-       vxlan_multicast_cleanup(ifp);
-       memset(&sc->sc_src, 0, sizeof(sc->sc_src));
-       memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
-       sc->sc_dstport = htons(VXLAN_PORT);
+       for (i = 0; i < nitems(va->in6.s6_addr32); i++) {
+               if (va->in6.s6_addr32[i] != vb->in6.s6_addr32[i])
+                       return (0);
+       }
+
+       return (1);
 }
 
-void
-vxlan_link_change(void *arg)
+static void *  /*
+                * Returns a copy of the endpoint address *port allocated
+                * from vxlan_endpoint_pool, or NULL when pool_get() (called
+                * with PR_NOWAIT) returns NULL.  vxlan_eb_port_rele() gives
+                * such a copy back to the pool.  arg is unused.
+                */
+vxlan_eb_port_take(void *arg, void *port)
 {
-       struct vxlan_softc      *sc = arg;
-       struct ifnet            *ifp = &sc->sc_ac.ac_if;
+       union vxlan_addr *endpoint;
+
+       endpoint = pool_get(&vxlan_endpoint_pool, PR_NOWAIT);
+       if (endpoint == NULL)
+               return (NULL);
+
+       *endpoint = *(union vxlan_addr *)port;
 
-       /*
-        * The machine might have lost its multicast associations after
-        * link state changes.  This fixes a problem with VMware after
-        * suspend/resume of the host or guest.
-        */
-       (void)vxlan_config(ifp, NULL, NULL);
+       return (endpoint);
 }
+
+/*
+ * Give an endpoint address back to vxlan_endpoint_pool; the counterpart
+ * of vxlan_eb_port_take().  arg is unused.
+ */
+static void
+vxlan_eb_port_rele(void *arg, void *port)
+{
+       pool_put(&vxlan_endpoint_pool, port);
+}
+
+/*
+ * Write the interface name for an endpoint into dst (at most len bytes,
+ * via strlcpy) and return strlcpy's result.  The name is always that of
+ * the vxlan interface itself; port is not looked at.
+ */
+static size_t
+vxlan_eb_port_ifname(void *arg, char *dst, size_t len, void *port)
+{
+       const struct vxlan_softc *sc = arg;
+       const char *xname = sc->sc_ac.ac_if.if_xname;
+
+       return (strlcpy(dst, xname, len));
+}
+
+/*
+ * Express the endpoint address in port as a sockaddr in ss, using the
+ * tunnel's address family (sc_af).  Only the length, family and address
+ * (plus scope for IPv6, via in6_recoverscope) are written; any other
+ * family ends up in unhandled_af().
+ */
+static void
+vxlan_eb_port_sa(void *arg, struct sockaddr_storage *ss, void *port)
+{
+       struct vxlan_softc *sc = arg;
+       union vxlan_addr *addr = port;
+       struct sockaddr_in *sin;
+#ifdef INET6
+       struct sockaddr_in6 *sin6;
+#endif
+
+       switch (sc->sc_af) {
+       case AF_INET:
+               sin = (struct sockaddr_in *)ss;
+               sin->sin_len = sizeof(*sin);
+               sin->sin_family = AF_INET;
+               sin->sin_addr = addr->in4;
+               break;
+#ifdef INET6
+       case AF_INET6:
+               sin6 = (struct sockaddr_in6 *)ss;
+               sin6->sin6_len = sizeof(*sin6);
+               sin6->sin6_family = AF_INET6;
+               in6_recoverscope(sin6, &addr->in6);
+               break;
+#endif /* INET6 */
+       default:
+               unhandled_af(sc->sc_af);
+       }
+}
+
+/*
+ * Ordering used for the vxlan_peers tree: first by vxlan_id, then by
+ * vxlan_flags (both compared as stored in the header), then by address
+ * one 32-bit word at a time.  If either side has p_mask set the address
+ * is not compared at all, so such a peer is equal to any peer carrying
+ * the same header.
+ */
+static inline int
+vxlan_peer_cmp(const struct vxlan_peer *ap, const struct vxlan_peer *bp)
+{
+       size_t w;
+
+       if (ap->p_header.vxlan_id != bp->p_header.vxlan_id) {
+               return (ap->p_header.vxlan_id > bp->p_header.vxlan_id ?
+                   1 : -1);
+       }
+       if (ap->p_header.vxlan_flags != bp->p_header.vxlan_flags) {
+               return (ap->p_header.vxlan_flags >
+                   bp->p_header.vxlan_flags ? 1 : -1);
+       }
+
+       if (ap->p_mask || bp->p_mask)
+               return (0);
+
+       for (w = 0; w < nitems(ap->p_addr.in6.s6_addr32); w++) {
+               if (ap->p_addr.in6.s6_addr32[w] ==
+                   bp->p_addr.in6.s6_addr32[w])
+                       continue;
+
+               return (ap->p_addr.in6.s6_addr32[w] >
+                   bp->p_addr.in6.s6_addr32[w] ? 1 : -1);
+       }
+
+       return (0);
+}
+
+RBT_GENERATE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
Index: conf/files
===================================================================
RCS file: /cvs/src/sys/conf/files,v
retrieving revision 1.698
diff -u -p -r1.698 files
--- conf/files  23 Feb 2021 03:30:04 -0000      1.698
+++ conf/files  5 Mar 2021 06:22:43 -0000
@@ -573,7 +573,7 @@ pseudo-device mpip: ifnet, mpls
 pseudo-device bpe: ifnet, ether, ifmedia, etherbridge
 pseudo-device vether: ifnet, ether
 pseudo-device pppx: ifnet
-pseudo-device vxlan: ifnet, ether, ifmedia
+pseudo-device vxlan: ifnet, ether, etherbridge
 pseudo-device switch: ifnet, ether
 pseudo-device wg: ifnet
 
@@ -840,7 +840,7 @@ file net/if_bpe.c                   bpe                     needs-count
 file net/if_vether.c                   vether
 file net/if_pair.c                     pair
 file net/if_pppx.c                     pppx                    needs-count
-file net/if_vxlan.c                    vxlan                   needs-count
+file net/if_vxlan.c                    vxlan
 file net/if_wg.c                       wg
 file net/wg_noise.c                    wg
 file net/wg_cookie.c                   wg

Reply via email to