From: Toshiaki Makita <makita.toshi...@lab.ntt.co.jp>

This allows a NIC's XDP program to redirect packets to a veth device. The
destination veth device enqueues redirected packets to the napi ring of
its peer, and they are then processed by the XDP program on the peer veth
device. When the peer enables driver XDP, this can be thought of as one
XDP program calling another XDP program via REDIRECT.

Note that whether an XDP program is loaded on the redirect target veth
device does not affect how xdp_frames sent by ndo_xdp_xmit are handled,
since the ring sits on the rx (peer) side. Instead, whether an XDP
program is loaded on the peer veth device does.

When peer veth device has driver XDP, ndo_xdp_xmit forwards xdp_frames
to its peer without modification.
If not, ndo_xdp_xmit converts xdp_frames to skbs on the sender side and
invokes netif_rx rather than dropping them. Although this will not
result in good performance, I think dropping redirected packets when XDP
is not loaded on the peer device would be too restrictive, so I added
this fallback.

Signed-off-by: Toshiaki Makita <makita.toshi...@lab.ntt.co.jp>
---
 drivers/net/veth.c     | 73 +++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/filter.h | 16 +++++++++++
 net/core/filter.c      | 11 +-------
 3 files changed, 87 insertions(+), 13 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 89c91c1c9935..b1d591be0eba 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -54,6 +54,11 @@ static bool veth_is_xdp_frame(void *ptr)
        return (unsigned long)ptr & VETH_XDP_FLAG;
 }
 
+static void *veth_xdp_to_ptr(void *ptr)
+{
+       return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
+}
+
 static void *veth_ptr_to_xdp(void *ptr)
 {
        return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
@@ -138,7 +143,7 @@ static void veth_ptr_free(void *ptr)
        }
 }
 
-static void veth_xdp_flush(struct veth_priv *priv)
+static void __veth_xdp_flush(struct veth_priv *priv)
 {
        /* Write ptr_ring before reading rx_notify_masked */
        smp_mb();
@@ -206,7 +211,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct 
net_device *dev)
 
        /* TODO: check xmit_more and tx_stopped */
        if (rcv_xdp)
-               veth_xdp_flush(rcv_priv);
+               __veth_xdp_flush(rcv_priv);
 
        rcu_read_unlock();
 
@@ -281,6 +286,66 @@ static struct sk_buff *veth_build_skb(void *head, int 
headroom, int len,
        return skb;
 }
 
+static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+{
+       struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+       int headroom = frame->data - (void *)frame;
+       struct net_device *rcv;
+       int err = 0;
+
+       rcv = rcu_dereference(priv->peer);
+       if (unlikely(!rcv))
+               return -ENXIO;
+
+       rcv_priv = netdev_priv(rcv);
+       /* xdp_ring is initialized on receive side? */
+       if (rcu_access_pointer(rcv_priv->xdp_prog)) {
+               err = xdp_ok_fwd_dev(rcv, frame->len);
+               if (unlikely(err))
+                       return err;
+
+               err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
+       } else {
+               struct sk_buff *skb;
+
+               skb = veth_build_skb(frame, headroom, frame->len, 0);
+               if (unlikely(!skb))
+                       return -ENOMEM;
+
+               /* Get page ref in case skb is dropped in netif_rx.
+                * The caller is responsible for freeing the page on error.
+                */
+               get_page(virt_to_page(frame->data));
+               if (unlikely(veth_forward_skb(rcv, skb, false) != 
NET_RX_SUCCESS))
+                       return -ENXIO;
+
+               /* Put page ref on success */
+               page_frag_free(frame->data);
+       }
+
+       return err;
+}
+
+static void veth_xdp_flush(struct net_device *dev)
+{
+       struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+       struct net_device *rcv;
+
+       rcu_read_lock();
+       rcv = rcu_dereference(priv->peer);
+       if (unlikely(!rcv))
+               goto out;
+
+       rcv_priv = netdev_priv(rcv);
+       /* xdp_ring is initialized on receive side? */
+       if (unlikely(!rcu_access_pointer(rcv_priv->xdp_prog)))
+               goto out;
+
+       __veth_xdp_flush(rcv_priv);
+out:
+       rcu_read_unlock();
+}
+
 static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
                                        struct xdp_frame *frame)
 {
@@ -580,7 +645,7 @@ static void veth_poll_controller(struct net_device *dev)
 
        rcu_read_lock();
        if (rcu_access_pointer(priv->xdp_prog))
-               veth_xdp_flush(priv);
+               __veth_xdp_flush(priv);
        rcu_read_unlock();
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */
@@ -730,6 +795,8 @@ static const struct net_device_ops veth_netdev_ops = {
        .ndo_features_check     = passthru_features_check,
        .ndo_set_rx_headroom    = veth_set_rx_headroom,
        .ndo_bpf                = veth_xdp,
+       .ndo_xdp_xmit           = veth_xdp_xmit,
+       .ndo_xdp_flush          = veth_xdp_flush,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b2308174..7d043f51d1d7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -19,6 +19,7 @@
 #include <linux/cryptohash.h>
 #include <linux/set_memory.h>
 #include <linux/kallsyms.h>
+#include <linux/if_vlan.h>
 
 #include <net/sch_generic.h>
 
@@ -752,6 +753,21 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len);
 
+static __always_inline int
+xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen)
+{
+       unsigned int len;
+
+       if (unlikely(!(fwd->flags & IFF_UP)))
+               return -ENETDOWN;
+
+       len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
+       if (pktlen > len)
+               return -EMSGSIZE;
+
+       return 0;
+}
+
 /* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the
  * same cpu context. Further for best results no more than a single map
  * for the do_redirect/do_flush pair should be used. This limitation is
diff --git a/net/core/filter.c b/net/core/filter.c
index a374b8560bc4..25ae8ffaa968 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2923,16 +2923,7 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
 static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device 
*fwd)
 {
-       unsigned int len;
-
-       if (unlikely(!(fwd->flags & IFF_UP)))
-               return -ENETDOWN;
-
-       len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-       if (skb->len > len)
-               return -EMSGSIZE;
-
-       return 0;
+       return xdp_ok_fwd_dev(fwd, skb->len);
 }
 
 static int xdp_do_generic_redirect_map(struct net_device *dev,
-- 
2.14.3

Reply via email to