On Mon, Mar 26, 2018 at 10:21 AM, David Miller <[email protected]> wrote:
> From: "Md. Islam" <[email protected]>
> Date: Fri, 23 Mar 2018 02:43:16 -0400
>
>> +#ifdef CONFIG_XDP_ROUTER
>> + //if IP forwarding is enabled on the receiver, create xdp_buff
>> + //from skb and call xdp_router_forward()
>
> Never use C++ comments, only use C style.
>
>> + if(is_forwarding_enabled(rcv)){
>
> There must be a space between 'if' and the openning parenthesis. You need
> to also have a space before the openning curly braces.
>
> In fact this entire patch is full of coding style issues, please run your
> changes through checkpatch.pl before resubmitting.
Thanks David for the suggestions! I fixed all the formating errors. I
also modified the forwarding logic. Now the iperf throughput improved
from
53.8Gb/s to around 60Gb/s and the median RTT improved from .055 ms to
.35 ms. The patch is attached.
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 944ec3c..8474eef 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -328,6 +328,18 @@ config VETH
When one end receives the packet it appears on its pair and vice
versa.
+config XDP_ROUTER
+ bool "XDP router for veth"
+ depends on IP_ADVANCED_ROUTER
+ depends on VETH
+ default y
+ ---help---
+ This option will enable IP forwarding on incoming xdp_buff.
+ Currently it is only supported by veth. Say y or n.
+
+ Currently veth uses slow path for packet forwarding. This option
+ forwards packets as soon as it is received (as XDP generic).
+
config VIRTIO_NET
tristate "Virtio network driver"
depends on VIRTIO
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39..4ce10c9 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -111,6 +111,28 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
goto drop;
}
+#ifdef CONFIG_XDP_ROUTER
+
+ /* if IP forwarding is enabled on the receiver, create xdp_buff
+ * from skb and call xdp_router_forward()
+ */
+ if (is_forwarding_enabled(rcv)) {
+ struct xdp_buff *xdp = kmalloc(sizeof(*xdp), GFP_KERNEL);
+
+ xdp->data = skb->data;
+ xdp->data_end = skb->data + (skb->len - skb->data_len);
+ xdp->data_meta = skb;
+ if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
+ struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->bytes += length;
+ stats->packets++;
+ u64_stats_update_end(&stats->syncp);
+ goto success;
+ }
+ }
+#endif
if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
@@ -122,6 +144,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
drop:
atomic64_inc(&priv->dropped);
}
+success:
rcu_read_unlock();
return NETDEV_TX_OK;
}
@@ -276,6 +299,61 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
rcu_read_unlock();
}
+#ifdef CONFIG_XDP_ROUTER
+int veth_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+{
+ struct veth_priv *priv = netdev_priv(dev);
+ struct net_device *rcv;
+ struct ethhdr *ethh;
+ struct sk_buff *skb;
+ int length = xdp->data_end - xdp->data;
+
+ rcu_read_lock();
+ rcv = rcu_dereference(priv->peer);
+ if (unlikely(!rcv)) {
+ kfree(xdp);
+ goto drop;
+ }
+
+ /* Update MAC address and checksum */
+ ethh = eth_hdr_xdp(xdp);
+ ether_addr_copy(ethh->h_source, dev->dev_addr);
+ ether_addr_copy(ethh->h_dest, rcv->dev_addr);
+
+ /* if IP forwarding is enabled on the receiver,
+ * call xdp_router_forward()
+ */
+ if (is_forwarding_enabled(rcv)) {
+ if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
+ struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->bytes += length;
+ stats->packets++;
+ u64_stats_update_end(&stats->syncp);
+ goto success;
+ }
+ }
+
+ /* Local deliver */
+ skb = (struct sk_buff *)xdp->data_meta;
+ if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+ struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->bytes += length;
+ stats->packets++;
+ u64_stats_update_end(&stats->syncp);
+ } else {
+drop:
+ atomic64_inc(&priv->dropped);
+ }
+success:
+ rcu_read_unlock();
+ return NETDEV_TX_OK;
+}
+#endif
+
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
@@ -290,6 +368,9 @@ static const struct net_device_ops veth_netdev_ops = {
.ndo_get_iflink = veth_get_iflink,
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = veth_set_rx_headroom,
+#ifdef CONFIG_XDP_ROUTER
+ .ndo_xdp_xmit = veth_xdp_xmit,
+#endif
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 492bc65..025a3ec 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -19,6 +19,29 @@
#include <linux/skbuff.h>
#include <uapi/linux/ip.h>
+#include <linux/filter.h>
+
+#ifdef CONFIG_XDP_ROUTER
+
+#define MIN_PACKET_SIZE 55
+
+static inline struct iphdr *ip_hdr_xdp(const struct xdp_buff *xdp)
+{
+ return (struct iphdr *)(xdp->data+ETH_HLEN);
+}
+
+static inline struct ethhdr *eth_hdr_xdp(const struct xdp_buff *xdp)
+{
+ return (struct ethhdr *)(xdp->data);
+}
+
+static inline bool is_xdp_forwardable(const struct xdp_buff *xdp)
+{
+ return xdp->data_end - xdp->data >= MIN_PACKET_SIZE;
+}
+
+#endif
+
static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
{
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4c77f39..8369e5e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3290,6 +3290,11 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}
+#ifdef CONFIG_XDP_ROUTER
+bool is_forwarding_enabled(struct net_device *dev);
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp);
+#endif
+
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f805243..623b2de 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -369,6 +369,12 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
int fib_sync_down_addr(struct net_device *dev, __be32 local);
int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
+#ifdef CONFIG_XDP_ROUTER
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct fib_result *res);
+#endif
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
const struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index dda9d7b..18c18f3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4090,6 +4090,56 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(do_xdp_generic);
+#ifdef CONFIG_XDP_ROUTER
+
+bool is_forwarding_enabled(struct net_device *dev)
+{
+ struct in_device *in_dev;
+
+ /* verify forwarding is enabled on this interface */
+ in_dev = __in_dev_get_rcu(dev);
+ if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(is_forwarding_enabled);
+
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp)
+{
+ int err;
+ struct fib_result res;
+ struct iphdr *iph;
+ struct net_device *rcv;
+
+ if (unlikely(xdp->data_end - xdp->data < MIN_PACKET_SIZE))
+ return NET_RX_DROP;
+
+ iph = (struct iphdr *)(xdp->data + ETH_HLEN);
+
+ /*currently only supports IPv4
+ */
+ if (unlikely(iph->version != 4))
+ return NET_RX_DROP;
+
+ err = ip_route_lookup(iph->daddr, iph->saddr,
+ iph->tos, dev, &res);
+ if (unlikely(err))
+ return NET_RX_DROP;
+
+ rcv = FIB_RES_DEV(res);
+ if (likely(rcv)) {
+ if (likely(rcv->netdev_ops->ndo_xdp_xmit(rcv, xdp) ==
+ NETDEV_TX_OK))
+ return NET_RX_SUCCESS;
+ }
+
+ return NET_RX_DROP;
+}
+EXPORT_SYMBOL_GPL(xdp_router_forward);
+
+#endif
+
static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 49cc1c1..2333205 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1866,6 +1866,35 @@ static int ip_mkroute_input(struct sk_buff *skb,
return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
+#ifdef CONFIG_XDP_ROUTER
+
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct fib_result *res)
+{
+ struct flowi4 fl4;
+ int err;
+ struct net *net = dev_net(dev);
+
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_mark = 0;
+ fl4.flowi4_tos = tos & IPTOS_RT_MASK;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+
+ err = fib_lookup(net, &fl4, res, 0);
+
+ if (unlikely(err != 0 || res->type != RTN_UNICAST))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_route_lookup);
+#endif
+
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet