On 04/25/2018 08:34 PM, David Ahern wrote: > Provide a helper for doing a FIB and neighbor lookup in the kernel > tables from an XDP program. The helper provides a fastpath for forwarding > packets. If the packet is a local delivery or for any reason is not a > simple lookup and forward, the packet continues up the stack. > > If it is to be forwarded, the forwarding can be done directly if the > neighbor is already known. If the neighbor does not exist, the first > few packets go up the stack for neighbor resolution. Once resolved, the > xdp program provides the fast path. > > On successful lookup the nexthop dmac, current device smac and egress > device index are returned. > > The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6 > are implemented in this patch. The API includes layer 4 parameters if > the XDP program chooses to do deep packet inspection to allow compare > against ACLs implemented as FIB rules. > > Header rewrite is left to the XDP program. > > The lookup takes 2 flags: > - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes > straight to the table associated with the device (expert setting for > those looking to maximize throughput) > > - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective. > Default is an ingress lookup. 
> > Initial performance numbers collected by Jesper, forwarded packets/sec: > > Full stack XDP FIB lookup XDP Direct lookup > IPv4 1,947,969 7,074,156 7,415,333 > IPv6 1,728,000 6,165,504 7,262,720 > > > Signed-off-by: David Ahern <dsah...@gmail.com> > --- > include/uapi/linux/bpf.h | 68 +++++++++++++- > net/core/filter.c | 233 > +++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 300 insertions(+), 1 deletion(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index e6679393b687..82601c132b9f 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -10,6 +10,8 @@ > > #include <linux/types.h> > #include <linux/bpf_common.h> > +#include <linux/if_ether.h> > +#include <linux/in6.h> > > /* Extended instruction set based on top of classic BPF */ > > @@ -783,6 +785,17 @@ union bpf_attr { > * @size: size of 'struct bpf_xfrm_state' > * @flags: room for future extensions > * Return: 0 on success or negative error > + * > + * int bpf_fib_lookup(ctx, params, plen, flags) > + * Do a FIB lookup based on given parameters > + * @ctx: pointer to context of type xdp_md
Nit: would just say pointer to context here since used with xdp/skb > + * @params: pointer to bpf_fib_lookup > + * @plen: size of params argument > + * @flags: u32 bitmask of BPF_FIB_LOOKUP_* flags > + * Return: egress device index if packet is to be forwarded, > + * 0 for local delivery (anything that needs to be handled > + * by the full stack), or negative on error. > + * If index is > 0, output data in bpf_fib_lookup is set > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -851,7 +864,9 @@ union bpf_attr { > FN(msg_pull_data), \ > FN(bind), \ > FN(xdp_adjust_tail), \ > - FN(skb_get_xfrm_state), > + FN(skb_get_xfrm_state), \ > + FN(fib_lookup), \ > + > Nit: trailing '\' resp. double newline > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call [...] > diff --git a/net/core/filter.c b/net/core/filter.c > index 8e45c6c7ab08..37602b2fb94a 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -59,6 +59,10 @@ > #include <net/tcp.h> > #include <net/xfrm.h> > #include <linux/bpf_trace.h> > +#include <linux/inetdevice.h> > +#include <net/ip_fib.h> > +#include <net/flow.h> > +#include <net/arp.h> > > /** > * sk_filter_trim_cap - run a packet through a socket filter > @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto > bpf_skb_get_xfrm_state_proto = { > }; > #endif > > +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) > +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, > + const struct neighbour *neigh, > + const struct net_device *dev) > +{ > + memcpy(params->dmac, neigh->ha, ETH_ALEN); > + memcpy(params->smac, dev->dev_addr, ETH_ALEN); > + params->h_vlan_TCI = 0; > + params->h_vlan_proto = 0; > + > + return dev->ifindex; > +} > +#endif > + > +#if IS_ENABLED(CONFIG_INET) > +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx, Instead of passing xdp_buff here, just pass the netdev pointer. More below why it's needed. 
> + struct bpf_fib_lookup *params, u32 flags) > +{ > + struct net *net = dev_net(ctx->rxq->dev); > + struct in_device *in_dev; > + struct neighbour *neigh; > + struct net_device *dev; > + struct fib_result res; > + struct fib_nh *nh; > + struct flowi4 fl4; > + int err; > + > + dev = dev_get_by_index_rcu(net, params->ifindex); > + if (unlikely(!dev)) > + return -ENODEV; > + > + /* verify forwarding is enabled on this interface */ > + in_dev = __in_dev_get_rcu(dev); > + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) > + return 0; > + > + if (flags & BPF_FIB_LOOKUP_OUTPUT) { > + fl4.flowi4_iif = 1; > + fl4.flowi4_oif = params->ifindex; > + } else { > + fl4.flowi4_iif = params->ifindex; > + fl4.flowi4_oif = 0; > + } > + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; > + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; > + fl4.flowi4_flags = 0; > + > + fl4.flowi4_proto = params->l4_protocol; > + fl4.daddr = params->ipv4_dst; > + fl4.saddr = params->ipv4_src; > + fl4.fl4_sport = params->sport; > + fl4.fl4_dport = params->dport; > + > + if (flags & BPF_FIB_LOOKUP_DIRECT) { > + u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; > + struct fib_table *tb; > + > + tb = fib_get_table(net, tbid); > + if (unlikely(!tb)) > + return 0; > + > + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); > + } else { > + fl4.flowi4_mark = 0; > + fl4.flowi4_secid = 0; > + fl4.flowi4_tun_key.tun_id = 0; > + fl4.flowi4_uid = sock_net_uid(net, NULL); > + > + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); > + } > + > + if (err || res.type != RTN_UNICAST) > + return 0; > + > + if (res.fi->fib_nhs > 1) > + fib_select_path(net, &res, &fl4, NULL); > + > + nh = &res.fi->fib_nh[res.nh_sel]; > + > + /* do not handle lwt encaps right now */ > + if (nh->nh_lwtstate) > + return 0; > + > + dev = nh->nh_dev; > + if (unlikely(!dev)) > + return 0; > + > + if (nh->nh_gw) > + params->ipv4_dst = nh->nh_gw; > + > + params->rt_metric = res.fi->fib_priority; > + > + /* xdp and cls_bpf programs are run in RCU-bh so > + * rcu_read_lock_bh is not needed here > + */ > + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); > + if (neigh) > + return bpf_fib_set_fwd_params(params, neigh, dev); > + > + return 0; > +} > +#endif > + > +#if IS_ENABLED(CONFIG_IPV6) > +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx, Same here. 
> + struct bpf_fib_lookup *params, u32 flags) > +{ > + struct net *net = dev_net(ctx->rxq->dev); > + struct neighbour *neigh; > + struct net_device *dev; > + struct fib6_info *f6i; > + struct flowi6 fl6; > + int strict = 0; > + int oif; > + > + /* link local addresses are never forwarded */ > + if (rt6_need_strict(¶ms->ipv6_dst) || > + rt6_need_strict(¶ms->ipv6_src)) > + return 0; > + > + dev = dev_get_by_index_rcu(net, params->ifindex); > + if (unlikely(!dev)) > + return -ENODEV; > + > + if (flags & BPF_FIB_LOOKUP_OUTPUT) { > + fl6.flowi6_iif = 1; > + oif = fl6.flowi6_oif = params->ifindex; > + } else { > + oif = fl6.flowi6_iif = params->ifindex; > + fl6.flowi6_oif = 0; > + strict = RT6_LOOKUP_F_HAS_SADDR; > + } > + fl6.flowlabel = params->flowlabel; > + fl6.flowi6_scope = 0; > + fl6.flowi6_flags = 0; > + fl6.mp_hash = 0; > + > + fl6.flowi6_proto = params->l4_protocol; > + fl6.daddr = params->ipv6_dst; > + fl6.saddr = params->ipv6_src; > + fl6.fl6_sport = params->sport; > + fl6.fl6_dport = params->dport; > + > + if (flags & BPF_FIB_LOOKUP_DIRECT) { > + u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; > + struct fib6_table *tb; > + > + tb = ipv6_stub->fib6_get_table(net, tbid); > + if (unlikely(!tb)) > + return 0; > + > + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); > + } else { > + fl6.flowi6_mark = 0; > + fl6.flowi6_secid = 0; > + fl6.flowi6_tun_key.tun_id = 0; > + fl6.flowi6_uid = sock_net_uid(net, NULL); > + > + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); > + } > + > + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) > + return 0; > + > + if (unlikely(f6i->fib6_flags & RTF_REJECT || > + f6i->fib6_type != RTN_UNICAST)) > + return 0; > + > + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) > + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, > + fl6.flowi6_oif, NULL, > + strict); > + > + if (f6i->fib6_nh.nh_lwtstate) > + return 0; > + > + if (f6i->fib6_flags & RTF_GATEWAY) > + params->ipv6_dst = f6i->fib6_nh.nh_gw; > + > + dev = f6i->fib6_nh.nh_dev; > + params->rt_metric = f6i->fib6_metric; > + > + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is > + * not needed here. 
Can not use __ipv6_neigh_lookup_noref here > + * because we need to get nd_tbl via the stub > + */ > + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, > + ndisc_hashfn, ¶ms->ipv6_dst, dev); > + if (neigh) > + return bpf_fib_set_fwd_params(params, neigh, dev); > + > + return 0; > +} > +#endif > + > +BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx, > + struct bpf_fib_lookup *, params, int, plen, u32, flags) > +{ > + if (plen < sizeof(*params)) > + return -EINVAL; > + > + switch (params->family) { > +#if IS_ENABLED(CONFIG_INET) > + case AF_INET: > + return bpf_ipv4_fib_lookup(ctx, params, flags); > +#endif > +#if IS_ENABLED(CONFIG_IPV6) > + case AF_INET6: > + return bpf_ipv6_fib_lookup(ctx, params, flags); > +#endif > + } > + return -ENOTSUPP; > +} > + > +static const struct bpf_func_proto bpf_fib_lookup_proto = { > + .func = bpf_fib_lookup, > + .gpl_only = true, > + .pkt_access = true, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_PTR_TO_MEM, > + .arg3_type = ARG_CONST_SIZE, > + .arg4_type = ARG_ANYTHING, > +}; > + > static const struct bpf_func_proto * > bpf_base_func_proto(enum bpf_func_id func_id) > { > @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const > struct bpf_prog *prog) > return &bpf_get_socket_cookie_proto; > case BPF_FUNC_get_socket_uid: > return &bpf_get_socket_uid_proto; > + case BPF_FUNC_fib_lookup: > + return &bpf_fib_lookup_proto; This part doesn't belong to sk_filter_func_proto(), but to the tc_cls_act_func_proto() instead. > default: > return bpf_base_func_proto(func_id); > } > @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct > bpf_prog *prog) > return &bpf_xdp_redirect_map_proto; > case BPF_FUNC_xdp_adjust_tail: > return &bpf_xdp_adjust_tail_proto; > + case BPF_FUNC_fib_lookup: > + return &bpf_fib_lookup_proto; Basically, you're using the very same bpf_fib_lookup_proto for both XDP and skb. 
In the skb case, you're reusing the two functions bpf_ipv{4,6}_fib_lookup(), so when you retrieve the netns from the netdev pointer, you'll crash at dev_net(ctx->rxq->dev), since ctx->rxq is XDP-only and is not part of the skb metadata. Therefore, as mentioned, pass the netdev into bpf_ipv{4,6}_fib_lookup() to make them generic, and add both a bpf_xdp_fib_lookup_proto and a bpf_skb_fib_lookup_proto, each wired up under case BPF_FUNC_fib_lookup in its respective *_func_proto(), but using the proper prototype for its context. Meaning, both reuse bpf_ipv{4,6}_fib_lookup() from each of their BPF_CALL_4() helper implementations. > default: > return bpf_base_func_proto(func_id); > } >