Hi David, first off all sorry for my late reply, been mostly offline last week. I think there's still an issue with the current patch, more below:
On 06/21/2018 05:00 AM, [email protected] wrote: > From: David Ahern <[email protected]> > > For ACLs implemented using either FIB rules or FIB entries, the BPF > program needs the FIB lookup status to be able to drop the packet. > Since the bpf_fib_lookup API has not reached a released kernel yet, > change the return code to contain an encoding of the FIB lookup > result and return the nexthop device index in the params struct. > > In addition, inform the BPF program of any post FIB lookup reason as > to why the packet needs to go up the stack. > > The fib result for unicast routes must have an egress device, so remove > the check that it is non-NULL. > > Signed-off-by: David Ahern <[email protected]> > --- > v2 > - drop BPF_FIB_LKUP_RET_NO_NHDEV; check in dev in fib result not needed > - enhance documentation of BPF_FIB_LKUP_RET_ codes > > include/uapi/linux/bpf.h | 28 ++++++++++++++---- > net/core/filter.c | 72 > ++++++++++++++++++++++++++++++---------------- > samples/bpf/xdp_fwd_kern.c | 8 +++--- > 3 files changed, 74 insertions(+), 34 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 59b19b6a40d7..b7db3261c62d 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1857,7 +1857,8 @@ union bpf_attr { > * is resolved), the nexthop address is returned in ipv4_dst > * or ipv6_dst based on family, smac is set to mac address of > * egress device, dmac is set to nexthop mac address, rt_metric > - * is set to metric from route (IPv4/IPv6 only). > + * is set to metric from route (IPv4/IPv6 only), and ifindex > + * is set to the device index of the nexthop from the FIB lookup. > * > * *plen* argument is the size of the passed in struct. > * *flags* argument can be a combination of one or more of the > @@ -1873,9 +1874,10 @@ union bpf_attr { > * *ctx* is either **struct xdp_md** for XDP programs or > * **struct sk_buff** tc cls_act programs. > * Return > - * Egress device index on success, 0 if packet needs to continue > - * up the stack for further processing or a negative error in > case > - * of failure. > + * * < 0 if any input argument is invalid > + * * 0 on success (packet is forwarded, nexthop neighbor exists) > + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the > + * * packet is not forwarded or needs assist from full stack > * > * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map > *map, void *key, u64 flags) > * Description > @@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args { > #define BPF_FIB_LOOKUP_DIRECT BIT(0) > #define BPF_FIB_LOOKUP_OUTPUT BIT(1) > > +enum { > + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ > + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ > + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ > + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ > + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ > + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ > + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ > + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ > + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ > +}; > + > struct bpf_fib_lookup { > /* input: network family for lookup (AF_INET, AF_INET6) > * output: network family of egress nexthop > @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup { > > /* total length of packet from network header - used for MTU check */ > __u16 tot_len; > - __u32 ifindex; /* L3 device index for lookup */ > + > + /* input: L3 device index for lookup > + * output: device index from FIB lookup > + */ > + __u32 ifindex; > > union { > /* inputs to lookup */ > diff --git a/net/core/filter.c b/net/core/filter.c > index e7f12e9f598c..f8dd8aa89de4 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup > *params, > memcpy(params->smac, dev->dev_addr, ETH_ALEN); > params->h_vlan_TCI = 0; > params->h_vlan_proto = 0; > + params->ifindex = dev->ifindex; > > - return dev->ifindex; > + return 0; > } > #endif > > @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > /* verify forwarding is enabled on this interface */ > in_dev = __in_dev_get_rcu(dev); > if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) > - return 0; > + return BPF_FIB_LKUP_RET_FWD_DISABLED; > > if (flags & BPF_FIB_LOOKUP_OUTPUT) { > fl4.flowi4_iif = 1; > @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > > tb = fib_get_table(net, tbid); > if (unlikely(!tb)) > - return 0; > + return BPF_FIB_LKUP_RET_NOT_FWDED; > > err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); > } else { > @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct > bpf_fib_lookup *params, > err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); > } > > - if (err || res.type != RTN_UNICAST) > - return 0; > + if (err) { > + /* map fib lookup errors to RTN_ type */ > + if (err == -EINVAL) > + return BPF_FIB_LKUP_RET_BLACKHOLE; > + if (err == -EHOSTUNREACH) > + return BPF_FIB_LKUP_RET_UNREACHABLE; > + if (err == -EACCES) > + return BPF_FIB_LKUP_RET_PROHIBIT; > + > + return BPF_FIB_LKUP_RET_NOT_FWDED; > + } [...] You change all the semantics of return code here, but this breaks bpf_skb_fib_lookup(). I cannot see how this would work in that case. The code does the following with the bpf_ipv{4,6}_fib_lookup() return code: [...] switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: index = bpf_ipv4_fib_lookup(net, params, flags, false); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: index = bpf_ipv6_fib_lookup(net, params, flags, false); break; #endif } if (index > 0) { struct net_device *dev; dev = dev_get_by_index_rcu(net, index); if (!is_skb_forwardable(dev, skb)) index = 0; } [...] So the BPF_FIB_LKUP_* results become the dev ifindex here and the !is_skb_forwardable() case further suggests that the packet *can* be forwarded based on the new semantics whereas MTU check is bypassed on success. It probably helps to craft a selftest for XDP *and* tc case in future, so we can be sure nothing breaks with new changes. Thanks, Daniel
