Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table

2018-04-25 Thread Daniel Borkmann
On 04/25/2018 08:34 PM, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
> 
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
> 
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
> 
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
> 
> Header rewrite is left to the XDP program.
> 
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
>   straight to the table associated with the device (expert setting for
>   those looking to maximize throughput)
> 
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
>   Default is an ingress lookup.
> 
> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>Full stackXDP FIB lookupXDP Direct lookup
> IPv4   1,947,969   7,074,156  7,415,333
> IPv6   1,728,000   6,165,504  7,262,720
> 
> 
> Signed-off-by: David Ahern 
> ---
>  include/uapi/linux/bpf.h |  68 +-
>  net/core/filter.c| 233 
> +++
>  2 files changed, 300 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  /* Extended instruction set based on top of classic BPF */
>  
> @@ -783,6 +785,17 @@ union bpf_attr {
>   * @size: size of 'struct bpf_xfrm_state'
>   * @flags: room for future extensions
>   * Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + * Do a FIB lookup based on given parameters
> + * @ctx: pointer to context of type xdp_md

Nit: would just say pointer to context here since used with xdp/skb

> + * @params:  pointer to bpf_fib_lookup
> + * @plen:size of params argument
> + * @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
> + * Return: egress device index if packet is to be forwarded,
> + * 0 for local delivery (anything that needs to be handled
> + * by the full stack), or negative on error.
> + * If index is > 0, output data in bpf_fib_lookup is set
>   */
>  #define __BPF_FUNC_MAPPER(FN)\
>   FN(unspec), \
> @@ -851,7 +864,9 @@ union bpf_attr {
>   FN(msg_pull_data),  \
>   FN(bind),   \
>   FN(xdp_adjust_tail),\
> - FN(skb_get_xfrm_state),
> + FN(skb_get_xfrm_state), \
> + FN(fib_lookup), \
> +
>  

Nit: trailing '\' resp. double newline

>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
[...]

> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8e45c6c7ab08..37602b2fb94a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
> +#include 
>  
>  /**
>   *   sk_filter_trim_cap - run a packet through a socket filter
> @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto 
> bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> +   const struct neighbour *neigh,
> +   const struct net_device *dev)
> +{
> + memcpy(params->dmac, neigh->ha, ETH_ALEN);
> + memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> + params->h_vlan_TCI = 0;
> + params->h_vlan_proto = 0;
> +
> + return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,

Instead of passing xdp_buff here, just pass the netdev pointer. More below
why it's needed.

> +struct bpf_fib_lookup *params, u32 flags)
> +{
> + struct net *net = dev_net(ctx->rxq->dev);
> + struct in_device *in_dev;
> + struct neighbour *neigh;
> + struct net_device *dev;
> + struct fib_result res;
> + struct fib_nh *nh;
> + struct flowi4 fl4;
> + int err;
> +
> + dev = dev_get_by_inde

Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table

2018-04-25 Thread David Ahern
On 4/25/18 1:55 PM, Daniel Borkmann wrote:
>> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const 
>> struct bpf_prog *prog)
>>  return &bpf_get_socket_cookie_proto;
>>  case BPF_FUNC_get_socket_uid:
>>  return &bpf_get_socket_uid_proto;
>> +case BPF_FUNC_fib_lookup:
>> +return &bpf_fib_lookup_proto;
> This part doesn't belong to sk_filter_func_proto(), but to the
> tc_cls_act_func_proto() instead.

oops, somewhere in all of the re-basing it got added to the wrong
function. Will fix.

> 
>>  default:
>>  return bpf_base_func_proto(func_id);
>>  }
>> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct 
>> bpf_prog *prog)
>>  return &bpf_xdp_redirect_map_proto;
>>  case BPF_FUNC_xdp_adjust_tail:
>>  return &bpf_xdp_adjust_tail_proto;
>> +case BPF_FUNC_fib_lookup:
>> +return &bpf_fib_lookup_proto;
> Basically, you're using the very same bpf_fib_lookup_proto for
> both XDP and skb. In the skb case, you're reusing the two functions
> bpf_ipv{4,6}_fib_lookup(), so when you get the netdev pointer for
> retrieving the netns, you'll crash at dev_net(ctx->rxq->dev) since
> this is XDP only and not skb meta data.
> 
> Therefore, as mentioned, pass the netdev to bpf_ipv{4,6}_fib_lookup()
> to have it generic and have bpf_xdp_fib_lookup_proto and
> bpf_skb_fib_lookup_proto where both are under the case BPF_FUNC_fib_lookup
> in the respective *func_proto(), but using the proper prototypes according
> to their correct context. Meaning, both reuse bpf_ipv{4,6}_fib_lookup()
> from each of their BPF_CALL_4() helper implementation.

ok. I have been focused on the xdp program and not the tc path. Will fix.


Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table

2018-04-27 Thread Martin KaFai Lau
On Wed, Apr 25, 2018 at 11:34:48AM -0700, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
> 
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
> 
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
> 
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
> 
> Header rewrite is left to the XDP program.
> 
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
>   straight to the table associated with the device (expert setting for
>   those looking to maximize throughput)
> 
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
>   Default is an ingress lookup.
> 
> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>Full stackXDP FIB lookupXDP Direct lookup
> IPv4   1,947,969   7,074,156  7,415,333
> IPv6   1,728,000   6,165,504  7,262,720
> 
> 
> Signed-off-by: David Ahern 
> ---
>  include/uapi/linux/bpf.h |  68 +-
>  net/core/filter.c| 233 
> +++
>  2 files changed, 300 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  /* Extended instruction set based on top of classic BPF */
>  
> @@ -783,6 +785,17 @@ union bpf_attr {
>   * @size: size of 'struct bpf_xfrm_state'
>   * @flags: room for future extensions
>   * Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + * Do a FIB lookup based on given parameters
> + * @ctx: pointer to context of type xdp_md
> + * @params:  pointer to bpf_fib_lookup
> + * @plen:size of params argument
> + * @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
> + * Return: egress device index if packet is to be forwarded,
> + * 0 for local delivery (anything that needs to be handled
> + * by the full stack), or negative on error.
> + * If index is > 0, output data in bpf_fib_lookup is set
>   */
>  #define __BPF_FUNC_MAPPER(FN)\
>   FN(unspec), \
> @@ -851,7 +864,9 @@ union bpf_attr {
>   FN(msg_pull_data),  \
>   FN(bind),   \
>   FN(xdp_adjust_tail),\
> - FN(skb_get_xfrm_state),
> + FN(skb_get_xfrm_state), \
> + FN(fib_lookup), \
> +
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -1255,4 +1270,55 @@ struct bpf_raw_tracepoint_args {
>   __u64 args[0];
>  };
>  
> +/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
> + * OUTPUT:  Do lookup from egress perspective; default is ingress
> + */
> +#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
> +#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
> +
> +struct bpf_fib_lookup {
> + /* input */
> + __u8family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
> +
> + /* set if lookup is to consider L4 data - e.g., FIB rules */
> + __u8l4_protocol;
> + __be16  sport;
> + __be16  dport;
> +
> + /* total length of packet from network header - used for MTU check */
> + __u16   tot_len;
> + __u32   ifindex;  /* L3 device index for lookup */
> +
> + union {
> + /* inputs to lookup */
> + __u8tos;/* AF_INET  */
> + __be32  flowlabel;  /* AF_INET6 */
> +
> + /* output: metric of fib result */
> + __u32 rt_metric;
> + };
> +
> + union {
> + __be32  mpls_in;
> + __be32  ipv4_src;
> + struct in6_addr ipv6_src;
> + };
> +
> + /* input to bpf_fib_lookup, *dst is destination address.
> +  * output: bpf_fib_lookup sets to gateway address
> +  */
> + union {
> + /* return for MPLS lookups */
> + __be32  mpls_out[4];  /* support up to 4 labels */
> + __be32  ipv4_dst;
> + struct in6_addr ipv6_dst;
> + };
> +
> + /* outp

Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table

2018-04-27 Thread David Ahern
On 4/27/18 10:43 AM, Martin KaFai Lau wrote:
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,
>> +   struct bpf_fib_lookup *params, u32 flags)
>> +{
>> +struct net *net = dev_net(ctx->rxq->dev);
>> +struct neighbour *neigh;
>> +struct net_device *dev;
>> +struct fib6_info *f6i;
>> +struct flowi6 fl6;
>> +int strict = 0;
>> +int oif;
>> +
>> +/* link local addresses are never forwarded */
>> +if (rt6_need_strict(¶ms->ipv6_dst) ||
>> +rt6_need_strict(¶ms->ipv6_src))
>> +return 0;
>> +
>> +dev = dev_get_by_index_rcu(net, params->ifindex);
>> +if (unlikely(!dev))
>> +return -ENODEV;
>> +
>> +if (flags & BPF_FIB_LOOKUP_OUTPUT) {
>> +fl6.flowi6_iif = 1;
> 1 is for LOOPBACK_IFINDEX?

yes. The intention is to mirror the flow struct created by full stack so
that routing in bpf == routing in IPv6 stack. ip6_route_output_flags
sets flowi6_iif to 1, so I repeated it here.

> 
>> +oif = fl6.flowi6_oif = params->ifindex;
>> +} else {
>> +oif = fl6.flowi6_iif = params->ifindex;
>> +fl6.flowi6_oif = 0;
>> +strict = RT6_LOOKUP_F_HAS_SADDR;
>> +}