Re: [RFC bpf-next 07/11] bpf: Add helper to retrieve socket in BPF

Martin KaFai Lau Thu, 10 May 2018 22:01:09 -0700

On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
> This patch adds a new BPF helper function, sk_lookup() which allows BPF
> programs to find out if there is a socket listening on this host, and
> returns a socket pointer which the BPF program can then access to
> determine, for instance, whether to forward or drop traffic. sk_lookup()
> takes a reference on the socket, so when a BPF program makes use of this
> function, it must subsequently pass the returned pointer into the newly
> added sk_release() to return the reference.
> 
> By way of example, the following pseudocode would filter inbound
> connections at XDP if there is no corresponding service listening for
> the traffic:
> 
>   struct bpf_sock_tuple tuple;
>   struct bpf_sock_ops *sk;
> 
>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
>   if (!sk) {
>     // Couldn't find a socket listening for this traffic. Drop.
>     return TC_ACT_SHOT;
>   }
>   bpf_sk_release(sk, 0);
>   return TC_ACT_OK;
> 
> Signed-off-by: Joe Stringer <j...@wand.net.nz>
> ---
>  include/uapi/linux/bpf.h                  |  39 +++++++++++-
>  kernel/bpf/verifier.c                     |   8 ++-
>  net/core/filter.c                         | 102 ++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h            |  40 +++++++++++-
>  tools/testing/selftests/bpf/bpf_helpers.h |   7 ++
>  5 files changed, 193 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d615c777b573..29f38838dbca 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1828,6 +1828,25 @@ union bpf_attr {
>   *   Return
>   *           0 on success, or a negative error in case of failure.
>   *
> + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
> + *   Description
> + *           Look for socket matching 'tuple'. The return value must be checked,
> + *           and if non-NULL, released via bpf_sk_release().
> + *           @ctx: pointer to ctx
> + *           @tuple: pointer to struct bpf_sock_tuple
> + *           @tuple_size: size of the tuple
> + *           @flags: flags value
> + *   Return
> + *           pointer to socket ops on success, or
> + *           NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + *   Description
> + *           Release the reference held by 'sock'.
> + *           @sock: Pointer reference to release. Must be found via bpf_sk_lookup().
> + *           @flags: flags value
> + *   Return
> + *           0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)                \
>       FN(unspec),                     \
> @@ -1898,7 +1917,9 @@ union bpf_attr {
>       FN(xdp_adjust_tail),            \
>       FN(skb_get_xfrm_state),         \
>       FN(get_stack),                  \
> -     FN(skb_load_bytes_relative),
> +     FN(skb_load_bytes_relative),    \
> +     FN(sk_lookup),                  \
> +     FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2060,6 +2081,22 @@ struct bpf_sock {
>                                */
>  };
>  
> +struct bpf_sock_tuple {
> +     union {
> +             __be32 ipv6[4];
> +             __be32 ipv4;
> +     } saddr;
> +     union {
> +             __be32 ipv6[4];
> +             __be32 ipv4;
> +     } daddr;
> +     __be16 sport;
> +     __be16 dport;
> +     __u32 dst_if;
> +     __u8 family;
> +     __u8 proto;
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 92b9a5dc465a..579012c483e4 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>   * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
>   * passes through a NULL-check conditional. For the branch wherein the state is
>   * changed to CONST_IMM, the verifier releases the reference.
> + *
> + * For each helper function that allocates a reference, such as bpf_sk_lookup(),
> + * there is a corresponding release function, such as bpf_sk_release(). When
> + * a reference type passes into the release function, the verifier also releases
> + * the reference. If any unchecked or unreleased reference remains at the end of
> + * the program, the verifier rejects it.
>   */
>  
>  /* verifier_state + insn_idx are pushed to stack when branch is encountered */
> @@ -277,7 +283,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
>   */
>  static bool is_release_function(enum bpf_func_id func_id)
>  {
> -     return false;
> +     return func_id == BPF_FUNC_sk_release;
>  }
>  
>  /* string representation of 'enum bpf_reg_type' */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 4c35152fb3a8..751c255d17d3 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -58,8 +58,12 @@
>  #include <net/busy_poll.h>
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
> +#include <net/udp.h>
>  #include <linux/bpf_trace.h>
>  #include <net/xdp_sock.h>
> +#include <net/inet_hashtables.h>
> +#include <net/inet6_hashtables.h>
> +#include <net/net_namespace.h>
>  
>  /**
>   *   sk_filter_trim_cap - run a packet through a socket filter
> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +struct sock *
> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
Would it be possible to have another version that
returns an sk without taking a refcnt on it?
It may have a performance benefit.


> +     int dst_if = (int)tuple->dst_if;
> +     struct in6_addr *src6;
> +     struct in6_addr *dst6;
> +
> +     if (tuple->family == AF_INET6) {
> +             src6 = (struct in6_addr *)&tuple->saddr.ipv6;
> +             dst6 = (struct in6_addr *)&tuple->daddr.ipv6;
> +     } else if (tuple->family != AF_INET) {
> +             return ERR_PTR(-EOPNOTSUPP);
> +     }
> +
> +     if (tuple->proto == IPPROTO_TCP) {
> +             if (tuple->family == AF_INET)
> +                     return inet_lookup(net, &tcp_hashinfo, NULL, 0,
> +                                        tuple->saddr.ipv4, tuple->sport,
> +                                        tuple->daddr.ipv4, tuple->dport,
> +                                        dst_if);
> +             else
> +                     return inet6_lookup(net, &tcp_hashinfo, NULL, 0,
> +                                         src6, tuple->sport,
> +                                         dst6, tuple->dport, dst_if);
> +     } else if (tuple->proto == IPPROTO_UDP) {
> +             if (tuple->family == AF_INET)
> +                     return udp4_lib_lookup(net, tuple->saddr.ipv4,
> +                                            tuple->sport, tuple->daddr.ipv4,
> +                                            tuple->dport, dst_if);
> +             else
> +                     return udp6_lib_lookup(net, src6, tuple->sport,
> +                                            dst6, tuple->dport, dst_if);
> +     } else {
> +             return ERR_PTR(-EOPNOTSUPP);
> +     }
> +
> +     return NULL;
> +}
> +
> +BPF_CALL_5(bpf_sk_lookup, struct sk_buff *, skb,
> +        struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
> +{
> +     struct net *caller_net = dev_net(skb->dev);
> +     struct sock *sk = NULL;
> +     struct net *net;
> +
> +     /* XXX: Perform verification-time checking of tuple size? */
> +     if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags))
> +             goto out;
> +
> +     net = get_net_ns_by_id(caller_net, netns_id);
> +     if (unlikely(!net))
> +             goto out;
> +
> +     sk = sk_lookup(net, tuple);
> +     put_net(net);
> +     if (IS_ERR_OR_NULL(sk))
> +             sk = NULL;
> +     else
> +             sk = sk_to_full_sk(sk);
> +out:
> +     return (unsigned long) sk;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_lookup_proto = {
> +     .func           = bpf_sk_lookup,
> +     .gpl_only       = false,
> +     .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
> +     .arg1_type      = ARG_PTR_TO_CTX,
> +     .arg2_type      = ARG_PTR_TO_MEM,
> +     .arg3_type      = ARG_CONST_SIZE,
> +     .arg4_type      = ARG_ANYTHING,
> +     .arg5_type      = ARG_ANYTHING,
> +};
> +
> +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
> +{
> +     sock_gen_put(sk);
> +     if (unlikely(flags))
> +             return -EINVAL;
> +     return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_release_proto = {
> +     .func           = bpf_sk_release,
> +     .gpl_only       = false,
> +     .ret_type       = RET_INTEGER,
> +     .arg1_type      = ARG_PTR_TO_SOCKET,
> +     .arg2_type      = ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -4181,6 +4275,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>       case BPF_FUNC_skb_get_xfrm_state:
>               return &bpf_skb_get_xfrm_state_proto;
>  #endif
> +     case BPF_FUNC_sk_lookup:
> +             return &bpf_sk_lookup_proto;
> +     case BPF_FUNC_sk_release:
> +             return &bpf_sk_release_proto;
>       default:
>               return bpf_base_func_proto(func_id);
>       }
> @@ -4292,6 +4390,10 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>               return &bpf_get_socket_uid_proto;
>       case BPF_FUNC_sk_redirect_map:
>               return &bpf_sk_redirect_map_proto;
> +     case BPF_FUNC_sk_lookup:
> +             return &bpf_sk_lookup_proto;
> +     case BPF_FUNC_sk_release:
> +             return &bpf_sk_release_proto;
>       default:
>               return bpf_base_func_proto(func_id);
>       }
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index fff51c187d1e..29f38838dbca 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -117,6 +117,7 @@ enum bpf_map_type {
>       BPF_MAP_TYPE_DEVMAP,
>       BPF_MAP_TYPE_SOCKMAP,
>       BPF_MAP_TYPE_CPUMAP,
> +     BPF_MAP_TYPE_XSKMAP,
>  };
>  
>  enum bpf_prog_type {
> @@ -1827,6 +1828,25 @@ union bpf_attr {
>   *   Return
>   *           0 on success, or a negative error in case of failure.
>   *
> + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
> + *   Description
> + *           Look for socket matching 'tuple'. The return value must be checked,
> + *           and if non-NULL, released via bpf_sk_release().
> + *           @ctx: pointer to ctx
> + *           @tuple: pointer to struct bpf_sock_tuple
> + *           @tuple_size: size of the tuple
> + *           @flags: flags value
> + *   Return
> + *           pointer to socket ops on success, or
> + *           NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + *   Description
> + *           Release the reference held by 'sock'.
> + *           @sock: Pointer reference to release. Must be found via bpf_sk_lookup().
> + *           @flags: flags value
> + *   Return
> + *           0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)                \
>       FN(unspec),                     \
> @@ -1897,7 +1917,9 @@ union bpf_attr {
>       FN(xdp_adjust_tail),            \
>       FN(skb_get_xfrm_state),         \
>       FN(get_stack),                  \
> -     FN(skb_load_bytes_relative),
> +     FN(skb_load_bytes_relative),    \
> +     FN(sk_lookup),                  \
> +     FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2059,6 +2081,22 @@ struct bpf_sock {
>                                */
>  };
>  
> +struct bpf_sock_tuple {
> +     union {
> +             __be32 ipv6[4];
> +             __be32 ipv4;
> +     } saddr;
> +     union {
> +             __be32 ipv6[4];
> +             __be32 ipv4;
> +     } daddr;
> +     __be16 sport;
> +     __be16 dport;
> +     __u32 dst_if;
> +     __u8 family;
> +     __u8 proto;
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index 265f8e0e8ada..4dc311ea0c16 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -103,6 +103,13 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
>       (void *) BPF_FUNC_skb_get_xfrm_state;
>  static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
>       (void *) BPF_FUNC_get_stack;
> +static struct bpf_sock *(*bpf_sk_lookup)(void *ctx,
> +                                      struct bpf_sock_tuple *tuple,
> +                                      int size, unsigned int netns_id,
> +                                      unsigned long long flags) =
> +     (void *) BPF_FUNC_sk_lookup;
> +static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
> +     (void *) BPF_FUNC_sk_release;
>  
>  /* llvm builtin functions that eBPF C program may use to
>   * emit BPF_LD_ABS and BPF_LD_IND instructions
> -- 
> 2.14.1
>

Re: [RFC bpf-next 07/11] bpf: Add helper to retrieve socket in BPF

Reply via email to