Hi Joe, couple of comments inline:
On 09/21/2018 07:10 PM, Joe Stringer wrote: > This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and > bpf_sk_lookup_udp() which allows BPF programs to find out if there is a > socket listening on this host, and returns a socket pointer which the > BPF program can then access to determine, for instance, whether to > forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the > socket, so when a BPF program makes use of this function, it must > subsequently pass the returned pointer into the newly added sk_release() > to return the reference. > > By way of example, the following pseudocode would filter inbound > connections at XDP if there is no corresponding service listening for > the traffic: > > struct bpf_sock_tuple tuple; > struct bpf_sock_ops *sk; > > populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet > sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0); > if (!sk) { > // Couldn't find a socket listening for this traffic. Drop. > return TC_ACT_SHOT; > } > bpf_sk_release(sk, 0); > return TC_ACT_OK; > > Signed-off-by: Joe Stringer <j...@wand.net.nz> > > --- > > v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer > Limit netns_id field to 32 bits > Fix compile error with CONFIG_IPV6 enabled > Allow direct packet access from helper > --- > include/uapi/linux/bpf.h | 57 ++++++++- > kernel/bpf/verifier.c | 8 +- > net/core/filter.c | 149 ++++++++++++++++++++++ > tools/include/uapi/linux/bpf.h | 57 ++++++++- > tools/testing/selftests/bpf/bpf_helpers.h | 12 ++ > 5 files changed, 280 insertions(+), 3 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index aa5ccd2385ed..620adbb09a94 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -2143,6 +2143,41 @@ union bpf_attr { > * request in the skb. > * Return > * 0 on success, or a negative error in case of failure. > + * > + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, > flags) Nit: could you add proper signature like in other cases that are documented? > + * Decription > + * Look for TCP socket matching 'tuple'. The return value must > + * be checked, and if non-NULL, released via bpf_sk_release(). > + * @ctx: pointer to ctx > + * @tuple: pointer to struct bpf_sock_tuple > + * @tuple_size: size of the tuple > + * @netns: network namespace id > + * @flags: flags value Should probably say in all three cases that it's unused right now and reserved for future. > + * Return > + * pointer to socket ops on success, or > + * NULL in case of failure > + * > + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, > flags) > + * Decription > + * Look for UDP socket matching 'tuple'. The return value must > + * be checked, and if non-NULL, released via bpf_sk_release(). > + * @ctx: pointer to ctx > + * @tuple: pointer to struct bpf_sock_tuple > + * @tuple_size: size of the tuple > + * @netns: network namespace id > + * @flags: flags value > + * Return > + * pointer to socket ops on success, or > + * NULL in case of failure > + * > + * int bpf_sk_release(sock, flags) > + * Description > + * Release the reference held by 'sock'. > + * @sock: Pointer reference to release. Must be found via > + * bpf_sk_lookup_xxx(). > + * @flags: flags value > + * Return > + * 0 on success, or a negative error in case of failure. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -2228,7 +2263,10 @@ union bpf_attr { > FN(get_current_cgroup_id), \ > FN(get_local_storage), \ > FN(sk_select_reuseport), \ > - FN(skb_ancestor_cgroup_id), > + FN(skb_ancestor_cgroup_id), \ > + FN(sk_lookup_tcp), \ > + FN(sk_lookup_udp), \ > + FN(sk_release), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > @@ -2398,6 +2436,23 @@ struct bpf_sock { > */ > }; > > +struct bpf_sock_tuple { > + union { > + struct { > + __be32 saddr; > + __be32 daddr; > + __be16 sport; > + __be16 dport; > + } ipv4; > + struct { > + __be32 saddr[4]; > + __be32 daddr[4]; > + __be16 sport; > + __be16 dport; > + } ipv6; > + }; > +}; > + > #define XDP_PACKET_HEADROOM 256 > > /* User return codes for XDP prog type. > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 16818508b225..7b7fa94aba58 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const > bpf_verifier_ops[] = { > * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the > type > * passes through a NULL-check conditional. For the branch wherein the state > is > * changed to CONST_IMM, the verifier releases the reference. > + * > + * For each helper function that allocates a reference, such as > + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as > + * bpf_sk_release(). When a reference type passes into the release function, > + * the verifier also releases the reference. If any unchecked or unreleased > + * reference remains at the end of the program, the verifier rejects it. > */ > > /* verifier_state + insn_idx are pushed to stack when branch is encountered > */ > @@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type) > */ > static bool is_release_function(enum bpf_func_id func_id) > { > - return false; > + return func_id == BPF_FUNC_sk_release; > } > > /* string representation of 'enum bpf_reg_type' */ > diff --git a/net/core/filter.c b/net/core/filter.c > index 43f81883f31d..f0715e2e7b07 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -58,13 +58,17 @@ > #include <net/busy_poll.h> > #include <net/tcp.h> > #include <net/xfrm.h> > +#include <net/udp.h> > #include <linux/bpf_trace.h> > #include <net/xdp_sock.h> > #include <linux/inetdevice.h> > +#include <net/inet_hashtables.h> > +#include <net/inet6_hashtables.h> > #include <net/ip_fib.h> > #include <net/flow.h> > #include <net/arp.h> > #include <net/ipv6.h> > +#include <net/net_namespace.h> > #include <linux/seg6_local.h> > #include <net/seg6.h> > #include <net/seg6_local.h> > @@ -4812,6 +4816,139 @@ static const struct bpf_func_proto > bpf_lwt_seg6_adjust_srh_proto = { > }; > #endif /* CONFIG_IPV6_SEG6_BPF */ > > +struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, > + struct sk_buff *skb, u8 family, u8 proto) > +{ > + int dif = skb->dev->ifindex; > + bool refcounted = false; > + struct sock *sk = NULL; > + > + if (family == AF_INET) { > + __be32 src4 = tuple->ipv4.saddr; > + __be32 dst4 = tuple->ipv4.daddr; > + int sdif = inet_sdif(skb); > + > + if (proto == IPPROTO_TCP) > + sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, > + src4, tuple->ipv4.sport, > + dst4, tuple->ipv4.dport, > + dif, sdif, &refcounted); > + else > + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, > + dst4, tuple->ipv4.dport, > + dif, sdif, &udp_table, skb); > +#if IS_ENABLED(CONFIG_IPV6) > + } else { > + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; > + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; > + int sdif = inet6_sdif(skb); > + > + if (proto == IPPROTO_TCP) > + sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, > + src6, tuple->ipv6.sport, > + dst6, tuple->ipv6.dport, > + dif, sdif, &refcounted); > + else > + sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, > + dst6, tuple->ipv6.dport, > + dif, sdif, &udp_table, skb); > +#endif > + } > + > + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { > + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); > + sk = NULL; > + } > + return sk; > +} > + > +/* bpf_sk_lookup performs the core lookup for different types of sockets, > + * taking a reference on the socket if it doesn't have the flag > SOCK_RCU_FREE. > + * Returns the socket as an 'unsigned long' to simplify the casting in the > + * callers to satisfy BPF_CALL declarations. > + */ > +static unsigned long > +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, > + u8 proto, u64 netns_id, u64 flags) > +{ > + struct net *caller_net = dev_net(skb->dev); For sk_skb programs, are we *always* guaranteed to have a skb->dev assigned? This definitely holds true for tc programs, but afaik not for sk_skb ones where you enable the helpers below, so this would result in a NULL ptr dereference. > + struct sock *sk = NULL; > + u8 family = AF_UNSPEC; > + struct net *net; > + > + family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; > + if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) > + goto out; > + > + if (netns_id) > + net = get_net_ns_by_id(caller_net, netns_id); > + else > + net = caller_net; > + if (unlikely(!net)) > + goto out; > + sk = sk_lookup(net, tuple, skb, family, proto); > + put_net(net); > + > + if (sk) > + sk = sk_to_full_sk(sk); > +out: > + return (unsigned long) sk; > +} > + > +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, > + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) > +{ > + return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); > +} > + > +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { > + .func = bpf_sk_lookup_tcp, > + .gpl_only = false, > + .pkt_access = true, > + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_PTR_TO_MEM, > + .arg3_type = ARG_CONST_SIZE, > + .arg4_type = ARG_ANYTHING, > + .arg5_type = ARG_ANYTHING, > +}; > + > +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, > + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) > +{ > + return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); > +} > + > +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { > + .func = bpf_sk_lookup_udp, > + .gpl_only = false, > + .pkt_access = true, > + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_PTR_TO_MEM, > + .arg3_type = ARG_CONST_SIZE, > + .arg4_type = ARG_ANYTHING, > + .arg5_type = ARG_ANYTHING, > +}; > + > +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags) > +{ > + if (!sock_flag(sk, SOCK_RCU_FREE)) > + sock_gen_put(sk); > + > + if (unlikely(flags)) > + return -EINVAL; I guess it's probably okay to leave here, though I'm wondering whether it's worth to have a flags part in general in this helper. We need to release the reference in any case beforehands anyway as we otherwise leak it. Personally, I'd probably that arg here. > + return 0; > +} > + > +static const struct bpf_func_proto bpf_sk_release_proto = { > + .func = bpf_sk_release, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_SOCKET, > + .arg2_type = ARG_ANYTHING, > +}; > + > bool bpf_helper_changes_pkt_data(void *func) > { > if (func == bpf_skb_vlan_push || > @@ -5018,6 +5155,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const > struct bpf_prog *prog) > case BPF_FUNC_skb_ancestor_cgroup_id: > return &bpf_skb_ancestor_cgroup_id_proto; > #endif > + case BPF_FUNC_sk_lookup_tcp: > + return &bpf_sk_lookup_tcp_proto; > + case BPF_FUNC_sk_lookup_udp: > + return &bpf_sk_lookup_udp_proto; > + case BPF_FUNC_sk_release: > + return &bpf_sk_release_proto; > default: > return bpf_base_func_proto(func_id); > } > @@ -5118,6 +5261,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const > struct bpf_prog *prog) > return &bpf_sk_redirect_hash_proto; > case BPF_FUNC_get_local_storage: > return &bpf_get_local_storage_proto; > + case BPF_FUNC_sk_lookup_tcp: > + return &bpf_sk_lookup_tcp_proto; > + case BPF_FUNC_sk_lookup_udp: > + return &bpf_sk_lookup_udp_proto; > + case BPF_FUNC_sk_release: > + return &bpf_sk_release_proto; (See comment above.) > default: > return bpf_base_func_proto(func_id); > } > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index aa5ccd2385ed..620adbb09a94 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -2143,6 +2143,41 @@ union bpf_attr { > * request in the skb. > * Return > * 0 on success, or a negative error in case of failure. > + * > + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, > flags) > + * Decription > + * Look for TCP socket matching 'tuple'. The return value must > + * be checked, and if non-NULL, released via bpf_sk_release(). > + * @ctx: pointer to ctx > + * @tuple: pointer to struct bpf_sock_tuple > + * @tuple_size: size of the tuple > + * @netns: network namespace id > + * @flags: flags value > + * Return > + * pointer to socket ops on success, or > + * NULL in case of failure > + * > + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, > flags) > + * Decription > + * Look for UDP socket matching 'tuple'. The return value must > + * be checked, and if non-NULL, released via bpf_sk_release(). > + * @ctx: pointer to ctx > + * @tuple: pointer to struct bpf_sock_tuple > + * @tuple_size: size of the tuple > + * @netns: network namespace id > + * @flags: flags value > + * Return > + * pointer to socket ops on success, or > + * NULL in case of failure > + * > + * int bpf_sk_release(sock, flags) > + * Description > + * Release the reference held by 'sock'. > + * @sock: Pointer reference to release. Must be found via > + * bpf_sk_lookup_xxx(). > + * @flags: flags value > + * Return > + * 0 on success, or a negative error in case of failure. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -2228,7 +2263,10 @@ union bpf_attr { > FN(get_current_cgroup_id), \ > FN(get_local_storage), \ > FN(sk_select_reuseport), \ > - FN(skb_ancestor_cgroup_id), > + FN(skb_ancestor_cgroup_id), \ > + FN(sk_lookup_tcp), \ > + FN(sk_lookup_udp), \ > + FN(sk_release), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > @@ -2398,6 +2436,23 @@ struct bpf_sock { > */ > }; > > +struct bpf_sock_tuple { > + union { > + struct { > + __be32 saddr; > + __be32 daddr; > + __be16 sport; > + __be16 dport; > + } ipv4; > + struct { > + __be32 saddr[4]; > + __be32 daddr[4]; > + __be16 sport; > + __be16 dport; > + } ipv6; > + }; > +}; > + > #define XDP_PACKET_HEADROOM 256 > > /* User return codes for XDP prog type. > diff --git a/tools/testing/selftests/bpf/bpf_helpers.h > b/tools/testing/selftests/bpf/bpf_helpers.h > index e4be7730222d..88ce00c3aa0f 100644 > --- a/tools/testing/selftests/bpf/bpf_helpers.h > +++ b/tools/testing/selftests/bpf/bpf_helpers.h > @@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void > *ctx) = > (void *) BPF_FUNC_skb_cgroup_id; > static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int > level) = > (void *) BPF_FUNC_skb_ancestor_cgroup_id; > +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, > + struct bpf_sock_tuple *tuple, > + int size, unsigned int netns_id, > + unsigned long long flags) = > + (void *) BPF_FUNC_sk_lookup_tcp; > +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, > + struct bpf_sock_tuple *tuple, > + int size, unsigned int netns_id, > + unsigned long long flags) = > + (void *) BPF_FUNC_sk_lookup_udp; > +static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) = > + (void *) BPF_FUNC_sk_release; > > /* llvm builtin functions that eBPF C program may use to > * emit BPF_LD_ABS and BPF_LD_IND instructions >