On Thu, Aug 3, 2017 at 4:37 PM, John Fastabend <john.fastab...@gmail.com> wrote: > Recently we added a new map type called dev map used to forward XDP > packets between ports (6093ec2dc313). This patch introduces a > similar notion for sockets. > > A sockmap allows users to add participating sockets to a map. When > sockets are added to the map enough context is stored with the > map entry to use the entry with a new helper > > bpf_sk_redirect_map(map, key, flags) > > This helper (analogous to bpf_redirect_map in XDP) is given the map > and an entry in the map. When called from a sockmap program, discussed > below, the skb will be sent on the socket using skb_send_sock(). > > With the above we need a bpf program to call the helper from that will > then implement the send logic. The initial site implemented in this > series is the recv_sock hook. For this to work we implemented a map > attach command to add attributes to a map. In sockmap we add two > programs a parse program and a verdict program. The parse program > uses strparser to build messages and pass them to the verdict program. > The parse program uses normal strparser semantics. The verdict > program is of type SOCKET_FILTER. > > The verdict program returns a verdict BPF_OK, BPF_DROP, BPF_REDIRECT. > When BPF_REDIRECT is returned, expected when bpf program uses > bpf_sk_redirect_map(), the sockmap logic will consult per cpu variables > set by the helper routine and pull the sock entry out of the sock map. > This pattern follows the existing redirect logic in cls and xdp > programs. > Hi John,
I'm a bit confused. Is the verdict program bpf_mux then? I don't see any use of BPF_OK, DROP, or REDIRECT. I assume I'm missing something. Tom > This gives the flow, > > recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock > > As an example use case a message based load balancer may use specific > logic in the verdict program to select the sock to send on. > > Example and sample programs are provided in future patches that > hopefully illustrate the user interfaces. > > TBD: bpf program refcnt'ing needs to be cleaned up, some additional > cleanup in a few error paths, publish performance numbers and some > self tests. > > Signed-off-by: John Fastabend <john.fastab...@gmail.com> > --- > include/linux/bpf.h | 11 + > include/linux/bpf_types.h | 1 > include/uapi/linux/bpf.h | 13 + > kernel/bpf/Makefile | 2 > kernel/bpf/helpers.c | 20 + > kernel/bpf/sockmap.c | 623 > +++++++++++++++++++++++++++++++++++++++++++++ > kernel/bpf/syscall.c | 41 +++ > net/core/filter.c | 51 ++++ > 8 files changed, 759 insertions(+), 3 deletions(-) > create mode 100644 kernel/bpf/sockmap.c > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 6353c74..9ce6aa0 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -15,6 +15,8 @@ > #include <linux/err.h> > #include <linux/rbtree_latch.h> > > +#include <net/sock.h> > + > struct perf_event; > struct bpf_map; > > @@ -29,6 +31,9 @@ struct bpf_map_ops { > /* funcs callable from userspace and from eBPF programs */ > void *(*map_lookup_elem)(struct bpf_map *map, void *key); > int (*map_update_elem)(struct bpf_map *map, void *key, void *value, > u64 flags); > + int (*map_ctx_update_elem)(struct bpf_sock_ops_kern *skops, > + struct bpf_map *map, > + void *key, u64 flags, u64 map_flags); > int (*map_delete_elem)(struct bpf_map *map, void *key); > > /* funcs called by prog_array and perf_event_array map */ > @@ -37,6 +42,7 @@ struct bpf_map_ops { > void (*map_fd_put_ptr)(void *ptr); > u32 (*map_gen_lookup)(struct
bpf_map *map, struct bpf_insn *insn_buf); > u32 (*map_fd_sys_lookup_elem)(void *ptr); > + int (*map_attach)(struct bpf_map *map, struct bpf_prog *p1, struct > bpf_prog *p2); > }; > > struct bpf_map { > @@ -321,6 +327,7 @@ static inline void bpf_long_memcpy(void *dst, const void > *src, u32 size) > > /* Map specifics */ > struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); > +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); > void __dev_map_insert_ctx(struct bpf_map *map, u32 index); > void __dev_map_flush(struct bpf_map *map); > > @@ -378,9 +385,13 @@ static inline void __dev_map_flush(struct bpf_map *map) > } > #endif /* CONFIG_BPF_SYSCALL */ > > +inline struct sock *do_sk_redirect_map(void); > +inline u64 get_sk_redirect_flags(void); > + > /* verifier prototypes for helper functions called from eBPF programs */ > extern const struct bpf_func_proto bpf_map_lookup_elem_proto; > extern const struct bpf_func_proto bpf_map_update_elem_proto; > +extern const struct bpf_func_proto bpf_map_ctx_update_elem_proto; > extern const struct bpf_func_proto bpf_map_delete_elem_proto; > > extern const struct bpf_func_proto bpf_get_prandom_u32_proto; > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h > index b1e1035..930be52 100644 > --- a/include/linux/bpf_types.h > +++ b/include/linux/bpf_types.h > @@ -37,4 +37,5 @@ > BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) > #ifdef CONFIG_NET > BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) > +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) > #endif > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 1106a8c..a89e831 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -105,6 +105,7 @@ enum bpf_map_type { > BPF_MAP_TYPE_ARRAY_OF_MAPS, > BPF_MAP_TYPE_HASH_OF_MAPS, > BPF_MAP_TYPE_DEVMAP, > + BPF_MAP_TYPE_SOCKMAP, > }; > > enum bpf_prog_type { > @@ -129,6 +130,7 @@ enum bpf_attach_type { > BPF_CGROUP_INET_EGRESS, > 
BPF_CGROUP_INET_SOCK_CREATE, > BPF_CGROUP_SOCK_OPS, > + BPF_SOCKMAP_INGRESS, > __MAX_BPF_ATTACH_TYPE > }; > > @@ -205,6 +207,7 @@ enum bpf_attach_type { > __u32 attach_bpf_fd; /* eBPF program to attach */ > __u32 attach_type; > __u32 attach_flags; > + __u32 attach_bpf_fd2; > }; > > struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ > @@ -598,7 +601,9 @@ enum bpf_attach_type { > FN(set_hash), \ > FN(setsockopt), \ > FN(skb_adjust_room), \ > - FN(redirect_map), > + FN(redirect_map), \ > + FN(sk_redirect_map), \ > + FN(map_ctx_update_elem), \ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > @@ -735,6 +740,12 @@ struct xdp_md { > __u32 data_end; > }; > > +enum sk_action { > + SK_ABORTED = 0, > + SK_DROP, > + SK_REDIRECT, > +}; > + > #define BPF_TAG_SIZE 8 > > struct bpf_prog_info { > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > index 48e9270..3089102 100644 > --- a/kernel/bpf/Makefile > +++ b/kernel/bpf/Makefile > @@ -3,7 +3,7 @@ obj-y := core.o > obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o > bpf_lru_list.o lpm_trie.o map_in_map.o > ifeq ($(CONFIG_NET),y) > -obj-$(CONFIG_BPF_SYSCALL) += devmap.o > +obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o > endif > ifeq ($(CONFIG_PERF_EVENTS),y) > obj-$(CONFIG_BPF_SYSCALL) += stackmap.o > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index 3d24e23..feb38e0 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -43,6 +43,26 @@ > .arg2_type = ARG_PTR_TO_MAP_KEY, > }; > > +BPF_CALL_5(bpf_ctx_map_update_elem, struct bpf_sock_ops_kern *, bpf_sock, > + struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) > +{ > + WARN_ON_ONCE(!rcu_read_lock_held()); > + return map->ops->map_ctx_update_elem(bpf_sock, map, key, > + flags, map_flags); > +} > + > +const struct bpf_func_proto bpf_map_ctx_update_elem_proto = 
{ > + .func = bpf_ctx_map_update_elem, > + .gpl_only = false, > + .pkt_access = true, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_CONST_MAP_PTR, > + .arg3_type = ARG_PTR_TO_MAP_KEY, > + .arg4_type = ARG_ANYTHING, > + .arg5_type = ARG_ANYTHING, > +}; > + > BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, > void *, value, u64, flags) > { > diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c > new file mode 100644 > index 0000000..9e88c32 > --- /dev/null > +++ b/kernel/bpf/sockmap.c > @@ -0,0 +1,623 @@ > +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of version 2 of the GNU General Public > + * License as published by the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, but > + * WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + */ > + > +/* A BPF sock_map is used to store sock objects. This is primarly used > + * for doing socket redirect with BPF helper routines. > + * > + * A sock map may have two BPF programs attached to it, a program used > + * to parse packets and a program to provide a verdict and redirect > + * decision on the packet. If no BPF parse program is provided it is > + * assumed that every skb is a "message" (skb->len). Otherwise the > + * parse program is attached to strparser and used to build messages > + * that may span multiple skbs. The verdict program will either select > + * a socket to send/receive the skb on or provide the drop code indicating > + * the skb should be dropped. More actions may be added later as needed. > + * The default program will drop packets. 
> + * > + * For reference this program is similar to devmap used in XDP context > + * reviewing these together may be useful. For a set of examples and > + * test codes using this map please review ./samples/bpf/sockmap/ here > + * you can find common usages such as a socket level load balancer and > + * cgroup integration. > + */ > +#include <linux/bpf.h> > +#include <linux/jhash.h> > +#include <linux/filter.h> > +#include <net/sock.h> > +#include <linux/rculist_nulls.h> > +#include "percpu_freelist.h" > +#include "bpf_lru_list.h" > +#include "map_in_map.h" > + > +#include <linux/errno.h> > +#include <linux/file.h> > +#include <linux/in.h> > +#include <linux/kernel.h> > +#include <linux/module.h> > +#include <linux/net.h> > +#include <linux/rculist.h> > +#include <linux/skbuff.h> > +#include <linux/socket.h> > +#include <linux/workqueue.h> > +#include <linux/list.h> > +#include <linux/bpf.h> > +#include <net/strparser.h> > +#include <net/netns/generic.h> > +#include <net/sock.h> > + > +struct bpf_stab { > + struct bpf_map map; > + struct sock **sock_map; > + struct bpf_prog *bpf_parse; > + struct bpf_prog *bpf_mux; > +}; > + > +struct smap_psock { > + struct rcu_head rcu; > + > + /* datapath variables used under sock lock */ > + struct sk_buff_head rxqueue; > + > + bool strp_enabled; > + > + /* datapath error path cache across tx work invocations */ > + int save_rem; > + int save_off; > + struct sk_buff *save_skb; > + u32 tx_stopped : 1; > + > + struct strparser strp; > + struct bpf_prog *bpf_parse; > + struct bpf_prog *bpf_mux; > + struct bpf_map *map; > + > + /* Back reference to the file descriptor of the sock */ > + int key; > + struct sock *sock; > + > + struct work_struct tx_work; > + > + void (*save_data_ready)(struct sock *sk); > + void (*save_write_space)(struct sock *sk); > + void (*save_state_change)(struct sock *sk); > +}; > + > +static inline struct smap_psock *smap_psock_sk(const struct sock *sk) > +{ > + return (struct smap_psock *)sk->sk_user_data; 
> +} > + > +static int smap_mux_func(struct smap_psock *psock, struct sk_buff *skb) > +{ > + struct bpf_prog *prog = psock->bpf_mux; > + int rc; > + > + if (unlikely(!prog)) > + return 0; > + > + skb->sk = psock->sock; > + rc = (*prog->bpf_func)(skb, prog->insnsi); > + skb->sk = NULL; > + > + return rc; > +} > + > +static struct smap_psock *smap_peers_get(struct smap_psock *psock, > + struct sk_buff *skb) > +{ > + struct sock *sock; > + int rc; > + > + rc = smap_mux_func(psock, skb); > + if (unlikely(rc < 0)) > + return NULL; > + > + sock = do_sk_redirect_map(); > + if (unlikely(!sock)) > + return NULL; > + > + return smap_psock_sk(sock); > +} > + > +static void smap_report_sk_error(struct smap_psock *psock, int err) > +{ > + struct sock *sk = psock->sock; > + > + sk->sk_err = err; > + sk->sk_error_report(sk); > +} > + > +static int sock_map_delete_elem(struct bpf_map *map, void *key); > + > +static void smap_state_change(struct sock *sk) > +{ > + struct smap_psock *psock = smap_psock_sk(sk); > + > + /* Allowing transitions into established an syn_recv states allows > + * for early binding sockets to a smap object before the connection > + * is established. All other transitions indicate the connection is > + * being torn down so tear down the smap socket. 
> + */ > + switch (sk->sk_state) { > + case TCP_SYN_RECV: > + case TCP_ESTABLISHED: > + break; > + case TCP_CLOSE_WAIT: > + case TCP_CLOSING: > + case TCP_LAST_ACK: > + case TCP_FIN_WAIT1: > + case TCP_FIN_WAIT2: > + case TCP_LISTEN: > + break; > + case TCP_CLOSE: > + sock_map_delete_elem(psock->map, &psock->key); > + break; > + default: > + smap_report_sk_error(psock, EPIPE); > + break; > + } > +} > + > +static void smap_tx_work(struct work_struct *w); > + > +void schedule_writer(struct smap_psock *psock) > +{ > + schedule_work(&psock->tx_work); > +} > + > +static int smap_tx_writer(struct smap_psock *peer) > +{ > + schedule_writer(peer); > + return 0; > +} > + > +static void smap_read_sock_strparser(struct strparser *strp, > + struct sk_buff *skb) > +{ > + struct smap_psock *psock = container_of(strp, > + struct smap_psock, strp); > + struct smap_psock *peer; > + > + /* TBD useful dbg, add trace here with output sock index or drop */ > + rcu_read_lock(); > + peer = smap_peers_get(psock, skb); > + if (unlikely(!peer)) { > + kfree_skb(skb); > + goto out; > + } > + > + skb_queue_tail(&peer->rxqueue, skb); > + smap_tx_writer(peer); > +out: > + rcu_read_unlock(); > +} > + > +/* Called with lock held on socket */ > +static void smap_data_ready(struct sock *sk) > +{ > + struct smap_psock *psock; > + > + read_lock_bh(&sk->sk_callback_lock); > + > + psock = smap_psock_sk(sk); > + if (likely(psock)) > + strp_data_ready(&psock->strp); > + > + read_unlock_bh(&sk->sk_callback_lock); > +} > + > +static void smap_tx_work(struct work_struct *w) > +{ > + struct smap_psock *psock; > + struct sk_buff *skb; > + int rem, off, n; > + > + psock = container_of(w, struct smap_psock, tx_work); > + if (unlikely(psock->tx_stopped)) > + return; > + > + if (psock->save_skb) { > + skb = psock->save_skb; > + rem = psock->save_rem; > + off = psock->save_off; > + psock->save_skb = NULL; > + goto start; > + } > + > + while ((skb = skb_dequeue(&psock->rxqueue))) { > + rem = skb->len; > + off = 0; > 
+start: > + do { > + n = skb_send_sock(psock->sock, skb, off, rem); > + if (n <= 0) { > + if (n == -EAGAIN) { > + /* Save state to try again when > + * there's write space on the > + * socket. > + */ > + psock->save_skb = skb; > + psock->save_rem = rem; > + psock->save_off = off; > + break; > + } > + > + /* Got a hard error or socket had > + * been closed somehow. Report this > + * on the transport socket. > + */ > + smap_report_sk_error(psock, n ? -n : EPIPE); > + psock->tx_stopped = 1; > + break; > + } > + rem -= n; > + off += n; > + } while (rem); > + } > +} > + > +static void smap_write_space(struct sock *sk) > +{ > + struct smap_psock *psock = smap_psock_sk(sk); > + > + schedule_writer(psock); > +} > + > +static void smap_stop_sock(struct smap_psock *psock, bool destroy) > +{ > + struct sock *sk = psock->sock; > + > + write_lock_bh(&sk->sk_callback_lock); > + if (psock->strp_enabled) { > + sk->sk_data_ready = psock->save_data_ready; > + sk->sk_write_space = psock->save_write_space; > + sk->sk_state_change = psock->save_state_change; > + strp_stop(&psock->strp); > + } > + > + if (destroy) > + sk->sk_user_data = NULL; > + write_unlock_bh(&sk->sk_callback_lock); > + > + if (psock->strp_enabled) > + strp_done(&psock->strp); > + psock->strp_enabled = false; > +} > + > +static void smap_destroy_psock(struct rcu_head *rcu) > +{ > + struct smap_psock *psock = container_of(rcu, > + struct smap_psock, rcu); > + > + smap_stop_sock(psock, true); > + cancel_work_sync(&psock->tx_work); > + __skb_queue_purge(&psock->rxqueue); > + sock_put(psock->sock); > + kfree(psock); > +} > + > +static void smap_release_proxy(struct sock *sock) > +{ > + struct smap_psock *psock = smap_psock_sk(sock); > + > + call_rcu(&psock->rcu, smap_destroy_psock); > +} > + > +static int smap_parse_func_strparser(struct strparser *strp, > + struct sk_buff *skb) > +{ > + struct smap_psock *psock = container_of(strp, > + struct smap_psock, strp); > + struct bpf_prog *prog = psock->bpf_parse; > + > + if 
(unlikely(!prog)) > + return skb->len; > + > + return (*prog->bpf_func)(skb, prog->insnsi); > +} > + > + > +static int smap_read_sock_done(struct strparser *strp, int err) > +{ > + return err; > +} > + > +static int smap_init_sock(struct smap_psock *psock, > + struct sock *sock) > +{ > + struct strp_callbacks cb; > + int err; > + > + cb.rcv_msg = smap_read_sock_strparser; > + cb.abort_parser = NULL; > + cb.parse_msg = smap_parse_func_strparser; > + cb.read_sock_done = smap_read_sock_done; > + > + err = strp_init(&psock->strp, sock, &cb); > + if (err) > + return -EINVAL; > + return 0; > +} > + > +static void smap_init_progs(struct smap_psock *psock, struct bpf_stab *stab) > +{ > + /* TBD need prog_put and gets here to avoid programs leaving > + * us or something in attach > + */ > + if (psock->bpf_mux != stab->bpf_mux) > + psock->bpf_mux = stab->bpf_mux; > + > + if (psock->bpf_parse != stab->bpf_parse) > + psock->bpf_parse = stab->bpf_parse; > +} > + > +static int smap_start_sock(struct smap_psock *psock, struct sock *sk) > +{ > + int err = 0; > + > + write_lock_bh(&sk->sk_callback_lock); > + /* only start socket if it is not already running */ > + if (psock->save_data_ready) { > + err = -EINVAL; > + goto out; > + } > + psock->save_data_ready = sk->sk_data_ready; > + psock->save_write_space = sk->sk_write_space; > + psock->save_state_change = sk->sk_state_change; > + sk->sk_data_ready = smap_data_ready; > + sk->sk_write_space = smap_write_space; > + sk->sk_state_change = smap_state_change; > +out: > + write_unlock_bh(&sk->sk_callback_lock); > + return err; > +} > + > +static struct smap_psock *smap_init_psock(struct sock *sock, > + struct bpf_stab *stab) > +{ > + struct smap_psock *psock; > + > + psock = kmalloc(sizeof(struct smap_psock), GFP_ATOMIC); > + if (!psock) > + return ERR_PTR(-ENOMEM); > + > + memset(psock, 0, sizeof(struct smap_psock)); > + smap_init_progs(psock, stab); > + psock->sock = sock; > + > + skb_queue_head_init(&psock->rxqueue); > + 
INIT_WORK(&psock->tx_work, smap_tx_work); > + > + write_lock_bh(&sock->sk_callback_lock); > + sock->sk_user_data = psock; > + write_unlock_bh(&sock->sk_callback_lock); > + > + sock_hold(sock); > + return psock; > +} > + > +#define SOCK_MAP_STRPARSER 0x01 > +/* BPF map logic */ > +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) > +{ > + struct bpf_stab *stab; > + int err = -EINVAL; > + u64 cost; > + > + /* check sanity of attributes */ > + if (attr->max_entries == 0 || attr->key_size != 4 || > + attr->value_size != 4 || attr->map_flags) > + return ERR_PTR(-EINVAL); > + > + /* if value_size is bigger, the user space won't be able to > + * access the elements. > + */ > + if (attr->value_size > KMALLOC_MAX_SIZE) > + return ERR_PTR(-E2BIG); > + > + stab = kzalloc(sizeof(*stab), GFP_USER); > + if (!stab) > + return ERR_PTR(-ENOMEM); > + > + /* mandatory map attributes */ > + stab->map.map_type = attr->map_type; > + stab->map.key_size = attr->key_size; > + stab->map.value_size = attr->value_size; > + stab->map.max_entries = attr->max_entries; > + stab->map.map_flags = attr->map_flags; > + > + > + /* make sure page count doesn't overflow */ > + cost = (u64) stab->map.max_entries * sizeof(struct sock *) + > + sizeof(struct socket *); > + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; > + > + err = -ENOMEM; > + > + /* if map size is larger than memlock limit, reject it early */ > + err = bpf_map_precharge_memlock(stab->map.pages); > + if (err) > + goto free_stab; > + > + stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * > + sizeof(struct sock *)); > + if (!stab->sock_map) > + goto free_stab; > + > + return &stab->map; > + /* TBD release progs on errors */ > +free_stab: > + kfree(stab); > + return ERR_PTR(err); > +} > + > +static void sock_map_free(struct bpf_map *map) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + int i; > + > + synchronize_rcu(); > + > + for (i = 0; i < stab->map.max_entries; i++) { > + struct 
sock *sock; > + > + sock = stab->sock_map[i]; > + if (!sock) > + continue; > + > + smap_release_proxy(sock); > + } > + > + bpf_map_area_free(stab->sock_map); > + if (stab->bpf_mux) > + bpf_prog_put(stab->bpf_mux); > + if (stab->bpf_parse) > + bpf_prog_put(stab->bpf_mux); > + kfree(stab); > +} > + > +static int sock_map_get_next_key(struct bpf_map *map, void *key, void > *next_key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + u32 i = key ? *(u32 *)key : U32_MAX; > + u32 *next = (u32 *)next_key; > + > + if (i >= stab->map.max_entries) { > + *next = 0; > + return 0; > + } > + > + if (i == stab->map.max_entries - 1) > + return -ENOENT; > + > + *next = i + 1; > + return 0; > +} > + > +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + > + if (key >= map->max_entries) > + return NULL; > + > + return stab->sock_map[key]; > +} > + > +static void *sock_map_lookup_elem(struct bpf_map *map, void *key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + struct sock *sock; > + u32 i = *(u32 *)key; > + > + if (i >= map->max_entries) > + return NULL; > + > + sock = stab->sock_map[i]; > + return NULL; > +} > + > +static int sock_map_delete_elem(struct bpf_map *map, void *key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + struct sock *sock; > + int k = *(u32 *)key; > + > + if (k >= map->max_entries) > + return -EINVAL; > + > + sock = stab->sock_map[k]; > + if (!sock) > + return -EINVAL; > + > + smap_release_proxy(sock); > + return 0; > +} > + > +static int sock_map_update_elem(struct bpf_sock_ops_kern *skops, > + struct bpf_map *map, > + void *key, u64 flags, u64 map_flags) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + struct sock *old_sock, *sock; > + struct smap_psock *psock = NULL; > + u32 i = *(u32 *)key; > + bool update = false; > + > + if (unlikely(flags > 
BPF_EXIST)) > + return -EINVAL; > + > + if (unlikely(i >= stab->map.max_entries)) > + return -E2BIG; > + > + if (unlikely(map_flags > SOCK_MAP_STRPARSER)) > + return -EINVAL; > + > + if (flags == BPF_EXIST || flags == BPF_ANY) { > + sock = rcu_dereference(stab->sock_map[i]); > + > + if (!sock && flags == BPF_EXIST) { > + return -ENOENT; > + } else if (sock && sock != skops->sk) { > + return -EINVAL; > + } else if (sock) { > + psock = smap_psock_sk(sock); > + update = true; > + } > + } > + > + if (!psock) { > + sock = skops->sk; > + psock = smap_init_psock(sock, stab); > + if (IS_ERR(psock)) > + return PTR_ERR(psock); > + psock->key = i; > + psock->map = map; > + } > + > + if (map_flags & SOCK_MAP_STRPARSER) { > + smap_start_sock(psock, sock); > + smap_init_progs(psock, stab); > + smap_init_sock(psock, sock); > + psock->strp_enabled = true; > + } else if (update) { > + smap_stop_sock(psock, false); > + } > + > + if (!update) { > + old_sock = xchg(&stab->sock_map[i], skops->sk); > + if (old_sock) > + smap_release_proxy(old_sock); > + } > + > + return 0; > +} > + > +static int sock_map_attach_prog(struct bpf_map *map, > + struct bpf_prog *parse, struct bpf_prog *mux) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + > + stab->bpf_parse = parse; > + stab->bpf_mux = mux; > + return 0; > +} > + > +const struct bpf_map_ops sock_map_ops = { > + .map_alloc = sock_map_alloc, > + .map_free = sock_map_free, > + .map_get_next_key = sock_map_get_next_key, > + .map_lookup_elem = sock_map_lookup_elem, > + .map_ctx_update_elem = sock_map_update_elem, > + .map_delete_elem = sock_map_delete_elem, > + .map_attach = sock_map_attach_prog, > +}; > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 6c772ad..e4f48f5 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -1045,7 +1045,40 @@ static int bpf_obj_get(const union bpf_attr *attr) > > #ifdef CONFIG_CGROUP_BPF > > -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags > +#define 
BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 > + > +static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) > +{ > + struct bpf_prog *prog1, *prog2; > + struct bpf_map *map; > + int err; > + > + map = bpf_map_get_with_uref(attr->target_fd); > + if (IS_ERR(map)) > + return PTR_ERR(map); > + > + if (!map->ops->map_attach) > + return -EOPNOTSUPP; > + > + prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); > + if (IS_ERR(prog1)) > + return PTR_ERR(prog1); > + > + prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); > + if (IS_ERR(prog2)) { > + bpf_prog_put(prog1); > + return PTR_ERR(prog2); > + } > + > + err = map->ops->map_attach(map, prog1, prog2); > + if (err) { > + bpf_prog_put(prog1); > + bpf_prog_put(prog2); > + return PTR_ERR(map); > + } > + > + return err; > +} > > static int bpf_prog_attach(const union bpf_attr *attr) > { > @@ -1074,10 +1107,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) > case BPF_CGROUP_SOCK_OPS: > ptype = BPF_PROG_TYPE_SOCK_OPS; > break; > + case BPF_SOCKMAP_INGRESS: > + ptype = BPF_PROG_TYPE_SOCKET_FILTER; > + break; > default: > return -EINVAL; > } > > + if (attr->attach_type == BPF_SOCKMAP_INGRESS) > + return sockmap_get_from_fd(attr, ptype); > + > prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); > if (IS_ERR(prog)) > return PTR_ERR(prog); > diff --git a/net/core/filter.c b/net/core/filter.c > index 7e97086..2644f2d 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -1845,6 +1845,51 @@ int skb_do_redirect(struct sk_buff *skb) > .arg3_type = ARG_ANYTHING, > }; > > +BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags) > +{ > + struct redirect_info *ri = this_cpu_ptr(&redirect_info); > + > + ri->ifindex = key; > + ri->flags = flags; > + ri->map = map; > + > + return SK_REDIRECT; > +} > + > +inline struct sock *do_sk_redirect_map(void) > +{ > + struct redirect_info *ri = this_cpu_ptr(&redirect_info); > + struct sock *sk = NULL; > + > + if (ri->map) { > + sk = 
__sock_map_lookup_elem(ri->map, ri->ifindex); > + > + ri->ifindex = 0; > + ri->map = NULL; > + /* we do not clear flags for future lookup */ > + } > + > + return sk; > +} > +EXPORT_SYMBOL(do_sk_redirect_map); > + > +inline u64 get_sk_redirect_flags(void) > +{ > + struct redirect_info *ri = this_cpu_ptr(&redirect_info); > + > + return ri->flags; > +} > +EXPORT_SYMBOL(get_sk_redirect_flags); > + > +static const struct bpf_func_proto bpf_sk_redirect_map_proto = { > + .func = bpf_sk_redirect_map, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_CONST_MAP_PTR, > + .arg2_type = ARG_ANYTHING, > + .arg3_type = ARG_ANYTHING, > +}; > + > BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) > { > return task_get_classid(skb); > @@ -3090,6 +3135,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, > const void *src_buff, > return &bpf_get_socket_cookie_proto; > case BPF_FUNC_get_socket_uid: > return &bpf_get_socket_uid_proto; > + case BPF_FUNC_sk_redirect_map: > + return &bpf_sk_redirect_map_proto; > + case BPF_FUNC_map_ctx_update_elem: > + return &bpf_map_ctx_update_elem_proto; > default: > return bpf_base_func_proto(func_id); > } > @@ -3214,6 +3263,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const > void *src_buff, > switch (func_id) { > case BPF_FUNC_setsockopt: > return &bpf_setsockopt_proto; > + case BPF_FUNC_map_ctx_update_elem: > + return &bpf_map_ctx_update_elem_proto; > default: > return bpf_base_func_proto(func_id); > } >