On Thu, Aug 3, 2017 at 4:37 PM, John Fastabend <john.fastab...@gmail.com> wrote: > Recently we added a new map type called dev map used to forward XDP > packets between ports (6093ec2dc313). This patch introduces a > similar notion for sockets. > > A sockmap allows users to add participating sockets to a map. When > sockets are added to the map enough context is stored with the > map entry to use the entry with a new helper > > bpf_sk_redirect_map(map, key, flags) > > This helper (analogous to bpf_redirect_map in XDP) is given the map > and an entry in the map. When called from a sockmap program, discussed > below, the skb will be sent on the socket using skb_send_sock(). > > With the above we need a bpf program to call the helper from that will > then implement the send logic. The initial site implemented in this > series is the recv_sock hook. For this to work we implemented a map > attach command to add attributes to a map. In sockmap we add two > programs a parse program and a verdict program. The parse program > uses strparser to build messages and pass them to the verdict program. > The parse program uses normal strparser semantics. The verdict > program is of type SOCKET_FILTER. > > The verdict program returns a verdict BPF_OK, BPF_DROP, BPF_REDIRECT. > When BPF_REDIRECT is returned, expected when bpf program uses > bpf_sk_redirect_map(), the sockmap logic will consult per cpu variables > set by the helper routine and pull the sock entry out of the sock map. > This pattern follows the existing redirect logic in cls and xdp > programs. > Hi John,
I'm a bit confused. Is the verdict program bpf_mux then? I don't see any use of BPF_OK, DROP, or REDIRECT. I assume I'm missing something. Tom > This gives the flow, > > recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock > > As an example use case a message based load balancer may use specific > logic in the verdict program to select the sock to send on. > > Example and sample programs are provided in future patches that > hopefully illustrate the user interfaces. > > TBD: bpf program refcnt'ing needs to be cleaned up, some additional > cleanup in a few error paths, publish performance numbers and some > self tests. > > Signed-off-by: John Fastabend <john.fastab...@gmail.com> > --- > include/linux/bpf.h | 11 + > include/linux/bpf_types.h | 1 > include/uapi/linux/bpf.h | 13 + > kernel/bpf/Makefile | 2 > kernel/bpf/helpers.c | 20 + > kernel/bpf/sockmap.c | 623 > +++++++++++++++++++++++++++++++++++++++++++++ > kernel/bpf/syscall.c | 41 +++ > net/core/filter.c | 51 ++++ > 8 files changed, 759 insertions(+), 3 deletions(-) > create mode 100644 kernel/bpf/sockmap.c > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 6353c74..9ce6aa0 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -15,6 +15,8 @@ > #include <linux/err.h> > #include <linux/rbtree_latch.h> > > +#include <net/sock.h> > + > struct perf_event; > struct bpf_map; > > @@ -29,6 +31,9 @@ struct bpf_map_ops { > /* funcs callable from userspace and from eBPF programs */ > void *(*map_lookup_elem)(struct bpf_map *map, void *key); > int (*map_update_elem)(struct bpf_map *map, void *key, void *value, > u64 flags); > + int (*map_ctx_update_elem)(struct bpf_sock_ops_kern *skops, > + struct bpf_map *map, > + void *key, u64 flags, u64 map_flags); > int (*map_delete_elem)(struct bpf_map *map, void *key); > > /* funcs called by prog_array and perf_event_array map */ > @@ -37,6 +42,7 @@ struct bpf_map_ops { > void (*map_fd_put_ptr)(void *ptr); > u32 (*map_gen_lookup)(struct
bpf_map *map, struct bpf_insn *insn_buf); > u32 (*map_fd_sys_lookup_elem)(void *ptr); > + int (*map_attach)(struct bpf_map *map, struct bpf_prog *p1, struct > bpf_prog *p2); > }; > > struct bpf_map { > @@ -321,6 +327,7 @@ static inline void bpf_long_memcpy(void *dst, const void > *src, u32 size) > > /* Map specifics */ > struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); > +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); > void __dev_map_insert_ctx(struct bpf_map *map, u32 index); > void __dev_map_flush(struct bpf_map *map); > > @@ -378,9 +385,13 @@ static inline void __dev_map_flush(struct bpf_map *map) > } > #endif /* CONFIG_BPF_SYSCALL */ > > +inline struct sock *do_sk_redirect_map(void); > +inline u64 get_sk_redirect_flags(void); > + > /* verifier prototypes for helper functions called from eBPF programs */ > extern const struct bpf_func_proto bpf_map_lookup_elem_proto; > extern const struct bpf_func_proto bpf_map_update_elem_proto; > +extern const struct bpf_func_proto bpf_map_ctx_update_elem_proto; > extern const struct bpf_func_proto bpf_map_delete_elem_proto; > > extern const struct bpf_func_proto bpf_get_prandom_u32_proto; > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h > index b1e1035..930be52 100644 > --- a/include/linux/bpf_types.h > +++ b/include/linux/bpf_types.h > @@ -37,4 +37,5 @@ > BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) > #ifdef CONFIG_NET > BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) > +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) > #endif > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 1106a8c..a89e831 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -105,6 +105,7 @@ enum bpf_map_type { > BPF_MAP_TYPE_ARRAY_OF_MAPS, > BPF_MAP_TYPE_HASH_OF_MAPS, > BPF_MAP_TYPE_DEVMAP, > + BPF_MAP_TYPE_SOCKMAP, > }; > > enum bpf_prog_type { > @@ -129,6 +130,7 @@ enum bpf_attach_type { > BPF_CGROUP_INET_EGRESS, > 
BPF_CGROUP_INET_SOCK_CREATE, > BPF_CGROUP_SOCK_OPS, > + BPF_SOCKMAP_INGRESS, > __MAX_BPF_ATTACH_TYPE > }; > > @@ -205,6 +207,7 @@ enum bpf_attach_type { > __u32 attach_bpf_fd; /* eBPF program to attach */ > __u32 attach_type; > __u32 attach_flags; > + __u32 attach_bpf_fd2; > }; > > struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ > @@ -598,7 +601,9 @@ enum bpf_attach_type { > FN(set_hash), \ > FN(setsockopt), \ > FN(skb_adjust_room), \ > - FN(redirect_map), > + FN(redirect_map), \ > + FN(sk_redirect_map), \ > + FN(map_ctx_update_elem), \ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > @@ -735,6 +740,12 @@ struct xdp_md { > __u32 data_end; > }; > > +enum sk_action { > + SK_ABORTED = 0, > + SK_DROP, > + SK_REDIRECT, > +}; > + > #define BPF_TAG_SIZE 8 > > struct bpf_prog_info { > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > index 48e9270..3089102 100644 > --- a/kernel/bpf/Makefile > +++ b/kernel/bpf/Makefile > @@ -3,7 +3,7 @@ obj-y := core.o > obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o > bpf_lru_list.o lpm_trie.o map_in_map.o > ifeq ($(CONFIG_NET),y) > -obj-$(CONFIG_BPF_SYSCALL) += devmap.o > +obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o > endif > ifeq ($(CONFIG_PERF_EVENTS),y) > obj-$(CONFIG_BPF_SYSCALL) += stackmap.o > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index 3d24e23..feb38e0 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -43,6 +43,26 @@ > .arg2_type = ARG_PTR_TO_MAP_KEY, > }; > > +BPF_CALL_5(bpf_ctx_map_update_elem, struct bpf_sock_ops_kern *, bpf_sock, > + struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) > +{ > + WARN_ON_ONCE(!rcu_read_lock_held()); > + return map->ops->map_ctx_update_elem(bpf_sock, map, key, > + flags, map_flags); > +} > + > +const struct bpf_func_proto bpf_map_ctx_update_elem_proto = 
{ > + .func = bpf_ctx_map_update_elem, > + .gpl_only = false, > + .pkt_access = true, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_CONST_MAP_PTR, > + .arg3_type = ARG_PTR_TO_MAP_KEY, > + .arg4_type = ARG_ANYTHING, > + .arg5_type = ARG_ANYTHING, > +}; > + > BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, > void *, value, u64, flags) > { > diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c > new file mode 100644 > index 0000000..9e88c32 > --- /dev/null > +++ b/kernel/bpf/sockmap.c > @@ -0,0 +1,623 @@ > +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of version 2 of the GNU General Public > + * License as published by the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, but > + * WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + */ > + > +/* A BPF sock_map is used to store sock objects. This is primarly used > + * for doing socket redirect with BPF helper routines. > + * > + * A sock map may have two BPF programs attached to it, a program used > + * to parse packets and a program to provide a verdict and redirect > + * decision on the packet. If no BPF parse program is provided it is > + * assumed that every skb is a "message" (skb->len). Otherwise the > + * parse program is attached to strparser and used to build messages > + * that may span multiple skbs. The verdict program will either select > + * a socket to send/receive the skb on or provide the drop code indicating > + * the skb should be dropped. More actions may be added later as needed. > + * The default program will drop packets. 
> + * > + * For reference this program is similar to devmap used in XDP context > + * reviewing these together may be useful. For a set of examples and > + * test codes using this map please review ./samples/bpf/sockmap/ here > + * you can find common usages such as a socket level load balancer and > + * cgroup integration. > + */ > +#include <linux/bpf.h> > +#include <linux/jhash.h> > +#include <linux/filter.h> > +#include <net/sock.h> > +#include <linux/rculist_nulls.h> > +#include "percpu_freelist.h" > +#include "bpf_lru_list.h" > +#include "map_in_map.h" > + > +#include <linux/errno.h> > +#include <linux/file.h> > +#include <linux/in.h> > +#include <linux/kernel.h> > +#include <linux/module.h> > +#include <linux/net.h> > +#include <linux/rculist.h> > +#include <linux/skbuff.h> > +#include <linux/socket.h> > +#include <linux/workqueue.h> > +#include <linux/list.h> > +#include <linux/bpf.h> > +#include <net/strparser.h> > +#include <net/netns/generic.h> > +#include <net/sock.h> > + > +struct bpf_stab { > + struct bpf_map map; > + struct sock **sock_map; > + struct bpf_prog *bpf_parse; > + struct bpf_prog *bpf_mux; > +}; > + > +struct smap_psock { > + struct rcu_head rcu; > + > + /* datapath variables used under sock lock */ > + struct sk_buff_head rxqueue; > + > + bool strp_enabled; > + > + /* datapath error path cache across tx work invocations */ > + int save_rem; > + int save_off; > + struct sk_buff *save_skb; > + u32 tx_stopped : 1; > + > + struct strparser strp; > + struct bpf_prog *bpf_parse; > + struct bpf_prog *bpf_mux; > + struct bpf_map *map; > + > + /* Back reference to the file descriptor of the sock */ > + int key; > + struct sock *sock; > + > + struct work_struct tx_work; > + > + void (*save_data_ready)(struct sock *sk); > + void (*save_write_space)(struct sock *sk); > + void (*save_state_change)(struct sock *sk); > +}; > + > +static inline struct smap_psock *smap_psock_sk(const struct sock *sk) > +{ > + return (struct smap_psock *)sk->sk_user_data; 
> +} > + > +static int smap_mux_func(struct smap_psock *psock, struct sk_buff *skb) > +{ > + struct bpf_prog *prog = psock->bpf_mux; > + int rc; > + > + if (unlikely(!prog)) > + return 0; > + > + skb->sk = psock->sock; > + rc = (*prog->bpf_func)(skb, prog->insnsi); > + skb->sk = NULL; > + > + return rc; > +} > + > +static struct smap_psock *smap_peers_get(struct smap_psock *psock, > + struct sk_buff *skb) > +{ > + struct sock *sock; > + int rc; > + > + rc = smap_mux_func(psock, skb); > + if (unlikely(rc < 0)) > + return NULL; > + > + sock = do_sk_redirect_map(); > + if (unlikely(!sock)) > + return NULL; > + > + return smap_psock_sk(sock); > +} > + > +static void smap_report_sk_error(struct smap_psock *psock, int err) > +{ > + struct sock *sk = psock->sock; > + > + sk->sk_err = err; > + sk->sk_error_report(sk); > +} > + > +static int sock_map_delete_elem(struct bpf_map *map, void *key); > + > +static void smap_state_change(struct sock *sk) > +{ > + struct smap_psock *psock = smap_psock_sk(sk); > + > + /* Allowing transitions into established an syn_recv states allows > + * for early binding sockets to a smap object before the connection > + * is established. All other transitions indicate the connection is > + * being torn down so tear down the smap socket. 
> + */ > + switch (sk->sk_state) { > + case TCP_SYN_RECV: > + case TCP_ESTABLISHED: > + break; > + case TCP_CLOSE_WAIT: > + case TCP_CLOSING: > + case TCP_LAST_ACK: > + case TCP_FIN_WAIT1: > + case TCP_FIN_WAIT2: > + case TCP_LISTEN: > + break; > + case TCP_CLOSE: > + sock_map_delete_elem(psock->map, &psock->key); > + break; > + default: > + smap_report_sk_error(psock, EPIPE); > + break; > + } > +} > + > +static void smap_tx_work(struct work_struct *w); > + > +void schedule_writer(struct smap_psock *psock) > +{ > + schedule_work(&psock->tx_work); > +} > + > +static int smap_tx_writer(struct smap_psock *peer) > +{ > + schedule_writer(peer); > + return 0; > +} > + > +static void smap_read_sock_strparser(struct strparser *strp, > + struct sk_buff *skb) > +{ > + struct smap_psock *psock = container_of(strp, > + struct smap_psock, strp); > + struct smap_psock *peer; > + > + /* TBD useful dbg, add trace here with output sock index or drop */ > + rcu_read_lock(); > + peer = smap_peers_get(psock, skb); > + if (unlikely(!peer)) { > + kfree_skb(skb); > + goto out; > + } > + > + skb_queue_tail(&peer->rxqueue, skb); > + smap_tx_writer(peer); > +out: > + rcu_read_unlock(); > +} > + > +/* Called with lock held on socket */ > +static void smap_data_ready(struct sock *sk) > +{ > + struct smap_psock *psock; > + > + read_lock_bh(&sk->sk_callback_lock); > + > + psock = smap_psock_sk(sk); > + if (likely(psock)) > + strp_data_ready(&psock->strp); > + > + read_unlock_bh(&sk->sk_callback_lock); > +} > + > +static void smap_tx_work(struct work_struct *w) > +{ > + struct smap_psock *psock; > + struct sk_buff *skb; > + int rem, off, n; > + > + psock = container_of(w, struct smap_psock, tx_work); > + if (unlikely(psock->tx_stopped)) > + return; > + > + if (psock->save_skb) { > + skb = psock->save_skb; > + rem = psock->save_rem; > + off = psock->save_off; > + psock->save_skb = NULL; > + goto start; > + } > + > + while ((skb = skb_dequeue(&psock->rxqueue))) { > + rem = skb->len; > + off = 0; > 
+start: > + do { > + n = skb_send_sock(psock->sock, skb, off, rem); > + if (n <= 0) { > + if (n == -EAGAIN) { > + /* Save state to try again when > + * there's write space on the > + * socket. > + */ > + psock->save_skb = skb; > + psock->save_rem = rem; > + psock->save_off = off; > + break; > + } > + > + /* Got a hard error or socket had > + * been closed somehow. Report this > + * on the transport socket. > + */ > + smap_report_sk_error(psock, n ? -n : EPIPE); > + psock->tx_stopped = 1; > + break; > + } > + rem -= n; > + off += n; > + } while (rem); > + } > +} > + > +static void smap_write_space(struct sock *sk) > +{ > + struct smap_psock *psock = smap_psock_sk(sk); > + > + schedule_writer(psock); > +} > + > +static void smap_stop_sock(struct smap_psock *psock, bool destroy) > +{ > + struct sock *sk = psock->sock; > + > + write_lock_bh(&sk->sk_callback_lock); > + if (psock->strp_enabled) { > + sk->sk_data_ready = psock->save_data_ready; > + sk->sk_write_space = psock->save_write_space; > + sk->sk_state_change = psock->save_state_change; > + strp_stop(&psock->strp); > + } > + > + if (destroy) > + sk->sk_user_data = NULL; > + write_unlock_bh(&sk->sk_callback_lock); > + > + if (psock->strp_enabled) > + strp_done(&psock->strp); > + psock->strp_enabled = false; > +} > + > +static void smap_destroy_psock(struct rcu_head *rcu) > +{ > + struct smap_psock *psock = container_of(rcu, > + struct smap_psock, rcu); > + > + smap_stop_sock(psock, true); > + cancel_work_sync(&psock->tx_work); > + __skb_queue_purge(&psock->rxqueue); > + sock_put(psock->sock); > + kfree(psock); > +} > + > +static void smap_release_proxy(struct sock *sock) > +{ > + struct smap_psock *psock = smap_psock_sk(sock); > + > + call_rcu(&psock->rcu, smap_destroy_psock); > +} > + > +static int smap_parse_func_strparser(struct strparser *strp, > + struct sk_buff *skb) > +{ > + struct smap_psock *psock = container_of(strp, > + struct smap_psock, strp); > + struct bpf_prog *prog = psock->bpf_parse; > + > + if 
(unlikely(!prog)) > + return skb->len; > + > + return (*prog->bpf_func)(skb, prog->insnsi); > +} > + > + > +static int smap_read_sock_done(struct strparser *strp, int err) > +{ > + return err; > +} > + > +static int smap_init_sock(struct smap_psock *psock, > + struct sock *sock) > +{ > + struct strp_callbacks cb; > + int err; > + > + cb.rcv_msg = smap_read_sock_strparser; > + cb.abort_parser = NULL; > + cb.parse_msg = smap_parse_func_strparser; > + cb.read_sock_done = smap_read_sock_done; > + > + err = strp_init(&psock->strp, sock, &cb); > + if (err) > + return -EINVAL; > + return 0; > +} > + > +static void smap_init_progs(struct smap_psock *psock, struct bpf_stab *stab) > +{ > + /* TBD need prog_put and gets here to avoid programs leaving > + * us or something in attach > + */ > + if (psock->bpf_mux != stab->bpf_mux) > + psock->bpf_mux = stab->bpf_mux; > + > + if (psock->bpf_parse != stab->bpf_parse) > + psock->bpf_parse = stab->bpf_parse; > +} > + > +static int smap_start_sock(struct smap_psock *psock, struct sock *sk) > +{ > + int err = 0; > + > + write_lock_bh(&sk->sk_callback_lock); > + /* only start socket if it is not already running */ > + if (psock->save_data_ready) { > + err = -EINVAL; > + goto out; > + } > + psock->save_data_ready = sk->sk_data_ready; > + psock->save_write_space = sk->sk_write_space; > + psock->save_state_change = sk->sk_state_change; > + sk->sk_data_ready = smap_data_ready; > + sk->sk_write_space = smap_write_space; > + sk->sk_state_change = smap_state_change; > +out: > + write_unlock_bh(&sk->sk_callback_lock); > + return err; > +} > + > +static struct smap_psock *smap_init_psock(struct sock *sock, > + struct bpf_stab *stab) > +{ > + struct smap_psock *psock; > + > + psock = kmalloc(sizeof(struct smap_psock), GFP_ATOMIC); > + if (!psock) > + return ERR_PTR(-ENOMEM); > + > + memset(psock, 0, sizeof(struct smap_psock)); > + smap_init_progs(psock, stab); > + psock->sock = sock; > + > + skb_queue_head_init(&psock->rxqueue); > + 
INIT_WORK(&psock->tx_work, smap_tx_work); > + > + write_lock_bh(&sock->sk_callback_lock); > + sock->sk_user_data = psock; > + write_unlock_bh(&sock->sk_callback_lock); > + > + sock_hold(sock); > + return psock; > +} > + > +#define SOCK_MAP_STRPARSER 0x01 > +/* BPF map logic */ > +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) > +{ > + struct bpf_stab *stab; > + int err = -EINVAL; > + u64 cost; > + > + /* check sanity of attributes */ > + if (attr->max_entries == 0 || attr->key_size != 4 || > + attr->value_size != 4 || attr->map_flags) > + return ERR_PTR(-EINVAL); > + > + /* if value_size is bigger, the user space won't be able to > + * access the elements. > + */ > + if (attr->value_size > KMALLOC_MAX_SIZE) > + return ERR_PTR(-E2BIG); > + > + stab = kzalloc(sizeof(*stab), GFP_USER); > + if (!stab) > + return ERR_PTR(-ENOMEM); > + > + /* mandatory map attributes */ > + stab->map.map_type = attr->map_type; > + stab->map.key_size = attr->key_size; > + stab->map.value_size = attr->value_size; > + stab->map.max_entries = attr->max_entries; > + stab->map.map_flags = attr->map_flags; > + > + > + /* make sure page count doesn't overflow */ > + cost = (u64) stab->map.max_entries * sizeof(struct sock *) + > + sizeof(struct socket *); > + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; > + > + err = -ENOMEM; > + > + /* if map size is larger than memlock limit, reject it early */ > + err = bpf_map_precharge_memlock(stab->map.pages); > + if (err) > + goto free_stab; > + > + stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * > + sizeof(struct sock *)); > + if (!stab->sock_map) > + goto free_stab; > + > + return &stab->map; > + /* TBD release progs on errors */ > +free_stab: > + kfree(stab); > + return ERR_PTR(err); > +} > + > +static void sock_map_free(struct bpf_map *map) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + int i; > + > + synchronize_rcu(); > + > + for (i = 0; i < stab->map.max_entries; i++) { > + struct 
sock *sock; > + > + sock = stab->sock_map[i]; > + if (!sock) > + continue; > + > + smap_release_proxy(sock); > + } > + > + bpf_map_area_free(stab->sock_map); > + if (stab->bpf_mux) > + bpf_prog_put(stab->bpf_mux); > + if (stab->bpf_parse) > + bpf_prog_put(stab->bpf_mux); > + kfree(stab); > +} > + > +static int sock_map_get_next_key(struct bpf_map *map, void *key, void > *next_key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + u32 i = key ? *(u32 *)key : U32_MAX; > + u32 *next = (u32 *)next_key; > + > + if (i >= stab->map.max_entries) { > + *next = 0; > + return 0; > + } > + > + if (i == stab->map.max_entries - 1) > + return -ENOENT; > + > + *next = i + 1; > + return 0; > +} > + > +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + > + if (key >= map->max_entries) > + return NULL; > + > + return stab->sock_map[key]; > +} > + > +static void *sock_map_lookup_elem(struct bpf_map *map, void *key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + struct sock *sock; > + u32 i = *(u32 *)key; > + > + if (i >= map->max_entries) > + return NULL; > + > + sock = stab->sock_map[i]; > + return NULL; > +} > + > +static int sock_map_delete_elem(struct bpf_map *map, void *key) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + struct sock *sock; > + int k = *(u32 *)key; > + > + if (k >= map->max_entries) > + return -EINVAL; > + > + sock = stab->sock_map[k]; > + if (!sock) > + return -EINVAL; > + > + smap_release_proxy(sock); > + return 0; > +} > + > +static int sock_map_update_elem(struct bpf_sock_ops_kern *skops, > + struct bpf_map *map, > + void *key, u64 flags, u64 map_flags) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + struct sock *old_sock, *sock; > + struct smap_psock *psock = NULL; > + u32 i = *(u32 *)key; > + bool update = false; > + > + if (unlikely(flags > 
BPF_EXIST)) > + return -EINVAL; > + > + if (unlikely(i >= stab->map.max_entries)) > + return -E2BIG; > + > + if (unlikely(map_flags > SOCK_MAP_STRPARSER)) > + return -EINVAL; > + > + if (flags == BPF_EXIST || flags == BPF_ANY) { > + sock = rcu_dereference(stab->sock_map[i]); > + > + if (!sock && flags == BPF_EXIST) { > + return -ENOENT; > + } else if (sock && sock != skops->sk) { > + return -EINVAL; > + } else if (sock) { > + psock = smap_psock_sk(sock); > + update = true; > + } > + } > + > + if (!psock) { > + sock = skops->sk; > + psock = smap_init_psock(sock, stab); > + if (IS_ERR(psock)) > + return PTR_ERR(psock); > + psock->key = i; > + psock->map = map; > + } > + > + if (map_flags & SOCK_MAP_STRPARSER) { > + smap_start_sock(psock, sock); > + smap_init_progs(psock, stab); > + smap_init_sock(psock, sock); > + psock->strp_enabled = true; > + } else if (update) { > + smap_stop_sock(psock, false); > + } > + > + if (!update) { > + old_sock = xchg(&stab->sock_map[i], skops->sk); > + if (old_sock) > + smap_release_proxy(old_sock); > + } > + > + return 0; > +} > + > +static int sock_map_attach_prog(struct bpf_map *map, > + struct bpf_prog *parse, struct bpf_prog *mux) > +{ > + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); > + > + stab->bpf_parse = parse; > + stab->bpf_mux = mux; > + return 0; > +} > + > +const struct bpf_map_ops sock_map_ops = { > + .map_alloc = sock_map_alloc, > + .map_free = sock_map_free, > + .map_get_next_key = sock_map_get_next_key, > + .map_lookup_elem = sock_map_lookup_elem, > + .map_ctx_update_elem = sock_map_update_elem, > + .map_delete_elem = sock_map_delete_elem, > + .map_attach = sock_map_attach_prog, > +}; > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 6c772ad..e4f48f5 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -1045,7 +1045,40 @@ static int bpf_obj_get(const union bpf_attr *attr) > > #ifdef CONFIG_CGROUP_BPF > > -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags > +#define 
BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 > + > +static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) > +{ > + struct bpf_prog *prog1, *prog2; > + struct bpf_map *map; > + int err; > + > + map = bpf_map_get_with_uref(attr->target_fd); > + if (IS_ERR(map)) > + return PTR_ERR(map); > + > + if (!map->ops->map_attach) > + return -EOPNOTSUPP; > + > + prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); > + if (IS_ERR(prog1)) > + return PTR_ERR(prog1); > + > + prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); > + if (IS_ERR(prog2)) { > + bpf_prog_put(prog1); > + return PTR_ERR(prog2); > + } > + > + err = map->ops->map_attach(map, prog1, prog2); > + if (err) { > + bpf_prog_put(prog1); > + bpf_prog_put(prog2); > + return PTR_ERR(map); > + } > + > + return err; > +} > > static int bpf_prog_attach(const union bpf_attr *attr) > { > @@ -1074,10 +1107,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) > case BPF_CGROUP_SOCK_OPS: > ptype = BPF_PROG_TYPE_SOCK_OPS; > break; > + case BPF_SOCKMAP_INGRESS: > + ptype = BPF_PROG_TYPE_SOCKET_FILTER; > + break; > default: > return -EINVAL; > } > > + if (attr->attach_type == BPF_SOCKMAP_INGRESS) > + return sockmap_get_from_fd(attr, ptype); > + > prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); > if (IS_ERR(prog)) > return PTR_ERR(prog); > diff --git a/net/core/filter.c b/net/core/filter.c > index 7e97086..2644f2d 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -1845,6 +1845,51 @@ int skb_do_redirect(struct sk_buff *skb) > .arg3_type = ARG_ANYTHING, > }; > > +BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags) > +{ > + struct redirect_info *ri = this_cpu_ptr(&redirect_info); > + > + ri->ifindex = key; > + ri->flags = flags; > + ri->map = map; > + > + return SK_REDIRECT; > +} > + > +inline struct sock *do_sk_redirect_map(void) > +{ > + struct redirect_info *ri = this_cpu_ptr(&redirect_info); > + struct sock *sk = NULL; > + > + if (ri->map) { > + sk = 
__sock_map_lookup_elem(ri->map, ri->ifindex); > + > + ri->ifindex = 0; > + ri->map = NULL; > + /* we do not clear flags for future lookup */ > + } > + > + return sk; > +} > +EXPORT_SYMBOL(do_sk_redirect_map); > + > +inline u64 get_sk_redirect_flags(void) > +{ > + struct redirect_info *ri = this_cpu_ptr(&redirect_info); > + > + return ri->flags; > +} > +EXPORT_SYMBOL(get_sk_redirect_flags); > + > +static const struct bpf_func_proto bpf_sk_redirect_map_proto = { > + .func = bpf_sk_redirect_map, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_CONST_MAP_PTR, > + .arg2_type = ARG_ANYTHING, > + .arg3_type = ARG_ANYTHING, > +}; > + > BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) > { > return task_get_classid(skb); > @@ -3090,6 +3135,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, > const void *src_buff, > return &bpf_get_socket_cookie_proto; > case BPF_FUNC_get_socket_uid: > return &bpf_get_socket_uid_proto; > + case BPF_FUNC_sk_redirect_map: > + return &bpf_sk_redirect_map_proto; > + case BPF_FUNC_map_ctx_update_elem: > + return &bpf_map_ctx_update_elem_proto; > default: > return bpf_base_func_proto(func_id); > } > @@ -3214,6 +3263,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const > void *src_buff, > switch (func_id) { > case BPF_FUNC_setsockopt: > return &bpf_setsockopt_proto; > + case BPF_FUNC_map_ctx_update_elem: > + return &bpf_map_ctx_update_elem_proto; > default: > return bpf_base_func_proto(func_id); > } >