[PATCH bpf-next v4 2/4] bpf: sockmap, add hash map support

John Fastabend Thu, 03 May 2018 11:29:20 -0700

Sockmap is currently backed by an array and requires keys to be
four bytes. This works well for many use cases and was originally
modeled after devmap, which also uses four-byte keys. However,
this has become limiting in larger use cases where a hash would
be more appropriate. For example, users may want to use the 5-tuple
of the socket as the lookup key.


To support this, add a hash-backed map type, BPF_MAP_TYPE_SOCKHASH.

Signed-off-by: John Fastabend <[email protected]>
---
 include/linux/bpf.h       |   8 +
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |  53 ++++-
 kernel/bpf/core.c         |   1 +
 kernel/bpf/sockmap.c      | 494 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c     |  14 +-
 net/core/filter.c         |  58 ++++++
 7 files changed, 611 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc6..add768a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -661,6 +661,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map 
*map)
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && 
defined(CONFIG_INET)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
 #else
 static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 
key)
@@ -668,6 +669,12 @@ static inline struct sock  *__sock_map_lookup_elem(struct 
bpf_map *map, u32 key)
        return NULL;
 }
 
+/* Stub for the #else branch of the configuration check above: without
+ * sockmap support there is no sockhash to search, so every lookup misses.
+ */
+static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
+                                                   void *key)
+{
+       return NULL;
+}
+
 static inline int sock_map_prog(struct bpf_map *map,
                                struct bpf_prog *prog,
                                u32 type)
@@ -693,6 +700,7 @@ static inline int sock_map_prog(struct bpf_map *map,
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
+extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2b28fcf..3101118 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,6 +47,7 @@
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a93..c2613c5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
        BPF_MAP_TYPE_DEVMAP,
        BPF_MAP_TYPE_SOCKMAP,
        BPF_MAP_TYPE_CPUMAP,
+       BPF_MAP_TYPE_SOCKHASH,
 };
 
 enum bpf_prog_type {
@@ -1767,6 +1768,53 @@ struct bpf_stack_build_id {
  *             **CONFIG_XFRM** configuration option.
  *     Return
  *             0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map 
*map, void *key, u64 flags)
+ *     Description
+ *             Add an entry to, or update a sockhash *map* referencing sockets.
+ *             The *skops* is used as a new value for the entry associated to
+ *             *key*. *flags* is one of:
+ *
+ *             **BPF_NOEXIST**
+ *                     The entry for *key* must not exist in the map.
+ *             **BPF_EXIST**
+ *                     The entry for *key* must already exist in the map.
+ *             **BPF_ANY**
+ *                     No condition on the existence of the entry for *key*.
+ *
+ *             If the *map* has eBPF programs (parser and verdict), those will
+ *             be inherited by the socket being added. If the socket is
+ *             already attached to eBPF programs, this results in an error.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, 
void *key, u64 flags)
+ *     Description
+ *             This helper is used in programs implementing policies at the
+ *             socket level. If the message *msg* is allowed to pass (i.e. if
+ *             the verdict eBPF program returns **SK_PASS**), redirect it to
+ *             the socket referenced by *map* (of type
+ *             **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *             egress interfaces can be used for redirection. The
+ *             **BPF_F_INGRESS** value in *flags* is used to make the
+ *             distinction (ingress path is selected if the flag is present,
+ *             egress path otherwise). This is the only flag supported for now.
+ *     Return
+ *             **SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void 
*key, u64 flags)
+ *     Description
+ *             This helper is used in programs implementing policies at the
+ *             skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *             if the verdict eBPF program returns **SK_PASS**), redirect it
+ *             to the socket referenced by *map* (of type
+ *             **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *             egress interfaces can be used for redirection. The
+ *             **BPF_F_INGRESS** value in *flags* is used to make the
+ *             distinction (ingress path is selected if the flag is present,
+ *             egress otherwise). This is the only flag supported for now.
+ *     Return
+ *             **SK_PASS** on success, or **SK_DROP** on error.
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -1835,7 +1883,10 @@ struct bpf_stack_build_id {
        FN(msg_pull_data),              \
        FN(bind),                       \
        FN(xdp_adjust_tail),            \
-       FN(skb_get_xfrm_state),
+       FN(skb_get_xfrm_state),         \
+       FN(sock_hash_update),           \
+       FN(msg_redirect_hash),          \
+       FN(sk_redirect_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec3..5917cc1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1782,6 +1782,7 @@ void bpf_user_rnd_init_once(void)
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 4eef5b1..306eb5d 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -60,6 +60,28 @@ struct bpf_stab {
        struct bpf_sock_progs progs;
 };
 
+/* One hash bucket: a list of htab_elem entries plus the lock that the
+ * update, delete and lookup paths take around accesses to that list.
+ */
+struct bucket {
+       struct hlist_head head;
+       raw_spinlock_t lock;
+};
+
+/* Hash-table backed sock map (BPF_MAP_TYPE_SOCKHASH). */
+struct bpf_htab {
+       /* generic map header; container_of() recovers the bpf_htab from it */
+       struct bpf_map map;
+       /* array of n_buckets buckets */
+       struct bucket *buckets;
+       /* elements currently accounted against map.max_entries */
+       atomic_t count;
+       /* bucket count; rounded up to a power of two at allocation time */
+       u32 n_buckets;
+       /* bytes allocated per element: struct htab_elem plus the padded key */
+       u32 elem_size;
+       /* programs attached to this map through sock_map_prog() */
+       struct bpf_sock_progs progs;
+};
+
+/* One sockhash entry: a socket stored under a variable-length key. */
+struct htab_elem {
+       /* used by kfree_rcu() when the element is released */
+       struct rcu_head rcu;
+       /* linkage in the owning bucket's list */
+       struct hlist_node hash_node;
+       /* hash of the key, cached so lookups can skip non-matching entries */
+       u32 hash;
+       /* socket this entry refers to */
+       struct sock *sk;
+       /* key bytes, stored directly after the fixed part of the struct;
+        * declared as a C99 flexible array member rather than the GNU
+        * zero-length array extension (sizeof is unchanged).
+        */
+       char key[];
+};
+
 enum smap_psock_state {
        SMAP_TX_RUNNING,
 };
@@ -67,6 +89,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
        struct list_head list;
        struct sock **entry;
+       struct htab_elem *hash_link;
+       struct bpf_htab *htab;
 };
 
 struct smap_psock {
@@ -195,6 +219,12 @@ static void bpf_tcp_release(struct sock *sk)
        rcu_read_unlock();
 }
 
+/* Drop one element from the table's accounting and release its memory.
+ *
+ * The element count is decremented immediately; the memory itself is handed
+ * to kfree_rcu() instead of being freed in place.  The callers in this file
+ * unlink the element from its bucket list before calling this.
+ */
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+       atomic_dec(&htab->count);
+       kfree_rcu(l, rcu);
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
        void (*close_fun)(struct sock *sk, long timeout);
@@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
        }
 
        list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-               osk = cmpxchg(e->entry, sk, NULL);
-               if (osk == sk) {
-                       list_del(&e->list);
-                       smap_release_sock(psock, sk);
+               if (e->entry) {
+                       osk = cmpxchg(e->entry, sk, NULL);
+                       if (osk == sk) {
+                               list_del(&e->list);
+                               smap_release_sock(psock, sk);
+                       }
+               } else {
+                       hlist_del_rcu(&e->hash_link->hash_node);
+                       smap_release_sock(psock, e->hash_link->sk);
+                       free_htab_elem(e->htab, e->hash_link);
                }
        }
        write_unlock_bh(&sk->sk_callback_lock);
@@ -1526,12 +1562,14 @@ static struct bpf_map *sock_map_alloc(union bpf_attr 
*attr)
        return ERR_PTR(err);
 }
 
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+                            struct sock **entry,
+                            struct htab_elem *hash_link)
 {
        struct smap_psock_map_entry *e, *tmp;
 
        list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-               if (e->entry == entry) {
+               if (e->entry == entry || e->hash_link == hash_link) {
                        list_del(&e->list);
                        break;
                }
@@ -1569,7 +1607,7 @@ static void sock_map_free(struct bpf_map *map)
                 * to be null and queued for garbage collection.
                 */
                if (likely(psock)) {
-                       smap_list_remove(psock, &stab->sock_map[i]);
+                       smap_list_remove(psock, &stab->sock_map[i], NULL);
                        smap_release_sock(psock, sock);
                }
                write_unlock_bh(&sock->sk_callback_lock);
@@ -1628,7 +1666,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void 
*key)
 
        if (psock->bpf_parse)
                smap_stop_sock(psock, sock);
-       smap_list_remove(psock, &stab->sock_map[k]);
+       smap_list_remove(psock, &stab->sock_map[k], NULL);
        smap_release_sock(psock, sock);
 out:
        write_unlock_bh(&sock->sk_callback_lock);
@@ -1745,10 +1783,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map 
*map,
                new = true;
        }
 
-       e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-       if (!e) {
-               err = -ENOMEM;
-               goto out_progs;
+       if (map_link) {
+               e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+               if (!e) {
+                       err = -ENOMEM;
+                       goto out_progs;
+               }
        }
 
        /* 3. At this point we have a reference to a valid psock that is
@@ -1782,6 +1822,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
        write_unlock_bh(&sock->sk_callback_lock);
        return err;
 out_free:
+       kfree(e);
        smap_release_sock(psock, sock);
 out_progs:
        if (verdict)
@@ -1828,7 +1869,7 @@ static int sock_map_ctx_update_elem(struct 
bpf_sock_ops_kern *skops,
                struct smap_psock *opsock = smap_psock_sk(osock);
 
                write_lock_bh(&osock->sk_callback_lock);
-               smap_list_remove(opsock, &stab->sock_map[i]);
+               smap_list_remove(opsock, &stab->sock_map[i], NULL);
                smap_release_sock(opsock, osock);
                write_unlock_bh(&osock->sk_callback_lock);
        }
@@ -1845,6 +1886,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog 
*prog, u32 type)
                struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 
                progs = &stab->progs;
+       } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+               struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+               progs = &htab->progs;
        } else {
                return -EINVAL;
        }
@@ -1905,11 +1950,19 @@ static int sock_map_update_elem(struct bpf_map *map,
 
 static void sock_map_release(struct bpf_map *map)
 {
-       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
        struct bpf_sock_progs *progs;
        struct bpf_prog *orig;
 
-       progs = &stab->progs;
+       if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+               struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+               progs = &stab->progs;
+       } else {
+               struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+               progs = &htab->progs;
+       }
+
        orig = xchg(&progs->bpf_parse, NULL);
        if (orig)
                bpf_prog_put(orig);
@@ -1922,6 +1975,390 @@ static void sock_map_release(struct bpf_map *map)
                bpf_prog_put(orig);
 }
 
+/* Allocate and initialise a sockhash map from user-supplied attributes.
+ *
+ * Requires CAP_NET_ADMIN.  The map must have at least one entry, a non-zero
+ * key size and a 4 byte value; unknown map_flags are rejected.  The bucket
+ * array is sized to max_entries rounded up to a power of two so that a hash
+ * can be reduced to a bucket index with a mask.
+ *
+ * Returns the embedded struct bpf_map on success or an ERR_PTR() on failure.
+ */
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+       struct bpf_htab *htab;
+       u64 cost;
+       int err;
+       u32 i;
+
+       if (!capable(CAP_NET_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       /* check sanity of attributes */
+       if (attr->max_entries == 0 || attr->key_size == 0 ||
+           attr->value_size != 4 ||
+           attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+               return ERR_PTR(-EINVAL);
+
+       err = bpf_tcp_ulp_register();
+       if (err && err != -EEXIST)
+               return ERR_PTR(err);
+
+       htab = kzalloc(sizeof(*htab), GFP_USER);
+       if (!htab)
+               return ERR_PTR(-ENOMEM);
+
+       bpf_map_init_from_attr(&htab->map, attr);
+
+       htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+       htab->elem_size = sizeof(struct htab_elem) +
+                         round_up(htab->map.key_size, 8);
+
+       /* err still holds the ULP registration result here (0 or -EEXIST).
+        * Set a real error code before the size checks so that a failure is
+        * never reported as ERR_PTR(0), which is NULL and passes IS_ERR().
+        */
+       err = -E2BIG;
+
+       /* reject a wrapped bucket count and a bucket array whose byte size
+        * would not fit in 32 bits
+        */
+       if (htab->n_buckets == 0 ||
+           htab->n_buckets > U32_MAX / sizeof(struct bucket))
+               goto free_htab;
+
+       cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+              (u64) htab->elem_size * htab->map.max_entries;
+
+       /* make sure the page count computed below cannot overflow */
+       if (cost >= U32_MAX - PAGE_SIZE)
+               goto free_htab;
+
+       htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+       err = bpf_map_precharge_memlock(htab->map.pages);
+       if (err)
+               goto free_htab;
+
+       err = -ENOMEM;
+       htab->buckets = bpf_map_area_alloc(
+                               htab->n_buckets * sizeof(struct bucket),
+                               htab->map.numa_node);
+       if (!htab->buckets)
+               goto free_htab;
+
+       for (i = 0; i < htab->n_buckets; i++) {
+               INIT_HLIST_HEAD(&htab->buckets[i].head);
+               raw_spin_lock_init(&htab->buckets[i].lock);
+       }
+
+       return &htab->map;
+free_htab:
+       kfree(htab);
+       return ERR_PTR(err);
+}
+
+/* Reduce @hash to a bucket by masking it with n_buckets - 1. */
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+       u32 idx = hash & (htab->n_buckets - 1);
+
+       return htab->buckets + idx;
+}
+
+/* Like __select_bucket(), but hand back only the bucket's list head. */
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+       struct bucket *b = __select_bucket(htab, hash);
+
+       return &b->head;
+}
+
+/* Tear down a sockhash map.
+ *
+ * Walks every bucket, unlinks each element, detaches the element's socket
+ * from this map (when the socket still has a psock) and frees the element;
+ * finally frees the bucket array and the map structure itself.
+ */
+static void sock_hash_free(struct bpf_map *map)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       int i;
+
+       synchronize_rcu();
+
+       /* At this point no update, lookup or delete operations can happen.
+        * However, be aware we can still get socket state event updates
+        * and data ready callbacks that reference the psock from sk_user_data.
+        * Also, psock worker threads are still in flight. So smap_release_sock
+        * will only free the psock after cancel_sync on the worker threads
+        * and after a grace period expires, to ensure the psock is really
+        * safe to remove.
+        */
+       rcu_read_lock();
+       for (i = 0; i < htab->n_buckets; i++) {
+               struct hlist_head *head = select_bucket(htab, i);
+               struct hlist_node *n;
+               struct htab_elem *l;
+
+               hlist_for_each_entry_safe(l, n, head, hash_node) {
+                       struct sock *sock = l->sk;
+                       struct smap_psock *psock;
+
+                       hlist_del_rcu(&l->hash_node);
+                       write_lock_bh(&sock->sk_callback_lock);
+                       psock = smap_psock_sk(sock);
+                       /* This check handles a racing sock event that can
+                        * take the sk_callback_lock before we do, causing the
+                        * refcnt to hit zero and the sock user data (psock)
+                        * to be null and queued for garbage collection.
+                        */
+                       if (likely(psock)) {
+                               smap_list_remove(psock, NULL, l);
+                               smap_release_sock(psock, sock);
+                       }
+                       write_unlock_bh(&sock->sk_callback_lock);
+                       /* freed directly here, not through free_htab_elem() */
+                       kfree(l);
+               }
+       }
+       rcu_read_unlock();
+       bpf_map_area_free(htab->buckets);
+       kfree(htab);
+}
+
+/* Allocate a hash element for @sk under @key and account for it.
+ *
+ * The element count is bumped first.  Exceeding map.max_entries is only
+ * tolerated when @old_elem is set, i.e. when the caller is about to replace
+ * an existing element and will release it afterwards.
+ *
+ * Returns the new, not yet linked element, ERR_PTR(-E2BIG) when the map is
+ * full, or ERR_PTR(-ENOMEM) when the allocation fails.  On every error the
+ * element count is left as it was on entry.
+ */
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+                                             void *key, u32 key_size, u32 hash,
+                                             struct sock *sk,
+                                             struct htab_elem *old_elem)
+{
+       struct htab_elem *l_new;
+
+       if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+               if (!old_elem) {
+                       atomic_dec(&htab->count);
+                       return ERR_PTR(-E2BIG);
+               }
+       }
+       l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+                            htab->map.numa_node);
+       if (!l_new) {
+               /* give back the slot reserved above, otherwise every failed
+                * allocation permanently shrinks the usable map size
+                */
+               atomic_dec(&htab->count);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       memcpy(l_new->key, key, key_size);
+       l_new->sk = sk;
+       l_new->hash = hash;
+       return l_new;
+}
+
+/* Scan one bucket list for the element whose cached hash and key bytes both
+ * match.  Returns the element, or NULL when the bucket holds no match.
+ */
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+                                        u32 hash, void *key, u32 key_size)
+{
+       struct htab_elem *elem;
+
+       hlist_for_each_entry_rcu(elem, head, hash_node) {
+               /* cheap hash comparison first, key bytes only on a hit */
+               if (elem->hash != hash)
+                       continue;
+               if (memcmp(&elem->key, key, key_size) == 0)
+                       return elem;
+       }
+
+       return NULL;
+}
+
+/* Hash @key_len bytes of @key with jhash, using a fixed initial value of 0. */
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+       const u32 initval = 0;
+
+       return jhash(key, key_len, initval);
+}
+
+/* Map iteration: copy the key that follows @key into @next_key.
+ *
+ * When @key is NULL or is not present in the table, iteration starts from
+ * the first bucket.  Otherwise the next element in the same bucket is
+ * returned if there is one, else the first element of the next non-empty
+ * bucket.
+ *
+ * Returns 0 with @next_key filled in, or -ENOENT once no element follows.
+ * Warns once if called without the RCU read lock held.
+ */
+static int sock_hash_get_next_key(struct bpf_map *map,
+                                 void *key, void *next_key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct htab_elem *l, *next_l;
+       struct hlist_head *h;
+       u32 hash, key_size;
+       int i = 0;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       key_size = map->key_size;
+       if (!key)
+               goto find_first_elem;
+       hash = htab_map_hash(key, key_size);
+       h = select_bucket(htab, hash);
+
+       l = lookup_elem_raw(h, hash, key, key_size);
+       /* unknown key: restart from bucket 0 (i is still 0) */
+       if (!l)
+               goto find_first_elem;
+       /* try the successor of @key within the same bucket first */
+       next_l = hlist_entry_safe(
+                    rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+                    struct htab_elem, hash_node);
+       if (next_l) {
+               memcpy(next_key, next_l->key, key_size);
+               return 0;
+       }
+
+       /* no more elements in this hash list, go to the next bucket */
+       i = hash & (htab->n_buckets - 1);
+       i++;
+
+find_first_elem:
+       /* iterate over buckets */
+       for (; i < htab->n_buckets; i++) {
+               h = select_bucket(htab, i);
+
+               /* pick first element in the bucket */
+               next_l = hlist_entry_safe(
+                               rcu_dereference_raw(hlist_first_rcu(h)),
+                               struct htab_elem, hash_node);
+               if (next_l) {
+                       /* if it's not empty, just return it */
+                       memcpy(next_key, next_l->key, key_size);
+                       return 0;
+               }
+       }
+
+       /* iterated over all buckets and all elements */
+       return -ENOENT;
+}
+
+/* Insert or replace the sockhash entry for @key with the socket in @skops.
+ *
+ * Only TCP stream sockets are accepted.  @map_flags selects the usual
+ * BPF_ANY / BPF_NOEXIST / BPF_EXIST semantics.  The socket is first attached
+ * to the map's programs through __sock_map_ctx_update_elem(); the new hash
+ * element is then linked under the bucket lock and any element previously
+ * stored under the same key is unlinked and released.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EOPNOTSUPP  socket is not a TCP stream socket
+ *   -EINVAL      unknown @map_flags, or the socket lost its psock
+ *   -ENOMEM      allocation failure
+ *   -EEXIST      BPF_NOEXIST and the key is present
+ *   -ENOENT      BPF_EXIST and the key is absent
+ *   -E2BIG       the map is full
+ * or whatever error __sock_map_ctx_update_elem() reports.
+ */
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+                                    struct bpf_map *map,
+                                    void *key, u64 map_flags)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct bpf_sock_progs *progs = &htab->progs;
+       struct htab_elem *l_new = NULL, *l_old;
+       struct smap_psock_map_entry *e = NULL;
+       struct hlist_head *head;
+       struct smap_psock *psock;
+       u32 key_size, hash;
+       struct sock *sock;
+       struct bucket *b;
+       int err;
+
+       sock = skops->sk;
+
+       if (sock->sk_type != SOCK_STREAM ||
+           sock->sk_protocol != IPPROTO_TCP)
+               return -EOPNOTSUPP;
+
+       if (unlikely(map_flags > BPF_EXIST))
+               return -EINVAL;
+
+       /* psock->maps list entry that will point back at the new element */
+       e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+       if (!e)
+               return -ENOMEM;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       key_size = map->key_size;
+       hash = htab_map_hash(key, key_size);
+       b = __select_bucket(htab, hash);
+       head = &b->head;
+
+       /* NOTE(review): on failure the callee's visible error labels
+        * (out_free/out_progs) already undo what it acquired, so nothing is
+        * released again here -- confirm against the full function.
+        */
+       err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+       if (err)
+               goto err;
+
+       /* bpf_map_update_elem() can be called in_irq() */
+       raw_spin_lock_bh(&b->lock);
+       l_old = lookup_elem_raw(head, hash, key, key_size);
+       if (l_old && map_flags == BPF_NOEXIST) {
+               err = -EEXIST;
+               goto bucket_err;
+       }
+       if (!l_old && map_flags == BPF_EXIST) {
+               err = -ENOENT;
+               goto bucket_err;
+       }
+
+       l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+       if (IS_ERR(l_new)) {
+               err = PTR_ERR(l_new);
+               goto bucket_err;
+       }
+
+       psock = smap_psock_sk(sock);
+       if (unlikely(!psock)) {
+               /* l_new was never linked; free it and give back its slot in
+                * the element count instead of leaking both
+                */
+               free_htab_elem(htab, l_new);
+               err = -EINVAL;
+               goto bucket_err;
+       }
+
+       e->hash_link = l_new;
+       e->htab = container_of(map, struct bpf_htab, map);
+       list_add_tail(&e->list, &psock->maps);
+
+       /* add new element to the head of the list, so that
+        * concurrent search will find it before old elem
+        */
+       hlist_add_head_rcu(&l_new->hash_node, head);
+       if (l_old) {
+               psock = smap_psock_sk(l_old->sk);
+
+               hlist_del_rcu(&l_old->hash_node);
+               /* the old socket may already have dropped its psock; the
+                * delete and free paths guard against this the same way
+                */
+               if (likely(psock)) {
+                       smap_list_remove(psock, NULL, l_old);
+                       smap_release_sock(psock, l_old->sk);
+               }
+               free_htab_elem(htab, l_old);
+       }
+       raw_spin_unlock_bh(&b->lock);
+       return 0;
+bucket_err:
+       raw_spin_unlock_bh(&b->lock);
+       /* drop the reference taken by the successful
+        * __sock_map_ctx_update_elem() call above
+        */
+       psock = smap_psock_sk(sock);
+       if (psock)
+               smap_release_sock(psock, sock);
+err:
+       kfree(e);
+       return err;
+}
+
+/* Map update entry point: @value holds a socket file descriptor.
+ *
+ * Resolves the descriptor to a socket, wraps its struct sock in a temporary
+ * bpf_sock_ops_kern and defers to sock_hash_ctx_update_elem().  The file
+ * reference obtained by the lookup is dropped before returning.
+ *
+ * Returns the lookup error if the descriptor is not a socket, -EINVAL if
+ * the socket has no struct sock, otherwise the result of the update.
+ */
+static int sock_hash_update_elem(struct bpf_map *map,
+                               void *key, void *value, u64 flags)
+{
+       struct bpf_sock_ops_kern skops;
+       u32 fd = *(u32 *)value;
+       struct socket *socket;
+       int err;
+
+       socket = sockfd_lookup(fd, &err);
+       if (!socket)
+               return err;
+
+       skops.sk = socket->sk;
+       if (skops.sk)
+               err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+       else
+               err = -EINVAL;
+
+       fput(socket->file);
+       return err;
+}
+
+/* Remove the entry stored under @key.
+ *
+ * Under the bucket lock the element is unlinked, its socket is detached
+ * from this map (when the socket still has a psock) and the element is
+ * released.  Returns 0 on success or -ENOENT when @key is not present.
+ */
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       u32 key_size = map->key_size;
+       u32 hash = htab_map_hash(key, key_size);
+       struct bucket *b = __select_bucket(htab, hash);
+       struct smap_psock *psock;
+       struct htab_elem *l;
+       struct sock *sock;
+       int ret = 0;
+
+       raw_spin_lock_bh(&b->lock);
+       l = lookup_elem_raw(&b->head, hash, key, key_size);
+       if (!l) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+
+       sock = l->sk;
+       hlist_del_rcu(&l->hash_node);
+
+       write_lock_bh(&sock->sk_callback_lock);
+       psock = smap_psock_sk(sock);
+       /* A racing socket event may have taken sk_callback_lock first and
+        * dropped the last reference, leaving the sock user data (psock)
+        * null and queued for garbage collection; skip the detach then.
+        */
+       if (likely(psock)) {
+               smap_list_remove(psock, NULL, l);
+               smap_release_sock(psock, sock);
+       }
+       write_unlock_bh(&sock->sk_callback_lock);
+
+       free_htab_elem(htab, l);
+unlock:
+       raw_spin_unlock_bh(&b->lock);
+       return ret;
+}
+
+/* Look up the socket stored under @key.
+ *
+ * The bucket is searched with its lock held.  Returns the socket pointer
+ * recorded in the matching element, or NULL when the key is not present.
+ */
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       u32 key_size = map->key_size;
+       u32 hash = htab_map_hash(key, key_size);
+       struct bucket *b = __select_bucket(htab, hash);
+       struct sock *sk = NULL;
+       struct htab_elem *l;
+
+       raw_spin_lock_bh(&b->lock);
+       l = lookup_elem_raw(&b->head, hash, key, key_size);
+       if (l)
+               sk = l->sk;
+       raw_spin_unlock_bh(&b->lock);
+
+       return sk;
+}
+
 const struct bpf_map_ops sock_map_ops = {
        .map_alloc = sock_map_alloc,
        .map_free = sock_map_free,
@@ -1932,6 +2369,15 @@ static void sock_map_release(struct bpf_map *map)
        .map_release_uref = sock_map_release,
 };
 
+/* Map operations for BPF_MAP_TYPE_SOCKHASH.
+ *
+ * .map_release_uref is wired to sock_map_release(), which handles sockhash
+ * maps by releasing the programs held in bpf_htab.progs.  Without it the
+ * programs attached through sock_map_prog() would never be put.
+ */
+const struct bpf_map_ops sock_hash_ops = {
+       .map_alloc = sock_hash_alloc,
+       .map_free = sock_hash_free,
+       .map_lookup_elem = sock_map_lookup,
+       .map_get_next_key = sock_hash_get_next_key,
+       .map_update_elem = sock_hash_update_elem,
+       .map_delete_elem = sock_hash_delete_elem,
+       .map_release_uref = sock_map_release,
+};
+
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
           struct bpf_map *, map, void *, key, u64, flags)
 {
@@ -1949,3 +2395,21 @@ static void sock_map_release(struct bpf_map *map)
        .arg3_type      = ARG_PTR_TO_MAP_KEY,
        .arg4_type      = ARG_ANYTHING,
 };
+
+/* BPF helper bpf_sock_hash_update(): add or update the sockhash entry for
+ * @key using the socket carried by the sock_ops context.  Thin wrapper
+ * around sock_hash_ctx_update_elem(); warns once if it is called without
+ * the RCU read lock held.
+ */
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+/* Prototype for bpf_sock_hash_update(): (ctx, map, key pointer, flags),
+ * returning an integer.  Not restricted to GPL programs.
+ */
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+       .func           = bpf_sock_hash_update,
+       .gpl_only       = false,
+       .pkt_access     = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596..cd3966d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2078,6 +2078,13 @@ static int check_map_func_compatibility(struct 
bpf_verifier_env *env,
                    func_id != BPF_FUNC_msg_redirect_map)
                        goto error;
                break;
+       case BPF_MAP_TYPE_SOCKHASH:
+               if (func_id != BPF_FUNC_sk_redirect_hash &&
+                   func_id != BPF_FUNC_sock_hash_update &&
+                   func_id != BPF_FUNC_map_delete_elem &&
+                   func_id != BPF_FUNC_msg_redirect_hash)
+                       goto error;
+               break;
        default:
                break;
        }
@@ -2114,11 +2121,14 @@ static int check_map_func_compatibility(struct 
bpf_verifier_env *env,
                break;
        case BPF_FUNC_sk_redirect_map:
        case BPF_FUNC_msg_redirect_map:
+       case BPF_FUNC_sock_map_update:
                if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
                        goto error;
                break;
-       case BPF_FUNC_sock_map_update:
-               if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+       case BPF_FUNC_sk_redirect_hash:
+       case BPF_FUNC_msg_redirect_hash:
+       case BPF_FUNC_sock_hash_update:
+               if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
                        goto error;
                break;
        default:
diff --git a/net/core/filter.c b/net/core/filter.c
index 5623dc8..4cde871 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1851,6 +1851,33 @@ int skb_do_redirect(struct sk_buff *skb)
        .arg2_type      = ARG_ANYTHING,
 };
 
+/* BPF helper bpf_sk_redirect_hash(): record in the skb's TCP control block
+ * the socket found under @key in a sockhash @map, together with @flags.
+ *
+ * Returns SK_PASS when a socket was found.  Returns SK_DROP when @flags has
+ * bits other than BPF_F_INGRESS set, or when the key is not in the map.
+ */
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       struct tcp_skb_cb *tcb;
+       struct sock *target;
+
+       /* If user passes invalid input drop the packet. */
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       tcb = TCP_SKB_CB(skb);
+       target = __sock_hash_lookup_elem(map, key);
+
+       /* stored even on a miss, in which case sk_redir ends up NULL */
+       tcb->bpf.flags = flags;
+       tcb->bpf.sk_redir = target;
+
+       return target ? SK_PASS : SK_DROP;
+}
+
+/* Prototype for bpf_sk_redirect_hash(): (ctx, map, key pointer, flags),
+ * returning an integer.  Not restricted to GPL programs.
+ */
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+       .func           = bpf_sk_redirect_hash,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
           struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -1885,6 +1912,31 @@ struct sock *do_sk_redirect_map(struct sk_buff *skb)
        .arg4_type      = ARG_ANYTHING,
 };
 
+/* BPF helper bpf_msg_redirect_hash(): record in @msg the socket found under
+ * @key in a sockhash @map, together with @flags.
+ *
+ * Returns SK_PASS when a socket was found.  Returns SK_DROP when @flags has
+ * bits other than BPF_F_INGRESS set, or when the key is not in the map.
+ */
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       struct sock *target;
+
+       /* If user passes invalid input drop the packet. */
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       target = __sock_hash_lookup_elem(map, key);
+
+       /* stored even on a miss, in which case sk_redir ends up NULL */
+       msg->flags = flags;
+       msg->sk_redir = target;
+
+       return target ? SK_PASS : SK_DROP;
+}
+
+/* Prototype for bpf_msg_redirect_hash(): (ctx, map, key pointer, flags),
+ * returning an integer.  Not restricted to GPL programs.
+ */
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+       .func           = bpf_msg_redirect_hash,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
           struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -3987,6 +4039,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const 
void *src_buff,
                return &bpf_sock_ops_cb_flags_set_proto;
        case BPF_FUNC_sock_map_update:
                return &bpf_sock_map_update_proto;
+       case BPF_FUNC_sock_hash_update:
+               return &bpf_sock_hash_update_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
@@ -3998,6 +4052,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const 
void *src_buff,
        switch (func_id) {
        case BPF_FUNC_msg_redirect_map:
                return &bpf_msg_redirect_map_proto;
+       case BPF_FUNC_msg_redirect_hash:
+               return &bpf_msg_redirect_hash_proto;
        case BPF_FUNC_msg_apply_bytes:
                return &bpf_msg_apply_bytes_proto;
        case BPF_FUNC_msg_cork_bytes:
@@ -4029,6 +4085,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const 
void *src_buff,
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_sk_redirect_map:
                return &bpf_sk_redirect_map_proto;
+       case BPF_FUNC_sk_redirect_hash:
+               return &bpf_sk_redirect_hash_proto;
        default:
                return bpf_base_func_proto(func_id);
        }
-- 
1.9.1

[PATCH bpf-next v4 2/4] bpf: sockmap, add hash map support

Reply via email to