On 6/22/17, 3:41 PM, "Daniel Borkmann" <dan...@iogearbox.net> wrote:
On 06/20/2017 05:00 AM, Lawrence Brakmo wrote: [...] > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index f94b48b..861dbe9 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -120,12 +120,14 @@ enum bpf_prog_type { > BPF_PROG_TYPE_LWT_IN, > BPF_PROG_TYPE_LWT_OUT, > BPF_PROG_TYPE_LWT_XMIT, > + BPF_PROG_TYPE_SOCK_OPS, > }; > > enum bpf_attach_type { > BPF_CGROUP_INET_INGRESS, > BPF_CGROUP_INET_EGRESS, > BPF_CGROUP_INET_SOCK_CREATE, > + BPF_GLOBAL_SOCK_OPS, > __MAX_BPF_ATTACH_TYPE > }; [...] > #endif /* _UAPI__LINUX_BPF_H__ */ > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 8942c82..e02831f 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c [...] > +static int bpf_prog_attach(const union bpf_attr *attr) > +{ > + if (!capable(CAP_NET_ADMIN)) > + return -EPERM; > + > + if (CHECK_ATTR(BPF_PROG_ATTACH)) > + return -EINVAL; > + > + if (attr->attach_type == BPF_GLOBAL_SOCK_OPS) > + return bpf_sock_ops_attach_global_prog(attr->attach_bpf_fd); > + else > + return bpf_prog_attach_cgroup(attr); > +} > + [...] > +static int bpf_prog_detach(const union bpf_attr *attr) > +{ > + if (!capable(CAP_NET_ADMIN)) > + return -EPERM; > + > + if (CHECK_ATTR(BPF_PROG_DETACH)) > + return -EINVAL; > + > + if (attr->attach_type == BPF_GLOBAL_SOCK_OPS) > + return bpf_sock_ops_detach_global_prog(); > + else > + return bpf_prog_detach_cgroup(attr); > +} > > #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration > > @@ -1431,14 +1467,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > case BPF_OBJ_GET: > err = bpf_obj_get(&attr); > break; > -#ifdef CONFIG_CGROUP_BPF > case BPF_PROG_ATTACH: > err = bpf_prog_attach(&attr); > break; > case BPF_PROG_DETACH: > err = bpf_prog_detach(&attr); > break; > -#endif > case BPF_PROG_TEST_RUN: > err = bpf_prog_test_run(&attr, uattr); > break; [...] > diff --git a/net/core/sock_bpfops.c b/net/core/sock_bpfops.c > new file mode 100644 > index 0000000..06f4a64 > --- /dev/null > +++ b/net/core/sock_bpfops.c > @@ -0,0 +1,65 @@ > +/* > + * BPF support for sockets > + * > + * Copyright (c) 2016 Lawrence Brakmo <bra...@fb.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 > + * as published by the Free Software Foundation. > + */ > + > +#include <net/sock.h> > +#include <linux/skbuff.h> > +#include <linux/bpf.h> > +#include <linux/filter.h> > +#include <linux/errno.h> > +#ifdef CONFIG_NET_NS > +#include <net/net_namespace.h> > +#include <linux/proc_ns.h> > +#endif > + > +/* Global BPF program for sockets */ > +static struct bpf_prog *bpf_global_sock_ops_prog; > + > +int bpf_sock_ops_detach_global_prog(void) > +{ > + struct bpf_prog *old_prog; > + > + old_prog = xchg(&bpf_global_sock_ops_prog, NULL); > + > + if (old_prog) > + bpf_prog_put(old_prog); > + > + return 0; > +} > + > +int bpf_sock_ops_attach_global_prog(int fd) > +{ > + struct bpf_prog *prog, *old_prog; > + int err = 0; > + > + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCK_OPS); > + if (IS_ERR(prog)) > + return PTR_ERR(prog); > + > + old_prog = xchg(&bpf_global_sock_ops_prog, prog); > + if (old_prog) > + bpf_prog_put(old_prog); > + return err; > +} > + > +int bpf_sock_ops_call(struct bpf_sock_ops_kern *bpf_sock) > +{ > + struct bpf_prog *prog; > + int ret; > + > + rcu_read_lock(); > + prog = READ_ONCE(bpf_global_sock_ops_prog); > + if (prog) > + ret = BPF_PROG_RUN(prog, bpf_sock); > + else > + ret = -1; > + rcu_read_unlock(); > + > + return ret; > +} Now that we integrate with BPF_PROG_ATTACH/DETACH, can you make all the above also per cgroup as we have with all other BPF_CGROUP_INET_* progs? It seems kind of weird that we have one single global program doing the enforcement of TCP and congctl options. Something on a more fine-grained level like cgroups would be more suited wrt containers, etc. Right now there's no notion of global program of such kind. Thanks, Daniel Daniel, I see value for having a global program, so I would like to keep that. When this patchset is accepted, I will submit one that adds support for per cgroup sock_ops programs, with the option to use the global one if none is specified for a cgroup. We could also have the option of the cgroup sock_ops program choosing if the global program should run for a particular op based on its return value. We can iron it out the details when that patch is submitted. Is it acceptable? Thanks, Lawrence