Without a per-VE cap, a single container could exhaust the system-wide bpf JIT memory budget by loading excessive numbers of CGROUP_DEVICE programs via the VE_FEATURE_BPF path.
Add bpf_prog_avail_nr / bpf_prog_max_nr counters to ve_struct and enforce them in bpf_prog_load() for non-bpf-capable callers loading CGROUP_DEVICE programs. Lifetime note: A BPF program loaded in a VE takes a reference to the VE. When the container is stopped, all open fds to the BPF program are closed, and when the container manager removes the container's cgroups the BPF program is released, which in turn releases the reference to the VE. Default max number note: It is somewhat similar to ve.netif_max_nr: there each docker container creates two veths, and I also observe that each docker container loads two bpf programs (one by dockerd, one by systemd). So let's use the same number. https://virtuozzo.atlassian.net/browse/VSTOR-131947 Signed-off-by: Pavel Tikhomirov <[email protected]> Feature: ve: allow BPF in Containers --- include/linux/bpf.h | 8 ++++++++ include/linux/ve.h | 4 ++++ kernel/bpf/core.c | 8 ++++++++ kernel/bpf/syscall.c | 35 +++++++++++++++++++++++++++++++++++ kernel/ve/ve.c | 5 +++++ 5 files changed, 60 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80175c7a21c27..0212806d5efc2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,6 +56,7 @@ struct cgroup; struct bpf_token; struct user_namespace; struct super_block; +struct ve_struct; struct inode; extern struct idr btf_idr; @@ -1522,6 +1523,13 @@ struct bpf_prog_aux { void *security; #endif struct bpf_token *token; +#ifdef CONFIG_VE + /* VE that loaded the program via VE_FEATURE_BPF path and against whose + * bpf_prog_avail_nr counter the program is accounted. NULL for programs + * loaded through the regular (non VE-restricted) path. 
+ */ + struct ve_struct *owner_ve; +#endif struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; diff --git a/include/linux/ve.h b/include/linux/ve.h index 224acf012821f..88b4d531c466e 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -76,6 +76,9 @@ struct ve_struct { atomic_t netif_avail_nr; int netif_max_nr; + atomic_t bpf_prog_avail_nr; + int bpf_prog_max_nr; + atomic64_t _uevent_seqnum; int _randomize_va_space; @@ -149,6 +152,7 @@ extern int nr_ve; #define NETNS_MAX_NR_DEFAULT 256 /* number of net-namespaces per-VE */ #define NETIF_MAX_NR_DEFAULT 256 /* number of net-interfaces per-VE */ +#define BPF_PROG_MAX_NR_DEFAULT 256 /* number of loaded BPF progs per-VE */ extern unsigned int sysctl_ve_mount_nr; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4de8774458aca..7aaf73180fcdc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -38,6 +38,7 @@ #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <linux/execmem.h> +#include <linux/ve.h> #include <asm/barrier.h> #include <linux/unaligned.h> @@ -2828,6 +2829,13 @@ void bpf_prog_free(struct bpf_prog *fp) if (aux->dst_prog) bpf_prog_put(aux->dst_prog); bpf_token_put(aux->token); +#ifdef CONFIG_VE + if (aux->owner_ve) { + atomic_inc(&aux->owner_ve->bpf_prog_avail_nr); + put_ve(aux->owner_ve); + aux->owner_ve = NULL; + } +#endif INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0475a72c93c06..8bfea71716de9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2663,6 +2663,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; struct bpf_token *token = NULL; + struct ve_struct *load_ve = NULL; bool bpf_cap; int err; char license[128]; @@ -2744,6 +2745,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if 
(is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) goto put_token; +#ifdef CONFIG_VE + /* Restrict the number of BPF programs that can be loaded via the + * VE-allowed path. Without this, a single container could exhaust + * the system-wide bpf JIT memory budget by loading excessive + * numbers of CGROUP_DEVICE programs. + */ + if (!bpf_cap && type == BPF_PROG_TYPE_CGROUP_DEVICE) { + load_ve = get_exec_env(); + if (atomic_dec_if_positive(&load_ve->bpf_prog_avail_nr) < 0) { + load_ve = NULL; + err = -ENOSPC; + goto put_token; + } + } +#endif + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is */ @@ -2809,6 +2826,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; +#ifdef CONFIG_VE + /* Hand the avail_nr slot reservation over to the prog. bpf_prog_free() + * will release it via put_ve + counter increment. + */ + if (load_ve) { + prog->aux->owner_ve = get_ve(load_ve); + load_ve = NULL; + } +#endif + /* move token into prog->aux, reuse taken refcnt */ prog->aux->token = token; token = NULL; @@ -2932,6 +2959,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); put_token: +#ifdef CONFIG_VE + /* The load_ve is non-NULL only if we decremented bpf_prog_avail_nr + * but did not hand the reservation off to the prog yet (i.e. failure + * happened before bpf_prog_alloc()). Roll back the counter. 
+ */ + if (load_ve) + atomic_inc(&load_ve->bpf_prog_avail_nr); +#endif bpf_token_put(token); return err; } diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 198c82f010cc1..48da546117bb7 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -76,6 +76,8 @@ struct ve_struct ve0 = { .netns_max_nr = INT_MAX, .netif_avail_nr = ATOMIC_INIT(INT_MAX), .netif_max_nr = INT_MAX, + .bpf_prog_avail_nr = ATOMIC_INIT(INT_MAX), + .bpf_prog_max_nr = INT_MAX, .fsync_enable = FSYNC_FILTERED, ._randomize_va_space = #ifdef CONFIG_COMPAT_BRK @@ -983,6 +985,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ atomic_set(&ve->netif_avail_nr, NETIF_MAX_NR_DEFAULT); ve->netif_max_nr = NETIF_MAX_NR_DEFAULT; + atomic_set(&ve->bpf_prog_avail_nr, BPF_PROG_MAX_NR_DEFAULT); + ve->bpf_prog_max_nr = BPF_PROG_MAX_NR_DEFAULT; + err = ve_log_init(ve); if (err) goto err_log; -- 2.54.0 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
