The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxc/pull/3194
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From ceaa8d4be7ae2a549925f9c24b86169aa64c82d8 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Thu, 28 Nov 2019 16:22:36 +0100 Subject: [PATCH] [WIP]: cgroups: add cgroup2 device controller support Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- src/lxc/Makefile.am | 2 + src/lxc/cgroups/cgfsng.c | 163 +++++++++++- src/lxc/cgroups/cgroup.h | 2 + src/lxc/cgroups/cgroup2_devices.c | 411 ++++++++++++++++++++++++++++++ src/lxc/cgroups/cgroup2_devices.h | 81 ++++++ src/lxc/conf.c | 2 + src/lxc/conf.h | 1 + src/lxc/macro.h | 78 ++++++ src/lxc/start.c | 6 + 9 files changed, 737 insertions(+), 9 deletions(-) create mode 100644 src/lxc/cgroups/cgroup2_devices.c create mode 100644 src/lxc/cgroups/cgroup2_devices.h diff --git a/src/lxc/Makefile.am b/src/lxc/Makefile.am index 4b18ac5d82..56c64f596a 100644 --- a/src/lxc/Makefile.am +++ b/src/lxc/Makefile.am @@ -7,6 +7,7 @@ noinst_HEADERS = api_extensions.h \ caps.h \ cgroups/cgroup.h \ cgroups/cgroup_utils.h \ + cgroups/cgroup2_devices.h \ compiler.h \ conf.h \ confile.h \ @@ -95,6 +96,7 @@ liblxc_la_SOURCES = af_unix.c af_unix.h \ caps.c caps.h \ cgroups/cgfsng.c \ cgroups/cgroup.c cgroups/cgroup.h \ + cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \ cgroups/cgroup_utils.c cgroups/cgroup_utils.h \ compiler.h \ commands.c commands.h \ diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 1e6a45cff2..3db0602dbc 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -54,6 +54,7 @@ #include "caps.h" #include "cgroup.h" +#include "cgroup2_devices.h" #include "cgroup_utils.h" #include "commands.h" #include "conf.h" @@ -2474,8 +2475,17 @@ static bool __cg_legacy_setup_limits(struct cgroup_ops *ops, return ret; } +struct dev_exception_item { + char type; + int major; + int minor; + char access[100]; + int allow; +}; + static bool __cg_unified_setup_limits(struct cgroup_ops *ops, - struct lxc_list *cgroup_settings) + struct lxc_list 
*cgroup_settings, + struct lxc_conf *conf) { struct lxc_list *iterator; struct hierarchy *h = ops->unified; @@ -2486,17 +2496,130 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops, if (!h) return false; - lxc_list_for_each(iterator, cgroup_settings) { + lxc_list_for_each (iterator, cgroup_settings) { __do_free char *fullpath = NULL; int ret; struct lxc_cgroup *cg = iterator->elem; - fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL); - ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666); - if (ret < 0) { - SYSERROR("Failed to set \"%s\" to \"%s\"", - cg->subsystem, cg->value); - return false; + if (strncmp("devices", cg->subsystem, 7) == 0) { + const char *val = cg->value; + struct dev_exception_item ex = {0}; + int count, rc = 0; + char temp[50]; + struct bpf_program *device; + + if (conf->cgroup2_devices) { + device = conf->cgroup2_devices; + } else { + device = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE); + if (device) + device = bpf_program_init(device); + } + if (!device) { + ERROR("Failed to create new ebpf device program"); + return false; + } + + switch (*val) { + case 'a': + __fallthrough; + case 'b': + __fallthrough; + case 'c': + ex.type = *val; + break; + default: + return false; + } + + val++; + if (!isspace(*val)) + return false; + val++; + if (*val == '*') { + ex.major = ~0; + val++; + } else if (isdigit(*val)) { + memset(temp, 0, sizeof(temp)); + for (count = 0; count < sizeof(temp) - 1; + count++) { + temp[count] = *val; + val++; + if (!isdigit(*val)) + break; + } + rc = lxc_safe_uint(temp, &ex.major); + if (rc) + return false; + } else { + return false; + } + if (*val != ':') + return false; + val++; + + /* read minor */ + if (*val == '*') { + ex.minor = ~0; + val++; + } else if (isdigit(*val)) { + memset(temp, 0, sizeof(temp)); + for (count = 0; count < sizeof(temp) - 1; + count++) { + temp[count] = *val; + val++; + if (!isdigit(*val)) + break; + } + rc = lxc_safe_uint(temp, &ex.minor); 
+ if (rc) + return false; + } else { + return false; + } + if (!isspace(*val)) + return false; + for (val++, count = 0; count < 3; count++, val++) { + switch (*val) { + case 'r': + ex.access[count] = *val; + break; + case 'w': + ex.access[count] = *val; + break; + case 'm': + ex.access[count] = *val; + break; + case '\n': + case '\0': + count = 3; + break; + default: + return false; + } + } + + if (strcmp("devices.allow", cg->subsystem) == 0) + ex.allow = 1; + + device = bpf_program_append_device(device, ex.type, + ex.major, ex.minor, + ex.access, ex.allow); + if (!device) { + ERROR("Failed to add new rule to bpf device program"); + return false; + } + } else { + + fullpath = must_make_path(h->container_full_path, + cg->subsystem, NULL); + ret = lxc_write_to_file(fullpath, cg->value, + strlen(cg->value), false, 0666); + if (ret < 0) { + SYSERROR("Failed to set \"%s\" to \"%s\"", + cg->subsystem, cg->value); + return false; + } } TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value); } @@ -2505,6 +2628,27 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops, return true; } +__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops, + struct lxc_handler *handler) +{ + struct hierarchy *h = ops->unified; + struct bpf_program *device = handler->conf->cgroup2_devices; + + if (!h) + return false; + + if (!device) + return true; + + device = bpf_program_complete_finalize(device); + if (!device) + return false; + + return bpf_program_cgroup_attach(device, BPF_CGROUP_DEVICE, + h->container_full_path, + BPF_F_ALLOW_MULTI); +} + __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops, struct lxc_conf *conf, bool do_devices) @@ -2512,7 +2656,7 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops, if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices)) return false; - return __cg_unified_setup_limits(ops, &conf->cgroup2); + return __cg_unified_setup_limits(ops, &conf->cgroup2, conf); } static bool 
cgroup_use_wants_controllers(const struct cgroup_ops *ops, @@ -2893,6 +3037,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf) cgfsng_ops->chown = cgfsng_chown; cgfsng_ops->mount = cgfsng_mount; cgfsng_ops->nrtasks = cgfsng_nrtasks; + cgfsng_ops->devices_activate = cgfsng_devices_activate; return move_ptr(cgfsng_ops); } diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h index 6ab5187c25..bb6c91cce8 100644 --- a/src/lxc/cgroups/cgroup.h +++ b/src/lxc/cgroups/cgroup.h @@ -164,6 +164,8 @@ struct cgroup_ops { bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler, const char *root, int type); int (*nrtasks)(struct cgroup_ops *ops); + bool (*devices_activate)(struct cgroup_ops *ops, + struct lxc_handler *handler); }; extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf); diff --git a/src/lxc/cgroups/cgroup2_devices.c b/src/lxc/cgroups/cgroup2_devices.c new file mode 100644 index 0000000000..c3c897a011 --- /dev/null +++ b/src/lxc/cgroups/cgroup2_devices.c @@ -0,0 +1,411 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#include <fcntl.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <stddef.h> +#include <stdbool.h> +#include <stdint.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <unistd.h> + +#include "cgroup2_devices.h" +#include "macro.h" +#include "memory_utils.h" + +static struct bpf_program *bpf_program_add_instructions(struct bpf_program *prog, + const struct bpf_insn *instructions, + size_t count) +{ + + struct bpf_insn *new_insn; + + /* Don't allow modification after we loaded things into the kernel. 
*/ + if (prog->kernel_fd >= 0) + return NULL; + + new_insn = + realloc(prog->instructions, + sizeof(struct bpf_insn) * (count + prog->n_instructions)); + if (!new_insn) + return NULL; + + prog->instructions = new_insn; + memcpy(prog->instructions + prog->n_instructions, instructions, + sizeof(struct bpf_insn) * count); + prog->n_instructions += count; + + return prog; +} + +static struct bpf_program *bpf_program_free(struct bpf_program *prog) +{ + /* Unfortunately, the kernel currently doesn't implicitly detach BPF + * programs from their cgroups when the last fd to the BPF program is + * closed. This has nasty side-effects since this means that abnormally + * terminated programs that attached one of their BPF programs to a + * cgroup will leave this programs pinned for good with zero chance of + * recovery, until the cgroup is removed. This is particularly + * problematic if the cgroup in question is the root cgroup (or any + * other cgroup belonging to a service that cannot be restarted during + * operation, such as dbus), as the memory for the BPF program can only + * be reclaimed through a reboot. To counter this, we track closely to + * which cgroup a program was attached to and will detach it on our own + * whenever we close the BPF fd. 
*/ + (void)bpf_program_cgroup_detach(prog); + + close(prog->kernel_fd); + free(prog->instructions); + free(prog->attached_path); + free(prog); + + return NULL; +} + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn){.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0}) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn){.code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM}) + +/* Short form of mov, dst_reg = src_reg */ +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn){.code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM}) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn){.code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0}) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0}) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM}) + +/* Program exit */ +#define BPF_EXIT_INSN() \ + ((struct bpf_insn){.code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0}) + +static int bpf_access_mask(const char *acc) +{ + int mask = 0; + + for (; *acc; acc++) + switch (*acc) { + case 'r': + mask |= BPF_DEVCG_ACC_READ; + break; + case 'w': + mask |= BPF_DEVCG_ACC_WRITE; + break; + case 'm': + mask |= BPF_DEVCG_ACC_MKNOD; + break; + default: + return -EINVAL; + } + + return 
mask; +} + +static int bpf_device_type(char type) +{ + switch (type) { + case 'a': + return 0; + case 'b': + return BPF_DEVCG_DEV_BLOCK; + case 'c': + return BPF_DEVCG_DEV_CHAR; + } + + return -1; +} + +static inline bool bpf_device_all_access(int access_mask) +{ + return (access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | + BPF_DEVCG_ACC_MKNOD)); +} + +struct bpf_program *bpf_program_new(uint32_t prog_type) +{ + __do_free struct bpf_program *prog = NULL; + + prog = calloc(1, sizeof(struct bpf_program)); + if (!prog) + return NULL; + + prog->prog_type = prog_type; + prog->kernel_fd = -EBADF; + + return move_ptr(prog); +} + +struct bpf_program *bpf_program_init(struct bpf_program *prog) +{ + const struct bpf_insn pre_insn[] = { + /* load device type to r2 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF), + + /* load access type to r3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), + + /* load major number to r4 */ + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, major)), + + /* load minor number to r5 */ + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, minor)), + }; + + return bpf_program_add_instructions(prog, pre_insn, ARRAY_SIZE(pre_insn)); +} + +struct bpf_program *bpf_program_append_device(struct bpf_program *prog, + char type, int major, int minor, + const char *access, int allow) +{ + int jump_nr = 1; + struct bpf_insn bpf_access_decision[] = { + BPF_MOV64_IMM(BPF_REG_0, allow), + BPF_EXIT_INSN(), + }; + int access_mask; + int device_type; + + device_type = bpf_device_type(type); + if (device_type < 0) + return NULL; + + if (device_type > 0) + jump_nr++; + + access_mask = bpf_access_mask(access); + if (!bpf_device_all_access(access_mask)) + jump_nr += 3; + + if (major >= 0) + jump_nr++; + + if (minor >= 0) + 
jump_nr++; + + if (device_type > 0) { + puts("A"); + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--), + }; + + if (bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins))) + return NULL; + } + + if (!bpf_device_all_access(access_mask)) { + puts("B"); + struct bpf_insn ins[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr), /* compare access type */ + }; + + jump_nr -= 3; + if (bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins))) + return NULL; + } + + if (major >= 0) { + puts("C"); + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, jump_nr--), + }; + + if (bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins))) + return NULL; + } + + if (minor >= 0) { + puts("D"); + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, jump_nr--), + }; + + if (bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins))) + return NULL; + } + + return bpf_program_add_instructions(prog, bpf_access_decision, + ARRAY_SIZE(bpf_access_decision)); +} + +struct bpf_program *bpf_program_complete_finalize(struct bpf_program *prog) +{ + struct bpf_insn ins[] = { + BPF_MOV64_IMM(BPF_REG_0, 0 /* This determines blacklist or whitelist. 
*/), + BPF_EXIT_INSN(), + }; + + return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); +} + +static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf, + size_t log_size) +{ + union bpf_attr attr; + + if (prog->kernel_fd >= 0) { + memset(log_buf, 0, log_size); + return 0; + } + + attr = (union bpf_attr){ + .prog_type = prog->prog_type, + .insns = PTR_TO_UINT64(prog->instructions), + .insn_cnt = prog->n_instructions, + .license = PTR_TO_UINT64("GPL"), + .log_buf = PTR_TO_UINT64(log_buf), + .log_level = !!log_buf, + .log_size = log_size, + }; + + prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (prog->kernel_fd < 0) + return -1; + + return 0; +} + +struct bpf_program *bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags) +{ + __do_free char *copy = NULL; + __do_close_prot_errno int fd = -EBADF; + union bpf_attr attr; + int r; + + if (flags & ~(BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI)) + return NULL; + + /* We need to track which cgroup the program is attached to, and we can + * only track one attachment, hence let's refuse this early. */ + if (prog->attached_path) { + if (prog->attached_type != type) + return NULL; + if (prog->attached_flags != flags) + return NULL; + + /* Here's a shortcut: if we previously attached this program already, + * then we don't have to do so again. Well, with one exception: + * if we are in BPF_F_ALLOW_OVERRIDE mode then someone else might have + * replaced our program since the last time, hence let's reattach + * it again, just to be safe. In flags + * == 0 mode this is not an issue since nobody else can replace + * our program in that case, and in flags + * == BPF_F_ALLOW_MULTI mode any other's program would be installed + * in addition to ours hence ours would remain in effect. */ + if (flags != BPF_F_ALLOW_OVERRIDE) + return prog; + } + + /* Ensure we have a kernel object for this. 
*/ + r = bpf_program_load_kernel(prog, NULL, 0); + if (r < 0) + return NULL; + + copy = strdup(path); + if (!copy) + return NULL; + + fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (fd < 0) + return NULL; + + attr = (union bpf_attr){ + .attach_type = type, + .target_fd = fd, + .attach_bpf_fd = prog->kernel_fd, + .attach_flags = flags, + }; + + if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) + return NULL; + + free_and_replace(prog->attached_path, copy); + prog->attached_type = type; + prog->attached_flags = flags; + + return prog; +} + +struct bpf_program *bpf_program_cgroup_detach(struct bpf_program *prog) +{ + __do_close_prot_errno int fd = -EBADF; + + if (!prog->attached_path) + return NULL; + + fd = open(prog->attached_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) + return NULL; + + /* If the cgroup does not exist anymore, then we don't have to + * explicitly detach, it got detached implicitly by the removal, hence don't complain */ + + } else { + union bpf_attr attr; + + attr = (union bpf_attr){ + .attach_type = prog->attached_type, + .target_fd = fd, + .attach_bpf_fd = prog->kernel_fd, + }; + + if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) + return NULL; + } + + free(prog->attached_path); + prog->attached_path = NULL; + + return prog; +} + +void lxc_clear_cgroup2_devices(struct lxc_conf *conf) +{ + (void)bpf_program_cgroup_detach(conf->cgroup2_devices); + (void)bpf_program_free(conf->cgroup2_devices); +} diff --git a/src/lxc/cgroups/cgroup2_devices.h b/src/lxc/cgroups/cgroup2_devices.h new file mode 100644 index 0000000000..e9893ce21f --- /dev/null +++ b/src/lxc/cgroups/cgroup2_devices.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef __LXC_CGROUP2_DEVICES_H +#define __LXC_CGROUP2_DEVICES_H + +#include <fcntl.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include 
<sys/types.h> +#include <unistd.h> + +#include "conf.h" + +#if !HAVE_BPF +#if !(defined __NR_bpf && __NR_bpf > 0) +#if defined __NR_bpf +#undef __NR_bpf +#endif +#if defined __i386__ +#define __NR_bpf 357 +#elif defined __x86_64__ +#define __NR_bpf 321 +#elif defined __aarch64__ +#define __NR_bpf 280 +#elif defined __arm__ +#define __NR_bpf 386 +#elif defined __sparc__ +#define __NR_bpf 349 +#elif defined __s390__ +#define __NR_bpf 351 +#elif defined __tilegx__ +#define __NR_bpf 280 +#else +#warning "__NR_bpf not defined for your architecture" +#endif +#endif + +union bpf_attr; + +static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size) +{ +#ifdef __NR_bpf + return (int)syscall(__NR_bpf, cmd, attr, size); +#else + errno = ENOSYS; + return -1; +#endif +} + +#define bpf missing_bpf +#endif + +struct bpf_program { + int kernel_fd; + uint32_t prog_type; + + size_t n_instructions; + struct bpf_insn *instructions; + + char *attached_path; + int attached_type; + uint32_t attached_flags; +}; + +struct bpf_program *bpf_program_new(uint32_t prog_type); +struct bpf_program *bpf_program_init(struct bpf_program *prog); +struct bpf_program *bpf_program_append_device(struct bpf_program *prog, + char type, int major, int minor, + const char *access, int allow); +struct bpf_program *bpf_program_complete_finalize(struct bpf_program *prog); +struct bpf_program *bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags); +struct bpf_program *bpf_program_cgroup_detach(struct bpf_program *prog); +void lxc_clear_cgroup2_devices(struct lxc_conf *conf); + +#endif /* __LXC_CGROUP2_DEVICES_H */ diff --git a/src/lxc/conf.c b/src/lxc/conf.c index 06e4adcc38..c03b663835 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -57,6 +57,7 @@ #include "af_unix.h" #include "caps.h" #include "cgroup.h" +#include "cgroup2_devices.h" #include "conf.h" #include "config.h" #include "confile.h" @@ -4118,6 +4119,7 @@ void lxc_conf_free(struct lxc_conf 
*conf) lxc_clear_config_keepcaps(conf); lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC); lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC); + lxc_clear_cgroup2_devices(conf); lxc_clear_hooks(conf, "lxc.hook"); lxc_clear_mount_entries(conf); lxc_clear_idmaps(conf); diff --git a/src/lxc/conf.h b/src/lxc/conf.h index 9f4a93d0b2..741ac4f096 100644 --- a/src/lxc/conf.h +++ b/src/lxc/conf.h @@ -241,6 +241,7 @@ struct lxc_conf { struct { struct lxc_list cgroup; struct lxc_list cgroup2; + struct bpf_program *cgroup2_devices; }; struct { diff --git a/src/lxc/macro.h b/src/lxc/macro.h index f96a90019e..6f3379b3c4 100644 --- a/src/lxc/macro.h +++ b/src/lxc/macro.h @@ -429,6 +429,8 @@ enum { #define PTR_TO_INTMAX(p) ((intmax_t)((intptr_t)(p))) #define INTMAX_TO_PTR(u) ((void *)((intptr_t)(u))) +#define PTR_TO_UINT64(p) ((uint64_t)((intptr_t)(p))) + #define LXC_INVALID_UID ((uid_t)-1) #define LXC_INVALID_GID ((gid_t)-1) @@ -465,4 +467,80 @@ enum { #define LXC_TIMESTAMP_FNAME "ts" #define LXC_COMMENT_FNAME "comment" +/* Taken from systemd. */ +#define free_and_replace(a, b) \ + ({ \ + free(a); \ + (a) = (b); \ + (b) = NULL; \ + 0; \ + }) + +#define XCONCATENATE(x, y) x##y +#define CONCATENATE(x, y) XCONCATENATE(x, y) +#define UNIQ_T(x, uniq) CONCATENATE(__unique_prefix_, CONCATENATE(x, uniq)) +#define UNIQ __COUNTER__ +#undef MIN +#define MIN(a, b) __MIN(UNIQ, (a), UNIQ, (b)) +#define __MIN(aq, a, bq, b) \ + ({ \ + const typeof(a) UNIQ_T(A, aq) = (a); \ + const typeof(b) UNIQ_T(B, bq) = (b); \ + UNIQ_T(A, aq) < UNIQ_T(B, bq) ? UNIQ_T(A, aq) : UNIQ_T(B, bq); \ + }) + +/* Taken from the kernel. */ + +/* + * min()/max()/clamp() macros must accomplish three things: + * + * - avoid multiple evaluations of the arguments (so side-effects like + * "x++" happen only once) when non-constant. + * - perform strict type-checking (to generate warnings instead of + * nasty runtime surprises). See the "unnecessary" pointer comparison + * in __typecheck(). 
+ * - retain result as a constant expressions when called with only + * constant expressions (to avoid tripping VLA warnings in stack + * allocation usage). + */ +#define __typecheck(x, y) (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) + +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker <martin.uec...@med.uni-goettingen.de> + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x)*0l)) : (int *)8))) + +#define __no_side_effects(x, y) (__is_constexpr(x) && __is_constexpr(y)) + +#define __safe_cmp(x, y) (__typecheck(x, y) && __no_side_effects(x, y)) + +#define __cmp(x, y, op) ((x)op(y) ? (x) : (y)) + +#define __cmp_once(x, y, unique_x, unique_y, op) \ + ({ \ + typeof(x) unique_x = (x); \ + typeof(y) unique_y = (y); \ + __cmp(unique_x, unique_y, op); \ + }) + +#define __careful_cmp(x, y, op) \ + __builtin_choose_expr(__safe_cmp(x, y), __cmp(x, y, op), \ + __cmp_once(x, y, __UNIQUE_ID(__x), \ + __UNIQUE_ID(__y), op)) + +/** + * min - return minimum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define min(x, y) __careful_cmp(x, y, <) + +#define ARRAY_SIZE(x) \ + (__builtin_choose_expr(!__builtin_types_compatible_p(typeof(x), \ + typeof(&*(x))), \ + sizeof(x) / sizeof((x)[0]), ((void)0))) + #endif /* __LXC_MACRO_H */ diff --git a/src/lxc/start.c b/src/lxc/start.c index 3cfc8b2f57..ec1557fdec 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1912,6 +1912,12 @@ static int lxc_spawn(struct lxc_handler *handler) } TRACE("Set up legacy device cgroup controller limits"); + if (!cgroup_ops->devices_activate(cgroup_ops, handler)) { + ERROR("Failed to setup cgroup2 device controller limits"); + goto out_delete_net; + } + TRACE("Set up cgroup2 device controller limits"); + if (handler->ns_clone_flags & CLONE_NEWCGROUP) { /* Now we're ready to preserve the cgroup namespace 
*/ ret = lxc_try_preserve_ns(handler->pid, "cgroup");
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel