The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxcfs/pull/318
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === Mostly based on code I've written for liblxc. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
From 4f8198790acda0337010090255aac90b9f943902 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian.brau...@ubuntu.com> Date: Thu, 20 Feb 2020 16:30:47 +0100 Subject: [PATCH] bindings: add infrastructure for cgroup2 support Mostly based on code I've written for liblxc. Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- Makefile.am | 10 +- bindings.c | 497 ++++++------------------ bindings.h | 4 +- cgroups/cgfsng.c | 787 ++++++++++++++++++++++++++++++++++++++ cgroups/cgroup.c | 79 ++++ cgroups/cgroup.h | 150 ++++++++ cgroups/cgroup2_devices.c | 457 ++++++++++++++++++++++ cgroups/cgroup2_devices.h | 154 ++++++++ cgroups/cgroup_utils.c | 726 +++++++++++++++++++++++++++++++++++ cgroups/cgroup_utils.h | 72 ++++ configure.ac | 9 + macro.h | 56 ++- memory_utils.h | 2 + sysfs_fuse.c | 4 +- 14 files changed, 2618 insertions(+), 389 deletions(-) create mode 100644 cgroups/cgfsng.c create mode 100644 cgroups/cgroup.c create mode 100644 cgroups/cgroup.h create mode 100644 cgroups/cgroup2_devices.c create mode 100644 cgroups/cgroup2_devices.h create mode 100644 cgroups/cgroup_utils.c create mode 100644 cgroups/cgroup_utils.h diff --git a/Makefile.am b/Makefile.am index 13fb1e3..e783f29 100644 --- a/Makefile.am +++ b/Makefile.am @@ -13,6 +13,10 @@ AM_LDFLAGS = $(FUSE_LIBS) -pthread AM_CFLAGS += -DRUNTIME_PATH=\"$(RUNTIME_PATH)\" liblxcfs_la_SOURCES = bindings.c bindings.h \ + cgroups/cgfsng.c \ + cgroups/cgroup.c cgroups/cgroup.h \ + cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \ + cgroups/cgroup_utils.c cgroups/cgroup_utils.h \ cpuset.c \ memory_utils.h \ sysfs_fuse.c sysfs_fuse.h @@ -20,13 +24,17 @@ liblxcfs_la_CFLAGS = $(AM_CFLAGS) liblxcfs_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared liblxcfstest_la_SOURCES = bindings.c bindings.h \ + cgroups/cgfsng.c \ + cgroups/cgroup.c cgroups/cgroup.h \ + cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \ + cgroups/cgroup_utils.c cgroups/cgroup_utils.h \ cpuset.c \ memory_utils.h \ 
sysfs_fuse.c sysfs_fuse.h liblxcfstest_la_CFLAGS = $(AM_CFLAGS) -DRELOADTEST liblxcfstest_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared -noinst_HEADERS = bindings.h macro.h memory_utils.h sysfs_fuse.h +noinst_HEADERS = bindings.h cgroups/cgroup.h cgroups/cgroup2_devices.h cgroups/cgroup_utils.h macro.h memory_utils.h sysfs_fuse.h sodir=$(libdir) lxcfs_LTLIBRARIES = liblxcfs.la diff --git a/bindings.c b/bindings.c index 4a8a421..ab0cd71 100644 --- a/bindings.c +++ b/bindings.c @@ -38,6 +38,8 @@ #include <sys/vfs.h> #include "bindings.h" +#include "cgroups/cgroup.h" +#include "cgroups/cgroup_utils.h" #include "memory_utils.h" #include "config.h" @@ -410,25 +412,8 @@ static void lock_mutex(pthread_mutex_t *l) } } -/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. - * Number of hierarchies mounted. */ -static int num_hierarchies; +static struct cgroup_ops *cgroup_ops; -/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. - * Hierachies mounted {cpuset, blkio, ...}: - * Initialized via __constructor__ collect_and_mount_subsystems(). */ -static char **hierarchies; - -/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run. - * Open file descriptors: - * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a - * private mount namespace. - * Initialized via __constructor__ collect_and_mount_subsystems(). - * @fd_hierarchies[i] can be used to perform file operations on the cgroup - * mounts and respective files in the private namespace even when located in - * another namespace using the *at() family of functions - * {openat(), fchownat(), ...}. 
*/ -static int *fd_hierarchies; static int cgroup_mount_ns_fd = -1; static void unlock_mutex(pthread_mutex_t *l) @@ -599,70 +584,6 @@ static int is_dir(const char *path, int fd) return 0; } -static char *must_copy_string(const char *str) -{ - char *dup = NULL; - if (!str) - return NULL; - do { - dup = strdup(str); - } while (!dup); - - return dup; -} - -static inline void drop_trailing_newlines(char *s) -{ - int l; - - for (l=strlen(s); l>0 && s[l-1] == '\n'; l--) - s[l-1] = '\0'; -} - -#define BATCH_SIZE 50 -static void dorealloc(char **mem, size_t oldlen, size_t newlen) -{ - int newbatches = (newlen / BATCH_SIZE) + 1; - int oldbatches = (oldlen / BATCH_SIZE) + 1; - - if (!*mem || newbatches > oldbatches) { - char *tmp; - do { - tmp = realloc(*mem, newbatches * BATCH_SIZE); - } while (!tmp); - *mem = tmp; - } -} -static void append_line(char **contents, size_t *len, char *line, ssize_t linelen) -{ - size_t newlen = *len + linelen; - dorealloc(contents, *len, newlen + 1); - memcpy(*contents + *len, line, linelen+1); - *len = newlen; -} - -static char *slurp_file(const char *from, int fd) -{ - char *line = NULL; - char *contents = NULL; - FILE *f = fdopen(fd, "r"); - size_t len = 0, fulllen = 0; - ssize_t linelen; - - if (!f) - return NULL; - - while ((linelen = getline(&line, &len, f)) != -1) { - append_line(&contents, &fulllen, line, linelen); - } - fclose(f); - - if (contents) - drop_trailing_newlines(contents); - free(line); - return contents; -} - static int preserve_ns(const int pid, const char *ns) { int ret; @@ -776,79 +697,29 @@ struct cgfs_files { uint32_t mode; }; -#define ALLOC_NUM 20 -static bool store_hierarchy(char *stridx, char *h) -{ - if (num_hierarchies % ALLOC_NUM == 0) { - size_t n = (num_hierarchies / ALLOC_NUM) + 1; - n *= ALLOC_NUM; - char **tmp = realloc(hierarchies, n * sizeof(char *)); - if (!tmp) { - lxcfs_error("%s\n", strerror(errno)); - exit(1); - } - hierarchies = tmp; - } - - hierarchies[num_hierarchies++] = must_copy_string(h); - 
return true; -} - static void print_subsystems(void) { - int i; + int i = 0; fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd); fprintf(stderr, "hierarchies:\n"); - for (i = 0; i < num_hierarchies; i++) { - if (hierarchies[i]) - fprintf(stderr, " %2d: fd: %3d: %s\n", i, - fd_hierarchies[i], hierarchies[i]); + for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { + __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false); + fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: ""); } } -static bool in_comma_list(const char *needle, const char *haystack) -{ - const char *s = haystack, *e; - size_t nlen = strlen(needle); - - while (*s && (e = strchr(s, ','))) { - if (nlen != e - s) { - s = e + 1; - continue; - } - if (strncmp(needle, s, nlen) == 0) - return true; - s = e + 1; - } - if (strcmp(needle, s) == 0) - return true; - return false; -} - /* do we need to do any massaging here? I'm not sure... */ /* Return the mounted controller and store the corresponding open file descriptor * referring to the controller mountpoint in the private lxcfs namespace in * @cfd. */ -static char *find_mounted_controller(const char *controller, int *cfd) +static int find_mounted_controller(const char *controller) { - int i; - - for (i = 0; i < num_hierarchies; i++) { - if (!hierarchies[i]) - continue; - if (strcmp(hierarchies[i], controller) == 0) { - *cfd = fd_hierarchies[i]; - return hierarchies[i]; - } - if (in_comma_list(controller, hierarchies[i])) { - *cfd = fd_hierarchies[i]; - return hierarchies[i]; - } - } + struct hierarchy *h; - return NULL; + h = cgroup_ops->get_hierarchy(cgroup_ops, controller); + return h ? 
h->fd : -EBADF; } bool cgfs_set_value(const char *controller, const char *cgroup, const char *file, @@ -856,10 +727,10 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file { int ret, fd, cfd; size_t len; - char *fnam, *tmpc; + char *fnam; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return false; /* Make sure we pass a relative path to *at() family of functions. @@ -922,10 +793,10 @@ int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid) { int cfd; size_t len; - char *dirnam, *tmpc; + char *dirnam; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return -EINVAL; /* Make sure we pass a relative path to *at() family of functions. @@ -1012,11 +883,11 @@ bool cgfs_remove(const char *controller, const char *cg) { int fd, cfd; size_t len; - char *dirnam, *tmpc; + char *dirnam; bool bret; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return false; /* Make sure we pass a relative path to *at() family of functions. @@ -1039,10 +910,10 @@ bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode) { int cfd; size_t len; - char *pathname, *tmpc; + char *pathname; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return false; /* Make sure we pass a relative path to *at() family of functions. @@ -1076,11 +947,11 @@ int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t g { int cfd; size_t len; - char *pathname, *tmpc; + char *pathname; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) - return -EINVAL; + cfd = find_mounted_controller(controller); + if (cfd < 0) + return false; /* Make sure we pass a relative path to *at() family of functions. * . 
+ /file + \0 @@ -1102,11 +973,11 @@ FILE *open_pids_file(const char *controller, const char *cgroup) { int fd, cfd; size_t len; - char *pathname, *tmpc; + char *pathname; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) - return NULL; + cfd = find_mounted_controller(controller); + if (cfd < 0) + return false; /* Make sure we pass a relative path to *at() family of functions. * . + /cgroup + / "cgroup.procs" + \0 @@ -1128,15 +999,15 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool { int cfd, fd, ret; size_t len; - char *cg, *tmpc; + char *cg; char pathname[MAXPATHLEN]; size_t sz = 0, asz = 0; struct dirent *dirent; DIR *dir; - tmpc = find_mounted_controller(controller, &cfd); + cfd = find_mounted_controller(controller); *list = NULL; - if (!tmpc) + if (cfd < 0) return false; /* Make sure we pass a relative path to *at() family of functions. */ @@ -1233,12 +1104,12 @@ void free_keys(struct cgfs_files **keys) bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value) { - int ret, fd, cfd; + int ret, cfd; size_t len; - char *fnam, *tmpc; + char *fnam; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return false; /* Make sure we pass a relative path to *at() family of functions. 
@@ -1250,11 +1121,7 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file if (ret < 0 || (size_t)ret >= len) return false; - fd = openat(cfd, fnam, O_RDONLY); - if (fd < 0) - return false; - - *value = slurp_file(fnam, fd); + *value = readat_file(cfd, fnam); return *value != NULL; } @@ -1262,10 +1129,10 @@ bool cgfs_param_exist(const char *controller, const char *cgroup, const char *fi { int ret, cfd; size_t len; - char *fnam, *tmpc; + char *fnam; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return false; /* Make sure we pass a relative path to *at() family of functions. @@ -1284,12 +1151,12 @@ struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, cons { int ret, cfd; size_t len; - char *fnam, *tmpc; + char *fnam; struct stat sb; struct cgfs_files *newkey; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return false; if (file && *file == '/') @@ -1347,12 +1214,12 @@ bool is_child_cgroup(const char *controller, const char *cgroup, const char *f) { int cfd; size_t len; - char *fnam, *tmpc; + char *fnam; int ret; struct stat sb; - tmpc = find_mounted_controller(controller, &cfd); - if (!tmpc) + cfd = find_mounted_controller(controller); + if (cfd < 0) return false; /* Make sure we pass a relative path to *at() family of functions. 
@@ -1707,58 +1574,18 @@ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) return start; } -static void stripnewline(char *x) -{ - size_t l = strlen(x); - if (l && x[l-1] == '\n') - x[l-1] = '\0'; -} - char *get_pid_cgroup(pid_t pid, const char *contrl) { int cfd; - char fnam[PROCLEN]; - FILE *f; - char *answer = NULL; - char *line = NULL; - size_t len = 0; - int ret; - const char *h = find_mounted_controller(contrl, &cfd); - if (!h) - return NULL; - ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid); - if (ret < 0 || ret >= PROCLEN) - return NULL; - if (!(f = fopen(fnam, "r"))) - return NULL; + cfd = find_mounted_controller(contrl); + if (cfd < 0) + return false; - while (getline(&line, &len, f) != -1) { - char *c1, *c2; - if (!line[0]) - continue; - c1 = strchr(line, ':'); - if (!c1) - goto out; - c1++; - c2 = strchr(c1, ':'); - if (!c2) - goto out; - *c2 = '\0'; - if (strcmp(c1, h) != 0) - continue; - c2++; - stripnewline(c2); - do { - answer = strdup(c2); - } while (!answer); - break; - } + if (pure_unified_layout(cgroup_ops)) + return cg_unified_get_current_cgroup(pid); -out: - fclose(f); - free(line); - return answer; + return cg_legacy_get_current_cgroup(pid, contrl); } /* @@ -1939,10 +1766,9 @@ static char *pick_controller_from_path(struct fuse_context *fc, const char *path if (slash) *slash = '\0'; - int i; - for (i = 0; i < num_hierarchies; i++) { - if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0) - return hierarchies[i]; + for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { + if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0) + return (*h)->__controllers; } errno = ENOENT; return NULL; @@ -2005,7 +1831,7 @@ int cg_getattr(const char *path, struct stat *sb) int ret = -ENOENT; - if (!fc) + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; memset(sb, 0, sizeof(struct stat)); @@ -2110,7 +1936,7 @@ int cg_opendir(const char *path, struct fuse_file_info *fi) struct 
file_info *dir_info; char *controller = NULL; - if (!fc) + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; if (strcmp(path, "/cgroup") == 0) { @@ -2164,6 +1990,9 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset struct fuse_context *fc = fuse_get_context(); char **clist = NULL; + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0) return -EIO; @@ -2172,14 +2001,18 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset return -EIO; } if (!d->cgroup && !d->controller) { - // ls /var/lib/lxcfs/cgroup - just show list of controllers - int i; + /* + * ls /var/lib/lxcfs/cgroup - just show list of controllers. + * This only works with the legacy hierarchy. + */ + for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { + if (is_unified_hierarchy(*h)) + continue; - for (i = 0; i < num_hierarchies; i++) { - if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) { + if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0)) return -EIO; - } } + return 0; } @@ -2274,7 +2107,7 @@ int cg_open(const char *path, struct fuse_file_info *fi) struct fuse_context *fc = fuse_get_context(); int ret; - if (!fc) + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; controller = pick_controller_from_path(fc, path); @@ -2342,12 +2175,12 @@ int cg_access(const char *path, int mode) struct cgfs_files *k = NULL; struct fuse_context *fc = fuse_get_context(); + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + if (strcmp(path, "/cgroup") == 0) return 0; - if (!fc) - return -EIO; - controller = pick_controller_from_path(fc, path); if (!controller) return -errno; @@ -2758,6 +2591,9 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset, int ret, s; bool r; + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return 
-EIO; + if (f->type != LXC_TYPE_CGFILE) { lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read."); return -EIO; @@ -2766,9 +2602,6 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset, if (offset) return 0; - if (!fc) - return -EIO; - if (!f->controller) return -EINVAL; @@ -3068,6 +2901,9 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset, struct file_info *f = (struct file_info *)fi->fh; bool r; + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) + return -EIO; + if (f->type != LXC_TYPE_CGFILE) { lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); return -EIO; @@ -3076,9 +2912,6 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset, if (offset) return 0; - if (!fc) - return -EIO; - localbuf = alloca(size+1); localbuf[size] = '\0'; memcpy(localbuf, buf, size); @@ -3118,7 +2951,7 @@ int cg_chown(const char *path, uid_t uid, gid_t gid) const char *cgroup; int ret; - if (!fc) + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; if (strcmp(path, "/cgroup") == 0) @@ -3184,7 +3017,7 @@ int cg_chmod(const char *path, mode_t mode) const char *cgroup; int ret; - if (!fc) + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; if (strcmp(path, "/cgroup") == 0) @@ -3252,7 +3085,7 @@ int cg_mkdir(const char *path, mode_t mode) const char *cgroup; int ret; - if (!fc) + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; controller = pick_controller_from_path(fc, path); @@ -3306,7 +3139,7 @@ int cg_rmdir(const char *path) const char *cgroup; int ret; - if (!fc) + if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; controller = pick_controller_from_path(fc, path); @@ -3427,7 +3260,7 @@ static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char * } } -int read_file(const char *path, char *buf, size_t size, struct file_info *d) +int read_file_fuse(const char 
*path, char *buf, size_t size, struct file_info *d) { size_t linelen = 0, total_len = 0, rv = 0; char *line = NULL; @@ -3538,7 +3371,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, initpid = fc->pid; cg = get_pid_cgroup(initpid, "memory"); if (!cg) - return read_file("/proc/meminfo", buf, size, d); + return read_file_fuse("/proc/meminfo", buf, size, d); prune_init_slice(cg); memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); @@ -3828,14 +3661,13 @@ static double exact_cpu_count(const char *cg) bool use_cpuview(const char *cg) { int cfd; - char *tmpc; - tmpc = find_mounted_controller("cpu", &cfd); - if (!tmpc) + cfd = find_mounted_controller("cpu"); + if (cfd < 0) return false; - tmpc = find_mounted_controller("cpuacct", &cfd); - if (!tmpc) + cfd = find_mounted_controller("cpuacct"); + if (cfd < 0) return false; return true; @@ -3885,7 +3717,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset, initpid = fc->pid; cg = get_pid_cgroup(initpid, "cpuset"); if (!cg) - return read_file("proc/cpuinfo", buf, size, d); + return read_file_fuse("proc/cpuinfo", buf, size, d); prune_init_slice(cg); cpuset = get_cpuset(cg); @@ -4988,13 +4820,13 @@ static int proc_stat_read(char *buf, size_t size, off_t offset, * in some case cpuacct_usage.all in "/" will larger then /proc/stat */ if (initpid == 1) { - return read_file("/proc/stat", buf, size, d); + return read_file_fuse("/proc/stat", buf, size, d); } cg = get_pid_cgroup(initpid, "cpuset"); lxcfs_v("cg: %s\n", cg); if (!cg) - return read_file("/proc/stat", buf, size, d); + return read_file_fuse("/proc/stat", buf, size, d); prune_init_slice(cg); cpuset = get_cpuset(cg); @@ -5333,7 +5165,7 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset, initpid = fc->pid; cg = get_pid_cgroup(initpid, "blkio"); if (!cg) - return read_file("/proc/diskstats", buf, size, d); + return read_file_fuse("/proc/diskstats", buf, size, d); prune_init_slice(cg); if (!cgfs_get_value("blkio", 
cg, "blkio.io_serviced_recursive", &io_serviced_str)) @@ -5455,7 +5287,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, initpid = fc->pid; cg = get_pid_cgroup(initpid, "memory"); if (!cg) - return read_file("/proc/swaps", buf, size, d); + return read_file_fuse("/proc/swaps", buf, size, d); prune_init_slice(cg); memlimit = get_min_memlimit(cg, "memory.limit_in_bytes"); @@ -5810,14 +5642,14 @@ static int proc_loadavg_read(char *buf, size_t size, off_t offset, return total_len; } if (!loadavg) - return read_file("/proc/loadavg", buf, size, d); + return read_file_fuse("/proc/loadavg", buf, size, d); initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cg = get_pid_cgroup(initpid, "cpu"); if (!cg) - return read_file("/proc/loadavg", buf, size, d); + return read_file_fuse("/proc/loadavg", buf, size, d); prune_init_slice(cg); hash = calc_hash(cg) % LOAD_SIZE; @@ -5825,7 +5657,8 @@ static int proc_loadavg_read(char *buf, size_t size, off_t offset, /* First time */ if (n == NULL) { - if (!find_mounted_controller("cpu", &cfd)) { + cfd = find_mounted_controller("cpu"); + if (cfd >= 0) { /* * In locate_node() above, pthread_rwlock_unlock() isn't used * because delete is not allowed before read has ended. @@ -6069,30 +5902,6 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset, * Functions needed to setup cgroups in the __constructor__. 
*/ -static bool mkdir_p(const char *dir, mode_t mode) -{ - const char *tmp = dir; - const char *orig = dir; - char *makeme; - - do { - dir = tmp + strspn(tmp, "/"); - tmp = dir + strcspn(dir, "/"); - makeme = strndup(orig, dir - orig); - if (!makeme) - return false; - if (mkdir(makeme, mode) && errno != EEXIST) { - lxcfs_error("Failed to create directory '%s': %s.\n", - makeme, strerror(errno)); - free(makeme); - return false; - } - free(makeme); - } while(tmp != dir); - - return true; -} - static bool umount_if_mounted(void) { if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { @@ -6345,45 +6154,19 @@ static bool cgfs_prepare_mounts(void) static bool cgfs_mount_hierarchies(void) { - char *target; - size_t clen, len; - int i, ret; - - for (i = 0; i < num_hierarchies; i++) { - char *controller = hierarchies[i]; - - clen = strlen(controller); - len = strlen(BASEDIR) + clen + 2; - target = malloc(len); - if (!target) - return false; + if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) + return false; - ret = snprintf(target, len, "%s/%s", BASEDIR, controller); - if (ret < 0 || ret >= len) { - free(target); - return false; - } - if (mkdir(target, 0755) < 0 && errno != EEXIST) { - free(target); - return false; - } - if (!strcmp(controller, "unified")) - ret = mount("none", target, "cgroup2", 0, NULL); - else - ret = mount(controller, target, "cgroup", 0, controller); - if (ret < 0) { - lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno)); - free(target); - return false; - } + if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) + return false; - fd_hierarchies[i] = open(target, O_DIRECTORY); - if (fd_hierarchies[i] < 0) { - free(target); + for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { + __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); + (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); + if ((*h)->fd < 0) return false; - } - free(target); } + return true; } @@ -6405,45 +6188,13 @@ static 
bool cgfs_setup_controllers(void) static void __attribute__((constructor)) collect_and_mount_subsystems(void) { - FILE *f; - char *cret, *line = NULL; + char *cret; char cwd[MAXPATHLEN]; - size_t len = 0; - int i, init_ns = -1; - bool found_unified = false; + int init_ns = -1; - if ((f = fopen("/proc/self/cgroup", "r")) == NULL) { - lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno)); + cgroup_ops = cgroup_init(); + if (!cgroup_ops) return; - } - - while (getline(&line, &len, f) != -1) { - char *idx, *p, *p2; - - p = strchr(line, ':'); - if (!p) - goto out; - idx = line; - *(p++) = '\0'; - - p2 = strrchr(p, ':'); - if (!p2) - goto out; - *p2 = '\0'; - - /* With cgroupv2 /proc/self/cgroup can contain entries of the - * form: 0::/ This will cause lxcfs to fail the cgroup mounts - * because it parses out the empty string "" and later on passes - * it to mount(). Let's skip such entries. - */ - if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) { - found_unified = true; - p = "unified"; - } - - if (!store_hierarchy(line, p)) - goto out; - } /* Preserve initial namespace. 
*/ init_ns = preserve_mnt_ns(getpid()); @@ -6452,15 +6203,6 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) goto out; } - fd_hierarchies = malloc(sizeof(int) * num_hierarchies); - if (!fd_hierarchies) { - lxcfs_error("%s\n", strerror(errno)); - goto out; - } - - for (i = 0; i < num_hierarchies; i++) - fd_hierarchies[i] = -1; - cret = getcwd(cwd, MAXPATHLEN); if (!cret) lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno)); @@ -6488,26 +6230,15 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void) print_subsystems(); out: - free(line); - fclose(f); if (init_ns >= 0) close(init_ns); } static void __attribute__((destructor)) free_subsystems(void) { - int i; - lxcfs_debug("%s\n", "Running destructor for liblxcfs."); - for (i = 0; i < num_hierarchies; i++) { - if (hierarchies[i]) - free(hierarchies[i]); - if (fd_hierarchies && fd_hierarchies[i] >= 0) - close(fd_hierarchies[i]); - } - free(hierarchies); - free(fd_hierarchies); + cgroup_exit(cgroup_ops); free_cpuview(); if (cgroup_mount_ns_fd >= 0) diff --git a/bindings.h b/bindings.h index 250bbac..229d64c 100644 --- a/bindings.h +++ b/bindings.h @@ -75,8 +75,8 @@ extern int stop_load_daemon(pthread_t pid); extern pid_t lookup_initpid_in_store(pid_t qpid); extern char *get_pid_cgroup(pid_t pid, const char *contrl); -extern int read_file(const char *path, char *buf, size_t size, - struct file_info *d); +extern int read_file_fuse(const char *path, char *buf, size_t size, + struct file_info *d); extern void prune_init_slice(char *cg); extern char *get_cpuset(const char *cg); extern bool use_cpuview(const char *cg); diff --git a/cgroups/cgfsng.c b/cgroups/cgfsng.c new file mode 100644 index 0000000..08b719d --- /dev/null +++ b/cgroups/cgfsng.c @@ -0,0 +1,787 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +/* + * cgfs-ng.c: this is a new, simplified implementation of a filesystem + * cgroup backend. 
The original cgfs.c was designed to be as flexible + * as possible. It would try to find cgroup filesystems no matter where + * or how you had them mounted, and deduce the most usable mount for + * each controller. + * + * This new implementation assumes that cgroup filesystems are mounted + * under /sys/fs/cgroup/clist where clist is either the controller, or + * a comma-separated list of controllers. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <grp.h> +#include <linux/kdev_t.h> +#include <linux/types.h> +#include <poll.h> +#include <signal.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <unistd.h> + +#include "cgroup.h" +#include "cgroup2_devices.h" +#include "cgroup_utils.h" +#include "macro.h" +#include "memory_utils.h" + +static void free_string_list(char **clist) +{ + int i; + + if (!clist) + return; + + for (i = 0; clist[i]; i++) + free(clist[i]); + + free(clist); +} + +/* Given a pointer to a null-terminated array of pointers, realloc to add one + * entry, and point the new entry to NULL. Do not fail. Return the index to the + * second-to-last entry - that is, the one which is now available for use + * (keeping the list null-terminated). + */ +static int append_null_to_list(void ***list) +{ + int newentry = 0; + + if (*list) + for (; (*list)[newentry]; newentry++) + ; + + *list = must_realloc(*list, (newentry + 2) * sizeof(void **)); + (*list)[newentry + 1] = NULL; + return newentry; +} + +/* Given a null-terminated array of strings, check whether @entry is one of the + * strings. + */ +static bool string_in_list(char **list, const char *entry) +{ + int i; + + if (!list) + return false; + + for (i = 0; list[i]; i++) + if (strcmp(list[i], entry) == 0) + return true; + + return false; +} + +/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into + * "name=systemd". 
Do not fail. + */ +static char *cg_legacy_must_prefix_named(char *entry) +{ + size_t len; + char *prefixed; + + len = strlen(entry); + prefixed = must_realloc(NULL, len + 6); + + memcpy(prefixed, "name=", STRLITERALLEN("name=")); + memcpy(prefixed + STRLITERALLEN("name="), entry, len); + prefixed[len + 5] = '\0'; + + return prefixed; +} + +/* Append an entry to the clist. Do not fail. @clist must be NULL the first time + * we are called. + * + * We also handle named subsystems here. Any controller which is not a kernel + * subsystem, we prefix "name=". Any which is both a kernel and named subsystem, + * we refuse to use because we're not sure which we have here. + * (TODO: We could work around this in some cases by just remounting to be + * unambiguous, or by comparing mountpoint contents with current cgroup.) + * + * The last entry will always be NULL. + */ +static void must_append_controller(char **klist, char **nlist, char ***clist, + char *entry) +{ + int newentry; + char *copy; + + if (string_in_list(klist, entry) && string_in_list(nlist, entry)) + return; + + newentry = append_null_to_list((void ***)clist); + + if (strncmp(entry, "name=", 5) == 0) + copy = must_copy_string(entry); + else if (string_in_list(klist, entry)) + copy = must_copy_string(entry); + else + copy = cg_legacy_must_prefix_named(entry); + + (*clist)[newentry] = copy; +} + +/* Given a handler's cgroup data, return the struct hierarchy for the controller + * @c, or NULL if there is none. + */ +static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops, + const char *controller) +{ + int i; + + errno = ENOENT; + + if (!ops->hierarchies) + return NULL; + + for (i = 0; ops->hierarchies[i]; i++) { + if (!controller) { + /* This is the empty unified hierarchy. 
*/ + if (ops->hierarchies[i]->controllers && + !ops->hierarchies[i]->controllers[0]) + return ops->hierarchies[i]; + continue; + } else if (pure_unified_layout(ops) && + strcmp(controller, "devices") == 0) { + if (ops->unified->bpf_device_controller) + return ops->unified; + break; + } + + if (string_in_list(ops->hierarchies[i]->controllers, controller)) + return ops->hierarchies[i]; + } + + return NULL; +} + +static inline struct hierarchy *get_hierarchy(struct cgroup_ops *ops, + const char *controller) +{ + return cgfsng_get_hierarchy(ops, controller); +} + +/* Given two null-terminated lists of strings, return true if any string is in + * both. + */ +static bool controller_lists_intersect(char **l1, char **l2) +{ + int i; + + if (!l1 || !l2) + return false; + + for (i = 0; l1[i]; i++) { + if (string_in_list(l2, l1[i])) + return true; + } + + return false; +} + +/* For a null-terminated list of controllers @clist, return true if any of those + * controllers is already listed the null-terminated list of hierarchies @hlist. + * Realistically, if one is present, all must be present. + */ +static bool controller_list_is_dup(struct hierarchy **hlist, char **clist) +{ + int i; + + if (!hlist) + return false; + + for (i = 0; hlist[i]; i++) + if (controller_lists_intersect(hlist[i]->controllers, clist)) + return true; + + return false; +} + +/* Get the controllers from a mountinfo line There are other ways we could get + * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we + * could parse the mount options. But we simply assume that the mountpoint must + * be /sys/fs/cgroup/controller-list + */ +static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line, + int type, char **controllers) +{ + /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list + * for legacy hierarchies. 
+ */ + int i; + char *p2, *tok; + char *p = line, *sep = ","; + char **aret = NULL; + + for (i = 0; i < 4; i++) { + p = strchr(p, ' '); + if (!p) + return NULL; + p++; + } + + /* Note, if we change how mountinfo works, then our caller will need to + * verify /sys/fs/cgroup/ in this field. + */ + if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0) + return NULL; + + p += 15; + p2 = strchr(p, ' '); + if (!p2) + return NULL; + *p2 = '\0'; + + if (type == CGROUP_SUPER_MAGIC) { + __do_free char *dup = NULL; + + /* strdup() here for v1 hierarchies. Otherwise + * lxc_iterate_parts() will destroy mountpoints such as + * "/sys/fs/cgroup/cpu,cpuacct". + */ + dup = must_copy_string(p); + if (!dup) + return NULL; + + lxc_iterate_parts (tok, dup, sep) + must_append_controller(klist, nlist, &aret, tok); + *controllers = move_ptr(dup); + } + *p2 = ' '; + + return aret; +} + +static char **cg_unified_make_empty_controller(void) +{ + int newentry; + char **aret = NULL; + + newentry = append_null_to_list((void ***)&aret); + aret[newentry] = NULL; + return aret; +} + +static char **cg_unified_get_controllers(const char *file) +{ + __do_free char *buf = NULL; + char *sep = " \t\n"; + char **aret = NULL; + char *tok; + + buf = read_file(file); + if (!buf) + return NULL; + + lxc_iterate_parts(tok, buf, sep) { + int newentry; + char *copy; + + newentry = append_null_to_list((void ***)&aret); + copy = must_copy_string(tok); + aret[newentry] = copy; + } + + return aret; +} + +static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint, + char *container_base_path, int type) +{ + struct hierarchy *new; + int newentry; + + new = zalloc(sizeof(*new)); + new->controllers = clist; + new->mountpoint = mountpoint; + new->container_base_path = container_base_path; + new->version = type; + + newentry = append_null_to_list((void ***)h); + (*h)[newentry] = new; + return new; +} + +/* Get a copy of the mountpoint from @line, which is a line from + * 
/proc/self/mountinfo. + */ +static char *cg_hybrid_get_mountpoint(char *line) +{ + int i; + size_t len; + char *p2; + char *p = line, *sret = NULL; + + for (i = 0; i < 4; i++) { + p = strchr(p, ' '); + if (!p) + return NULL; + p++; + } + + if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0) + return NULL; + + p2 = strchr(p + 15, ' '); + if (!p2) + return NULL; + *p2 = '\0'; + + len = strlen(p); + sret = must_realloc(NULL, len + 1); + memcpy(sret, p, len); + sret[len] = '\0'; + return sret; +} + +static void must_append_string(char ***list, char *entry) +{ + int newentry; + char *copy; + + newentry = append_null_to_list((void ***)list); + copy = must_copy_string(entry); + (*list)[newentry] = copy; +} + +static int get_existing_subsystems(char ***klist, char ***nlist) +{ + __do_free char *line = NULL; + __do_fclose FILE *f = NULL; + size_t len = 0; + + f = fopen("/proc/self/cgroup", "r"); + if (!f) + return -1; + + while (getline(&line, &len, f) != -1) { + char *p, *p2, *tok; + p = strchr(line, ':'); + if (!p) + continue; + p++; + p2 = strchr(p, ':'); + if (!p2) + continue; + *p2 = '\0'; + + /* If the kernel has cgroup v2 support, then /proc/self/cgroup + * contains an entry of the form: + * + * 0::/some/path + * + * In this case we use "cgroup2" as controller name. + */ + if ((p2 - p) == 0) { + must_append_string(klist, "cgroup2"); + continue; + } + + lxc_iterate_parts(tok, p, ",") { + if (strncmp(tok, "name=", 5) == 0) + must_append_string(nlist, tok); + else + must_append_string(klist, tok); + } + } + + return 0; +} + +static void trim(char *s) +{ + size_t len; + + len = strlen(s); + while ((len > 1) && (s[len - 1] == '\n')) + s[--len] = '\0'; +} + +/* __cg_mount_direct + * + * Mount cgroup hierarchies directly without using bind-mounts. The main + * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting + * cgroups for the LXC_AUTO_CGROUP_FULL option. 
+ */ +static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath) +{ + __do_free char *controllers = NULL; + char *fstype = "cgroup2"; + unsigned long flags = 0; + int ret; + + flags |= MS_NOSUID; + flags |= MS_NOEXEC; + flags |= MS_NODEV; + flags |= MS_RELATIME; + + if (h->version != CGROUP2_SUPER_MAGIC) { + controllers = lxc_string_join(",", (const char **)h->controllers, false); + if (!controllers) + return -ENOMEM; + fstype = "cgroup"; + } + + ret = mount("cgroup", controllerpath, fstype, flags, controllers); + if (ret < 0) + return -1; + + return 0; +} + +static inline int cg_mount_cgroup_full(struct hierarchy *h, + const char *controllerpath) +{ + return __cg_mount_direct(h, controllerpath); +} + +static bool cgfsng_mount(struct cgroup_ops *ops, const char *root) +{ + __do_free char *cgroup_root = NULL; + int ret; + bool retval = false; + + if (!ops) + return ret_set_errno(false, ENOENT); + + if (!ops->hierarchies) + return true; + + cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL); + if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) + return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0; + + /* mount tmpfs */ + ret = safe_mount(NULL, cgroup_root, "tmpfs", + MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, + "size=10240k,mode=755", root); + if (ret < 0) + goto on_error; + + for (int i = 0; ops->hierarchies[i]; i++) { + __do_free char *controllerpath = NULL; + struct hierarchy *h = ops->hierarchies[i]; + char *controller = strrchr(h->mountpoint, '/'); + + if (!controller) + continue; + controller++; + + controllerpath = must_make_path(cgroup_root, controller, NULL); + if (dir_exists(controllerpath)) + continue; + + ret = mkdir(controllerpath, 0755); + if (ret < 0) + log_error_errno(goto on_error, errno, + "Error creating cgroup path: %s", + controllerpath); + + ret = cg_mount_cgroup_full( h, controllerpath); + if (ret < 0) + goto on_error; + } + retval = true; + +on_error: + return retval; +} + +static int 
recursive_count_nrtasks(char *dirname) +{ + __do_free char *path = NULL; + __do_closedir DIR *dir = NULL; + struct dirent *direntp; + int count = 0, ret; + + dir = opendir(dirname); + if (!dir) + return 0; + + while ((direntp = readdir(dir))) { + struct stat mystat; + + if (!strcmp(direntp->d_name, ".") || + !strcmp(direntp->d_name, "..")) + continue; + + path = must_make_path(dirname, direntp->d_name, NULL); + + if (lstat(path, &mystat)) + continue; + + if (!S_ISDIR(mystat.st_mode)) + continue; + + count += recursive_count_nrtasks(path); + } + + path = must_make_path(dirname, "cgroup.procs", NULL); + ret = lxc_count_file_lines(path); + if (ret != -1) + count += ret; + + return count; +} + +static int cgfsng_nrtasks(struct cgroup_ops *ops) +{ + __do_free char *path = NULL; + + if (!ops) + return ret_set_errno(-1, ENOENT); + + if (!ops->container_cgroup || !ops->hierarchies) + return ret_set_errno(-1, EINVAL); + + path = must_make_path(ops->hierarchies[0]->container_full_path, NULL); + return recursive_count_nrtasks(path); +} + +static int cgfsng_num_hierarchies(struct cgroup_ops *ops) +{ + int i = 0; + + if (!ops) + return ret_set_errno(-1, ENOENT); + + if (!ops->hierarchies) + return 0; + + for (; ops->hierarchies[i]; i++) + ; + + return i; +} + +static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out) +{ + int i; + + if (!ops) + return ret_set_errno(false, ENOENT); + + if (!ops->hierarchies) + return false; + + /* sanity check n */ + for (i = 0; i < n; i++) + if (!ops->hierarchies[i]) + return ret_set_errno(false, ENOENT); + + *out = ops->hierarchies[i]->controllers; + + return true; +} + +/* At startup, parse_hierarchies finds all the info we need about cgroup + * mountpoints and current cgroups, and stores it in @d. 
+ */ +static int cg_hybrid_init(struct cgroup_ops *ops) +{ + __do_free char *basecginfo = NULL; + __do_free char *line = NULL; + __do_fclose FILE *f = NULL; + int ret; + size_t len = 0; + char **klist = NULL, **nlist = NULL; + + /* Root spawned containers escape the current cgroup, so use init's + * cgroups as our base in that case. + */ + basecginfo = read_file("/proc/1/cgroup"); + if (!basecginfo) + return ret_set_errno(-1, ENOMEM); + + ret = get_existing_subsystems(&klist, &nlist); + if (ret < 0) + return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers"); + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\""); + + while (getline(&line, &len, f) != -1) { + int type; + struct hierarchy *new; + char *base_cgroup = NULL, *mountpoint = NULL; + char **controller_list = NULL; + __do_free char *controllers = NULL; + + type = get_cgroup_version(line); + if (type == 0) + continue; + + if (type == CGROUP2_SUPER_MAGIC && ops->unified) + continue; + + if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) { + if (type == CGROUP2_SUPER_MAGIC) + ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; + else if (type == CGROUP_SUPER_MAGIC) + ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; + } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) { + if (type == CGROUP_SUPER_MAGIC) + ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; + } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { + if (type == CGROUP2_SUPER_MAGIC) + ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; + } + + controller_list = cg_hybrid_get_controllers(klist, nlist, line, + type, &controllers); + if (!controller_list && type == CGROUP_SUPER_MAGIC) + continue; + + if (type == CGROUP_SUPER_MAGIC) + if (controller_list_is_dup(ops->hierarchies, controller_list)) + ret_set_errno(goto next, EEXIST); + + mountpoint = cg_hybrid_get_mountpoint(line); + if (!mountpoint) + log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from 
\"%s\"", line); + + if (type == CGROUP_SUPER_MAGIC) { + base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC); + } else { + base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC); + } + if (!base_cgroup) + log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint); + + trim(base_cgroup); + prune_init_scope(base_cgroup); + + if (type == CGROUP2_SUPER_MAGIC) { + char *cgv2_ctrl_path; + + cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup, + "cgroup.controllers", + NULL); + + controller_list = cg_unified_get_controllers(cgv2_ctrl_path); + free(cgv2_ctrl_path); + if (!controller_list) + controller_list = cg_unified_make_empty_controller(); + } + + new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type); + new->__controllers = move_ptr(controllers); + if (type == CGROUP2_SUPER_MAGIC && !ops->unified) + ops->unified = new; + + continue; + + next: + free_string_list(controller_list); + free(mountpoint); + free(base_cgroup); + } + + free_string_list(klist); + free_string_list(nlist); + + return 0; +} + +static int cg_unified_init(struct cgroup_ops *ops) +{ + __do_free char *subtree_path = NULL; + int ret; + char *mountpoint; + char **delegatable; + struct hierarchy *new; + char *base_cgroup = NULL; + + ret = unified_cgroup_hierarchy(); + if (ret == -ENOMEDIUM) + return ret_errno(ENOMEDIUM); + + if (ret != CGROUP2_SUPER_MAGIC) + return 0; + + base_cgroup = cg_unified_get_current_cgroup(1); + if (!base_cgroup) + return ret_errno(EINVAL); + prune_init_scope(base_cgroup); + + /* + * We assume that the cgroup we're currently in has been delegated to + * us and we are free to further delege all of the controllers listed + * in cgroup.controllers further down the hierarchy. 
+ */ + mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT); + subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL); + delegatable = cg_unified_get_controllers(subtree_path); + if (!delegatable) + delegatable = cg_unified_make_empty_controller(); + + /* TODO: If the user requested specific controllers via lxc.cgroup.use + * we should verify here. The reason I'm not doing it right is that I'm + * not convinced that lxc.cgroup.use will be the future since it is a + * global property. I much rather have an option that lets you request + * controllers per container. + */ + + new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC); + + if (bpf_devices_cgroup_supported()) + new->bpf_device_controller = 1; + + ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; + ops->unified = new; + return CGROUP2_SUPER_MAGIC; +} + +static int cg_init(struct cgroup_ops *ops) +{ + int ret; + + ret = cg_unified_init(ops); + if (ret < 0) + return -1; + + if (ret == CGROUP2_SUPER_MAGIC) + return 0; + + return cg_hybrid_init(ops); +} + +struct cgroup_ops *cgfsng_ops_init(void) +{ + __do_free struct cgroup_ops *cgfsng_ops = NULL; + + cgfsng_ops = malloc(sizeof(struct cgroup_ops)); + if (!cgfsng_ops) + return ret_set_errno(NULL, ENOMEM); + + memset(cgfsng_ops, 0, sizeof(struct cgroup_ops)); + cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; + + if (cg_init(cgfsng_ops)) + return NULL; + + cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies; + cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies; + cgfsng_ops->get_hierarchy = get_hierarchy; + cgfsng_ops->driver = "cgfsng"; + cgfsng_ops->version = "1.0.0"; + cgfsng_ops->mount = cgfsng_mount; + cgfsng_ops->nrtasks = cgfsng_nrtasks; + + return move_ptr(cgfsng_ops); +} diff --git a/cgroups/cgroup.c b/cgroups/cgroup.c new file mode 100644 index 0000000..aebafbd --- /dev/null +++ b/cgroups/cgroup.c @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef _GNU_SOURCE 
+#define _GNU_SOURCE 1 +#endif +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +#include "cgroup.h" +#include "cgroup2_devices.h" + +extern struct cgroup_ops *cgfsng_ops_init(void); + +struct cgroup_ops *cgroup_init(void) +{ + struct cgroup_ops *cgroup_ops; + + cgroup_ops = cgfsng_ops_init(); + if (!cgroup_ops) + return log_error_errno(NULL, errno, "Failed to initialize cgroup driver"); + + return cgroup_ops; +} + +void cgroup_exit(struct cgroup_ops *ops) +{ + struct hierarchy **it; + + if (!ops) + return; + + free(ops->container_cgroup); + free(ops->monitor_cgroup); + + for (it = ops->hierarchies; it && *it; it++) { + char **p; + + for (p = (*it)->controllers; p && *p; p++) + free(*p); + free((*it)->controllers); + free((*it)->__controllers); + + if ((*it)->fd >= 0) + close((*it)->fd); + + free((*it)->mountpoint); + free((*it)->container_base_path); + free((*it)->container_full_path); + free((*it)->monitor_full_path); + free(*it); + } + free(ops->hierarchies); + + free(ops); + + return; +} + +#define INIT_SCOPE "/init.scope" +void prune_init_scope(char *cg) +{ + char *point; + + if (!cg) + return; + + point = cg + strlen(cg) - strlen(INIT_SCOPE); + if (point < cg) + return; + + if (strcmp(point, INIT_SCOPE) == 0) { + if (point == cg) + *(point + 1) = '\0'; + else + *point = '\0'; + } +} diff --git a/cgroups/cgroup.h b/cgroups/cgroup.h new file mode 100644 index 0000000..8895533 --- /dev/null +++ b/cgroups/cgroup.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef __LXC_CGROUP_H +#define __LXC_CGROUP_H + +#include <stdbool.h> +#include <stddef.h> +#include <sys/types.h> + +#include "macro.h" + +#define DEFAULT_CGROUP_MOUNTPOINT "/sys/fs/cgroup" + +typedef enum { + CGROUP_LAYOUT_UNKNOWN = -1, + CGROUP_LAYOUT_LEGACY = 0, + CGROUP_LAYOUT_HYBRID = 1, + CGROUP_LAYOUT_UNIFIED = 2, +} cgroup_layout_t; + +/* A descriptor for a mounted hierarchy + * + * @controllers + * - legacy hierarchy + * Either NULL, or a 
null-terminated list of all the co-mounted controllers. + * - unified hierarchy + * Either NULL, or a null-terminated list of all enabled controllers. + * + * @mountpoint + * - The mountpoint we will use. + * - legacy hierarchy + * It will be either /sys/fs/cgroup/controller or + * /sys/fs/cgroup/controllerlist. + * - unified hierarchy + * It will either be /sys/fs/cgroup or /sys/fs/cgroup/<mountpoint-name> + * depending on whether this is a hybrid cgroup layout (mix of legacy and + * unified hierarchies) or a pure unified cgroup layout. + * + * @container_base_path + * - The cgroup under which the container cgroup path + * is created. This will be either the caller's cgroup (if not root), or + * init's cgroup (if root). + * + * @container_full_path + * - The full path to the containers cgroup. + * + * @monitor_full_path + * - The full path to the monitor's cgroup. + * + * @version + * - legacy hierarchy + * If the hierarchy is a legacy hierarchy this will be set to + * CGROUP_SUPER_MAGIC. + * - unified hierarchy + * If the hierarchy is a unified hierarchy this will be set to + * CGROUP2_SUPER_MAGIC. + */ +struct hierarchy { + /* + * cgroup2 only: what files need to be chowned to delegate a cgroup to + * an unprivileged user. + */ + char **controllers; + char *__controllers; + char *mountpoint; + char *container_base_path; + char *container_full_path; + char *monitor_full_path; + int version; + + /* cgroup2 only */ + unsigned int bpf_device_controller:1; + int fd; +}; + +struct cgroup_ops { + /* string constant */ + const char *driver; + + /* string constant */ + const char *version; + + /* What controllers is the container supposed to use. */ + char *container_cgroup; + char *monitor_cgroup; + + /* @hierarchies + * - A NULL-terminated array of struct hierarchy, one per legacy + * hierarchy. No duplicates. First sufficient, writeable mounted + * hierarchy wins. + */ + struct hierarchy **hierarchies; + /* Pointer to the unified hierarchy. Do not free! 
*/ + struct hierarchy *unified; + + /* + * @cgroup_layout + * - What cgroup layout the container is running with. + * - CGROUP_LAYOUT_UNKNOWN + * The cgroup layout could not be determined. This should be treated + * as an error condition. + * - CGROUP_LAYOUT_LEGACY + * The container is running with all controllers mounted into legacy + * cgroup hierarchies. + * - CGROUP_LAYOUT_HYBRID + * The container is running with at least one controller mounted + * into a legacy cgroup hierarchy and a mountpoint for the unified + * hierarchy. The unified hierarchy can be empty (no controllers + * enabled) or non-empty (controllers enabled). + * - CGROUP_LAYOUT_UNIFIED + * The container is running on a pure unified cgroup hierarchy. The + * unified hierarchy can be empty (no controllers enabled) or + * non-empty (controllers enabled). + */ + cgroup_layout_t cgroup_layout; + + int (*num_hierarchies)(struct cgroup_ops *ops); + bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out); + bool (*mount)(struct cgroup_ops *ops, const char *root); + int (*nrtasks)(struct cgroup_ops *ops); + struct hierarchy *(*get_hierarchy)(struct cgroup_ops *ops, + const char *controller); +}; + +extern struct cgroup_ops *cgroup_init(void); +extern void cgroup_exit(struct cgroup_ops *ops); + +extern void prune_init_scope(char *cg); + +static inline void __auto_cgroup_exit__(struct cgroup_ops **ops) +{ + if (*ops) + cgroup_exit(*ops); +} + +extern int cgroup_attach(const char *name, const char *lxcpath, int64_t pid); + +#define __do_cgroup_exit __attribute__((__cleanup__(__auto_cgroup_exit__))) + +static inline bool pure_unified_layout(const struct cgroup_ops *ops) +{ + return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED; +} + +static inline bool is_unified_hierarchy(const struct hierarchy *h) +{ + return h->version == CGROUP2_SUPER_MAGIC; +} + +#endif diff --git a/cgroups/cgroup2_devices.c b/cgroups/cgroup2_devices.c new file mode 100644 index 0000000..92df160 --- /dev/null +++ 
b/cgroups/cgroup2_devices.c @@ -0,0 +1,457 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +/* Parts of this taken from systemd's implementation. */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <unistd.h> + +#include "cgroup2_devices.h" +#include "macro.h" +#include "memory_utils.h" + +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +#include <linux/bpf.h> +#include <linux/filter.h> + +static int bpf_program_add_instructions(struct bpf_program *prog, + const struct bpf_insn *instructions, + size_t count) +{ + + struct bpf_insn *new_insn; + + if (prog->kernel_fd >= 0) + return log_error_errno(-1, EBUSY, "Refusing to update bpf cgroup program that's already loaded"); + + new_insn = realloc(prog->instructions, sizeof(struct bpf_insn) * (count + prog->n_instructions)); + if (!new_insn) + return log_error_errno(-1, ENOMEM, "Failed to reallocate bpf cgroup program"); + + prog->instructions = new_insn; + memcpy(prog->instructions + prog->n_instructions, instructions, + sizeof(struct bpf_insn) * count); + prog->n_instructions += count; + + return 0; +} + +void bpf_program_free(struct bpf_program *prog) +{ + if (!prog) + return; + + (void)bpf_program_cgroup_detach(prog); + + if (prog->kernel_fd >= 0) + close(prog->kernel_fd); + free(prog->instructions); + free(prog->attached_path); + free(prog); +} + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn){.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0}) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn){.code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM}) + +/* Short form of mov, 
dst_reg = src_reg */ +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn){.code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM}) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn){.code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0}) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0}) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM}) + +/* Program exit */ +#define BPF_EXIT_INSN() \ + ((struct bpf_insn){.code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0}) + +static int bpf_access_mask(const char *acc) +{ + int mask = 0; + + if (!acc) + return mask; + + for (; *acc; acc++) + switch (*acc) { + case 'r': + mask |= BPF_DEVCG_ACC_READ; + break; + case 'w': + mask |= BPF_DEVCG_ACC_WRITE; + break; + case 'm': + mask |= BPF_DEVCG_ACC_MKNOD; + break; + default: + return -EINVAL; + } + + return mask; +} + +static int bpf_device_type(char type) +{ + switch (type) { + case 'a': + return 0; + case 'b': + return BPF_DEVCG_DEV_BLOCK; + case 'c': + return BPF_DEVCG_DEV_CHAR; + } + + return -1; +} + +static inline bool bpf_device_all_access(int access_mask) +{ + return (access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | + BPF_DEVCG_ACC_MKNOD)); +} + +struct bpf_program *bpf_program_new(uint32_t prog_type) +{ + __do_free struct bpf_program *prog = NULL; + + prog = calloc(1, sizeof(struct bpf_program)); + if (!prog) + return NULL; + + prog->prog_type = prog_type; + prog->kernel_fd = -EBADF; + /* + * By default a 
whitelist is used unless the user tells us otherwise. + */ + prog->device_list_type = LXC_BPF_DEVICE_CGROUP_WHITELIST; + + return move_ptr(prog); +} + +int bpf_program_init(struct bpf_program *prog) +{ + if (!prog) + return ret_set_errno(-1, EINVAL); + + const struct bpf_insn pre_insn[] = { + /* load device type to r2 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF), + + /* load access type to r3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), + + /* load major number to r4 */ + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, major)), + + /* load minor number to r5 */ + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, minor)), + }; + + return bpf_program_add_instructions(prog, pre_insn, ARRAY_SIZE(pre_insn)); +} + +int bpf_program_append_device(struct bpf_program *prog, struct device_item *device) +{ + int ret; + int jump_nr = 1; + struct bpf_insn bpf_access_decision[] = { + BPF_MOV64_IMM(BPF_REG_0, device->allow), + BPF_EXIT_INSN(), + }; + int access_mask; + int device_type; + + if (!prog || !device) + return ret_set_errno(-1, EINVAL); + + /* This is a global rule so no need to append anything. 
*/ + if (device->global_rule > LXC_BPF_DEVICE_CGROUP_LOCAL_RULE) { + prog->device_list_type = device->global_rule; + return 0; + } + + device_type = bpf_device_type(device->type); + if (device_type < 0) + return log_error_errno(-1, EINVAL, "Invalid bpf cgroup device type %c", device->type); + + if (device_type > 0) + jump_nr++; + + access_mask = bpf_access_mask(device->access); + if (!bpf_device_all_access(access_mask)) + jump_nr += 3; + + if (device->major != -1) + jump_nr++; + + if (device->minor != -1) + jump_nr++; + + if (device_type > 0) { + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--), + }; + + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + } + + if (!bpf_device_all_access(access_mask)) { + struct bpf_insn ins[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr), + }; + + jump_nr -= 3; + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + } + + if (device->major >= 0) { + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, device->major, jump_nr--), + }; + + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + } + + if (device->minor >= 0) { + struct bpf_insn ins[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_5, device->minor, jump_nr--), + }; + + ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); + if (ret) + return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program"); + } + + ret = bpf_program_add_instructions(prog, bpf_access_decision, + ARRAY_SIZE(bpf_access_decision)); + if (ret) + return log_error_errno(-1, errno, "Failed to add instructions to 
bpf cgroup program"); + + return 0; +} + +int bpf_program_finalize(struct bpf_program *prog) +{ + struct bpf_insn ins[] = { + BPF_MOV64_IMM(BPF_REG_0, prog->device_list_type), + BPF_EXIT_INSN(), + }; + + if (!prog) + return ret_set_errno(-1, EINVAL); + + TRACE("Implementing %s bpf device cgroup program", + prog->device_list_type == LXC_BPF_DEVICE_CGROUP_BLACKLIST + ? "blacklist" + : "whitelist"); + return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)); +} + +static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf, + size_t log_size) +{ + union bpf_attr attr; + + if (prog->kernel_fd >= 0) { + memset(log_buf, 0, log_size); + return 0; + } + + attr = (union bpf_attr){ + .prog_type = prog->prog_type, + .insns = PTR_TO_UINT64(prog->instructions), + .insn_cnt = prog->n_instructions, + .license = PTR_TO_UINT64("GPL"), + .log_buf = PTR_TO_UINT64(log_buf), + .log_level = !!log_buf, + .log_size = log_size, + }; + + prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (prog->kernel_fd < 0) + return log_error_errno(-1, errno, "Failed to load bpf program"); + + return 0; +} + +int bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags) +{ + __do_free char *copy = NULL; + __do_close_prot_errno int fd = -EBADF; + union bpf_attr attr; + int ret; + + if (!prog) + return ret_set_errno(-1, EINVAL); + + if (flags & ~(BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI)) + return log_error_errno(-1, EINVAL, "Invalid flags for bpf program"); + + if (prog->attached_path) { + if (prog->attached_type != type) + return log_error_errno(-1, EBUSY, "Wrong type for bpf program"); + + if (prog->attached_flags != flags) + return log_error_errno(-1, EBUSY, "Wrong flags for bpf program"); + + if (flags != BPF_F_ALLOW_OVERRIDE) + return true; + } + + ret = bpf_program_load_kernel(prog, NULL, 0); + if (ret < 0) + return log_error_errno(-1, ret, "Failed to load bpf program"); + + copy = strdup(path); + if (!copy) + return 
log_error_errno(-1, ENOMEM, "Failed to duplicate cgroup path %s", path); + + fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (fd < 0) + return log_error_errno(-1, errno, "Failed to open cgroup path %s", path); + + attr = (union bpf_attr){ + .attach_type = type, + .target_fd = fd, + .attach_bpf_fd = prog->kernel_fd, + .attach_flags = flags, + }; + + ret = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); + if (ret < 0) + return log_error_errno(-1, errno, "Failed to attach bpf program"); + + free_replace_move_ptr(prog->attached_path, copy); + prog->attached_type = type; + prog->attached_flags = flags; + + TRACE("Loaded and attached bpf program to cgroup %s", prog->attached_path); + return 0; +} + +int bpf_program_cgroup_detach(struct bpf_program *prog) +{ + int ret; + __do_close_prot_errno int fd = -EBADF; + + if (!prog) + return 0; + + if (!prog->attached_path) + return 0; + + fd = open(prog->attached_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) + return log_error_errno(-1, errno, "Failed to open attach cgroup %s", + prog->attached_path); + } else { + union bpf_attr attr; + + attr = (union bpf_attr){ + .attach_type = prog->attached_type, + .target_fd = fd, + .attach_bpf_fd = prog->kernel_fd, + }; + + ret = bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); + if (ret < 0) + return log_error_errno(-1, errno, "Failed to detach bpf program from cgroup %s", + prog->attached_path); + } + + free(prog->attached_path); + prog->attached_path = NULL; + + return 0; +} + +bool bpf_devices_cgroup_supported(void) +{ + const struct bpf_insn dummy[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + + __do_bpf_program_free struct bpf_program *prog = NULL; + int ret; + + if (geteuid() != 0) + return log_trace(false, + "The bpf device cgroup requires real root"); + + prog = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE); + if (prog < 0) + return log_trace(false, "Failed to allocate new bpf device cgroup program"); + + ret = 
bpf_program_add_instructions(prog, dummy, ARRAY_SIZE(dummy)); + if (ret < 0) + return log_trace(false, "Failed to add new instructions to bpf device cgroup program"); + + ret = bpf_program_load_kernel(prog, NULL, 0); + if (ret < 0) + return log_trace(false, "Failed to load new bpf device cgroup program"); + + return log_trace(true, "The bpf device cgroup is supported"); +} +#endif diff --git a/cgroups/cgroup2_devices.h b/cgroups/cgroup2_devices.h new file mode 100644 index 0000000..4fee779 --- /dev/null +++ b/cgroups/cgroup2_devices.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +/* Parts of this taken from systemd's implementation. */ + +#ifndef __LXC_CGROUP2_DEVICES_H +#define __LXC_CGROUP2_DEVICES_H + +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <unistd.h> + +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +#include <linux/bpf.h> +#include <linux/filter.h> +#endif + +#if !HAVE_BPF +#if !(defined __NR_bpf && __NR_bpf > 0) +#if defined __NR_bpf +#undef __NR_bpf +#endif +#if defined __i386__ +#define __NR_bpf 357 +#elif defined __x86_64__ +#define __NR_bpf 321 +#elif defined __aarch64__ +#define __NR_bpf 280 +#elif defined __arm__ +#define __NR_bpf 386 +#elif defined __sparc__ +#define __NR_bpf 349 +#elif defined __s390__ +#define __NR_bpf 351 +#elif defined __tilegx__ +#define __NR_bpf 280 +#else +#warning "__NR_bpf not defined for your architecture" +#endif +#endif + +union bpf_attr; + +static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size) +{ +#ifdef __NR_bpf + return (int)syscall(__NR_bpf, cmd, attr, size); +#else + errno = ENOSYS; + return -1; +#endif +} + +#define bpf missing_bpf +#endif + +struct bpf_program { + int device_list_type; + int kernel_fd; + uint32_t prog_type; + + size_t n_instructions; +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX + struct bpf_insn *instructions; +#endif + + char 
*attached_path; + int attached_type; + uint32_t attached_flags; +}; + +#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +struct bpf_program *bpf_program_new(uint32_t prog_type); +int bpf_program_init(struct bpf_program *prog); +int bpf_program_append_device(struct bpf_program *prog, + struct device_item *device); +int bpf_program_finalize(struct bpf_program *prog); +int bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags); +int bpf_program_cgroup_detach(struct bpf_program *prog); +void bpf_program_free(struct bpf_program *prog); +bool bpf_devices_cgroup_supported(void); +static inline void __auto_bpf_program_free__(struct bpf_program **prog) +{ + if (*prog) { + bpf_program_free(*prog); + *prog = NULL; + } +} +#else +static inline struct bpf_program *bpf_program_new(uint32_t prog_type) +{ + errno = ENOSYS; + return NULL; +} + +static inline int bpf_program_init(struct bpf_program *prog) +{ + errno = ENOSYS; + return -1; +} + +static inline int bpf_program_append_device(struct bpf_program *prog, char type, + int major, int minor, + const char *access, int allow) +{ + errno = ENOSYS; + return -1; +} + +static inline int bpf_program_finalize(struct bpf_program *prog) +{ + errno = ENOSYS; + return -1; +} + +static inline int bpf_program_cgroup_attach(struct bpf_program *prog, int type, + const char *path, uint32_t flags) +{ + errno = ENOSYS; + return -1; +} + +static inline int bpf_program_cgroup_detach(struct bpf_program *prog) +{ + errno = ENOSYS; + return -1; +} + +static inline void bpf_program_free(struct bpf_program *prog) +{ +} + + +static inline bool bpf_devices_cgroup_supported(void) +{ + return false; +} + +static inline void __auto_bpf_program_free__(struct bpf_program **prog) +{ +} + +#endif + +#define __do_bpf_program_free \ + __attribute__((__cleanup__(__auto_bpf_program_free__))) + +#endif /* __LXC_CGROUP2_DEVICES_H */ diff --git a/cgroups/cgroup_utils.c b/cgroups/cgroup_utils.c new file mode 100644 index 0000000..26e7438 
--- /dev/null +++ b/cgroups/cgroup_utils.c @@ -0,0 +1,726 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include <fcntl.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/vfs.h> +#include <unistd.h> + +#include "cgroup.h" +#include "cgroup_utils.h" +#include "macro.h" +#include "memory_utils.h" + +int get_cgroup_version(char *line) +{ + if (is_cgroupfs_v1(line)) + return CGROUP_SUPER_MAGIC; + + if (is_cgroupfs_v2(line)) + return CGROUP2_SUPER_MAGIC; + + return 0; +} + +bool is_cgroupfs_v1(char *line) +{ + char *p = strstr(line, " - "); + if (!p) + return false; + return strncmp(p, " - cgroup ", 10) == 0; +} + +bool is_cgroupfs_v2(char *line) +{ + char *p = strstr(line, " - "); + if (!p) + return false; + + return strncmp(p, " - cgroup2 ", 11) == 0; +} + +int unified_cgroup_hierarchy(void) +{ + + int ret; + struct statfs fs; + + ret = statfs(DEFAULT_CGROUP_MOUNTPOINT, &fs); + if (ret < 0) + return -ENOMEDIUM; + + if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC)) + return CGROUP2_SUPER_MAGIC; + + return 0; +} + +void *must_realloc(void *orig, size_t sz) +{ + void *ret; + + do { + ret = realloc(orig, sz); + } while (!ret); + + return ret; +} + +char *must_make_path(const char *first, ...) 
+{ + va_list args; + char *cur, *dest; + size_t full_len = strlen(first); + size_t buf_len; + size_t cur_len; + + dest = must_copy_string(first); + cur_len = full_len; + + va_start(args, first); + while ((cur = va_arg(args, char *)) != NULL) { + buf_len = strlen(cur); + + full_len += buf_len; + if (cur[0] != '/') + full_len++; + + dest = must_realloc(dest, full_len + 1); + + if (cur[0] != '/') { + memcpy(dest + cur_len, "/", 1); + cur_len++; + } + + memcpy(dest + cur_len, cur, buf_len); + cur_len += buf_len; + } + va_end(args); + + dest[cur_len] = '\0'; + return dest; +} + +bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val) +{ + return (fs->f_type == (fs_type_magic)magic_val); +} + +char *must_copy_string(const char *entry) +{ + char *ret; + + if (!entry) + return NULL; + + do { + ret = strdup(entry); + } while (!ret); + + return ret; +} + +char *lxc_string_join(const char *sep, const char **parts, bool use_as_prefix) +{ + char *result; + char **p; + size_t sep_len = strlen(sep); + size_t result_len = use_as_prefix * sep_len; + size_t buf_len; + + /* calculate new string length */ + for (p = (char **)parts; *p; p++) + result_len += (p > (char **)parts) * sep_len + strlen(*p); + + buf_len = result_len + 1; + result = calloc(buf_len, 1); + if (!result) + return NULL; + + if (use_as_prefix) + (void)strlcpy(result, sep, buf_len); + + for (p = (char **)parts; *p; p++) { + if (p > (char **)parts) + (void)strlcat(result, sep, buf_len); + + (void)strlcat(result, *p, buf_len); + } + + return result; +} + +int lxc_count_file_lines(const char *fn) +{ + FILE *f; + char *line = NULL; + size_t sz = 0; + int n = 0; + + f = fopen_cloexec(fn, "r"); + if (!f) + return -1; + + while (getline(&line, &sz, f) != -1) { + n++; + } + + free(line); + fclose(f); + return n; +} + +bool dir_exists(const char *path) +{ + struct stat sb; + int ret; + + ret = stat(path, &sb); + if (ret < 0) + /* Could be something other than eexist, just say "no". 
*/ + return false; + + return S_ISDIR(sb.st_mode); +} + +/* + * @path: a pathname where / replaced with '\0'. + * @offsetp: pointer to int showing which path segment was last seen. + * Updated on return to reflect the next segment. + * @fulllen: full original path length. + * Returns a pointer to the next path segment, or NULL if done. + */ +static char *get_nextpath(char *path, int *offsetp, int fulllen) +{ + int offset = *offsetp; + + if (offset >= fulllen) + return NULL; + + while (offset < fulllen && path[offset] != '\0') + offset++; + + while (offset < fulllen && path[offset] == '\0') + offset++; + + *offsetp = offset; + + return (offset < fulllen) ? &path[offset] : NULL; +} + +/* + * Check that @subdir is a subdir of @dir. @len is the length of + * @dir (to avoid having to recalculate it). + */ +static bool is_subdir(const char *subdir, const char *dir, size_t len) +{ + size_t subdirlen = strlen(subdir); + + if (subdirlen < len) + return false; + + if (strncmp(subdir, dir, len) != 0) + return false; + + if (dir[len-1] == '/') + return true; + + if (subdir[len] == '/' || subdirlen == len) + return true; + + return false; +} + +/* + * Check if the open fd is a symlink. Return -ELOOP if it is. Return + * -ENOENT if we couldn't fstat. Return 0 if the fd is ok. + */ +static int check_symlink(int fd) +{ + struct stat sb; + int ret; + + ret = fstat(fd, &sb); + if (ret < 0) + return -ENOENT; + + if (S_ISLNK(sb.st_mode)) + return -ELOOP; + + return 0; +} + +/* + * Open a file or directory, provided that it contains no symlinks. + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + */ +static int open_if_safe(int dirfd, const char *nextpath) +{ + int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW); + if (newfd >= 0) /* Was not a symlink, all good. 
*/ + return newfd; + + if (errno == ELOOP) + return newfd; + + if (errno == EPERM || errno == EACCES) { + /* We're not root (cause we got EPERM) so try opening with + * O_PATH. + */ + newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW); + if (newfd >= 0) { + /* O_PATH will return an fd for symlinks. We know + * nextpath wasn't a symlink at last openat, so if fd is + * now a link, then something * fishy is going on. + */ + int ret = check_symlink(newfd); + if (ret < 0) { + close(newfd); + newfd = ret; + } + } + } + + return newfd; +} + +/* + * Open a path intending for mounting, ensuring that the final path + * is inside the container's rootfs. + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + * + * @target: path to be opened + * @prefix_skip: a part of @target in which to ignore symbolic links. This + * would be the container's rootfs. + * + * Return an open fd for the path, or <0 on error. + */ +static int open_without_symlink(const char *target, const char *prefix_skip) +{ + int curlen = 0, dirfd, fulllen, i; + char *dup; + + fulllen = strlen(target); + + /* make sure prefix-skip makes sense */ + if (prefix_skip && strlen(prefix_skip) > 0) { + curlen = strlen(prefix_skip); + if (!is_subdir(target, prefix_skip, curlen)) + return -EINVAL; + + /* + * get_nextpath() expects the curlen argument to be + * on a (turned into \0) / or before it, so decrement + * curlen to make sure that happens + */ + if (curlen) + curlen--; + } else { + prefix_skip = "/"; + curlen = 0; + } + + /* Make a copy of target which we can hack up, and tokenize it */ + if ((dup = strdup(target)) == NULL) + return -ENOMEM; + + for (i = 0; i < fulllen; i++) { + if (dup[i] == '/') + dup[i] = '\0'; + } + + dirfd = open(prefix_skip, O_RDONLY); + if (dirfd < 0) + goto out; + + for (;;) { + int newfd, saved_errno; + char *nextpath; + + if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL) + goto out; + + newfd = 
open_if_safe(dirfd, nextpath); + saved_errno = errno; + close(dirfd); + + dirfd = newfd; + if (newfd < 0) { + errno = saved_errno; + goto out; + } + } + +out: + free(dup); + return dirfd; +} + +/* + * Safely mount a path into a container, ensuring that the mount target + * is under the container's @rootfs. (If @rootfs is NULL, then the container + * uses the host's /) + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + */ +int safe_mount(const char *src, const char *dest, const char *fstype, + unsigned long flags, const void *data, const char *rootfs) +{ + int destfd, ret, saved_errno; + /* Only needs enough for /proc/self/fd/<fd>. */ + char srcbuf[50], destbuf[50]; + int srcfd = -1; + const char *mntsrc = src; + + if (!rootfs) + rootfs = ""; + + /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */ + if (flags & MS_BIND && src && src[0] != '/') { + + srcfd = open_without_symlink(src, NULL); + if (srcfd < 0) + return srcfd; + + ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd); + if (ret < 0 || ret >= (int)sizeof(srcbuf)) { + close(srcfd); + return -EINVAL; + } + mntsrc = srcbuf; + } + + destfd = open_without_symlink(dest, rootfs); + if (destfd < 0) { + if (srcfd != -1) { + saved_errno = errno; + close(srcfd); + errno = saved_errno; + } + + return destfd; + } + + ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd); + if (ret < 0 || ret >= (int)sizeof(destbuf)) { + if (srcfd != -1) + close(srcfd); + + close(destfd); + return -EINVAL; + } + + ret = mount(mntsrc, destbuf, fstype, flags, data); + saved_errno = errno; + if (srcfd != -1) + close(srcfd); + + close(destfd); + if (ret < 0) { + errno = saved_errno; + return ret; + } + + return 0; +} + +#ifndef HAVE_STRLCPY +size_t strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? 
size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + + return ret; +} +#endif + +#ifndef HAVE_STRLCAT +size_t strlcat(char *d, const char *s, size_t n) +{ + size_t l = strnlen(d, n); + if (l == n) + return l + strlen(s); + + return l + strlcpy(d + l, s, n - l); +} +#endif + +FILE *fopen_cloexec(const char *path, const char *mode) +{ + int open_mode = 0; + int step = 0; + int fd; + int saved_errno = 0; + FILE *ret; + + if (!strncmp(mode, "r+", 2)) { + open_mode = O_RDWR; + step = 2; + } else if (!strncmp(mode, "r", 1)) { + open_mode = O_RDONLY; + step = 1; + } else if (!strncmp(mode, "w+", 2)) { + open_mode = O_RDWR | O_TRUNC | O_CREAT; + step = 2; + } else if (!strncmp(mode, "w", 1)) { + open_mode = O_WRONLY | O_TRUNC | O_CREAT; + step = 1; + } else if (!strncmp(mode, "a+", 2)) { + open_mode = O_RDWR | O_CREAT | O_APPEND; + step = 2; + } else if (!strncmp(mode, "a", 1)) { + open_mode = O_WRONLY | O_CREAT | O_APPEND; + step = 1; + } + for (; mode[step]; step++) + if (mode[step] == 'x') + open_mode |= O_EXCL; + open_mode |= O_CLOEXEC; + + fd = open(path, open_mode, 0660); + if (fd < 0) + return NULL; + + ret = fdopen(fd, mode); + saved_errno = errno; + if (!ret) + close(fd); + errno = saved_errno; + return ret; +} + +/* Given a multi-line string, return a null-terminated copy of the current line. 
*/ +static char *copy_to_eol(char *p) +{ + char *p2 = strchr(p, '\n'), *sret; + size_t len; + + if (!p2) + return NULL; + + len = p2 - p; + sret = must_realloc(NULL, len + 1); + memcpy(sret, p, len); + sret[len] = '\0'; + return sret; +} + +static void batch_realloc(char **mem, size_t oldlen, size_t newlen) +{ + int newbatches = (newlen / BATCH_SIZE) + 1; + int oldbatches = (oldlen / BATCH_SIZE) + 1; + + if (!*mem || newbatches > oldbatches) { + *mem = must_realloc(*mem, newbatches * BATCH_SIZE); + } +} + +void append_line(char **dest, size_t oldlen, char *new, size_t newlen) +{ + size_t full = oldlen + newlen; + + batch_realloc(dest, oldlen, full + 1); + + memcpy(*dest + oldlen, new, newlen + 1); +} + +static inline void drop_trailing_newlines(char *s) +{ + int l; + + for (l = strlen(s); l > 0 && s[l - 1] == '\n'; l--) + s[l - 1] = '\0'; +} + +/* Slurp in a whole file */ +char *read_file(const char *fnam) +{ + __do_free char *line = NULL; + __do_fclose FILE *f = NULL; + int linelen; + char *buf = NULL; + size_t len = 0, fulllen = 0; + + f = fopen(fnam, "r"); + if (!f) + return NULL; + while ((linelen = getline(&line, &len, f)) != -1) { + append_line(&buf, fulllen, line, linelen); + fulllen += linelen; + } + return buf; +} + +char *read_file_strip_newline(const char *fnam) +{ + char *buf; + + buf = read_file(fnam); + if (buf) + drop_trailing_newlines(buf); + return buf; +} + +/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. 
*/ +char *cg_unified_get_current_cgroup(pid_t pid) +{ + __do_free char *basecginfo = NULL; + char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1]; + char *base_cgroup; + + snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ?: 1); + basecginfo = read_file(path); + if (!basecginfo) + return NULL; + + base_cgroup = strstr(basecginfo, "0::/"); + if (!base_cgroup) + return NULL; + + base_cgroup = base_cgroup + 3; + return copy_to_eol(base_cgroup); +} + +/* cgline: pointer to character after the first ':' in a line in a \n-terminated + * /proc/self/cgroup file. Check whether controller c is present. + */ +static bool controller_in_clist(char *cgline, const char *c) +{ + __do_free char *tmp = NULL; + char *tok, *eol; + size_t len; + + eol = strchr(cgline, ':'); + if (!eol) + return false; + + len = eol - cgline; + tmp = must_realloc(NULL, len + 1); + memcpy(tmp, cgline, len); + tmp[len] = '\0'; + + lxc_iterate_parts(tok, tmp, ",") + if (strcmp(tok, c) == 0) + return true; + + return false; +} + +/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for + * @controller. 
+ */ +char *cg_hybrid_get_current_cgroup(char *basecginfo, const char *controller, int type) +{ + char *p = basecginfo; + + for (;;) { + bool is_cgv2_base_cgroup = false; + + /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */ + if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0')) + is_cgv2_base_cgroup = true; + + p = strchr(p, ':'); + if (!p) + return NULL; + p++; + + if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) { + p = strchr(p, ':'); + if (!p) + return NULL; + p++; + return copy_to_eol(p); + } + + p = strchr(p, '\n'); + if (!p) + return NULL; + p++; + } +} + +char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller) +{ + __do_free char *basecginfo = NULL; + char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1]; + + snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ?: 1); + basecginfo = read_file(path); + if (!basecginfo) + return ret_set_errno(NULL, ENOMEM); + + return cg_hybrid_get_current_cgroup(basecginfo, controller, + CGROUP_SUPER_MAGIC); +} + + +char *readat_file(int dirfd, const char *path) +{ + __do_close_prot_errno int fd = -EBADF; + __do_free char *line = NULL; + __do_fclose FILE *f = NULL; + char *buf = NULL; + size_t len = 0, fulllen = 0; + ssize_t linelen; + + fd = openat(dirfd, path, O_NOFOLLOW | O_RDONLY | O_CLOEXEC); + if (fd < 0) + return NULL; + + /* transfer ownership of fd */ + f = fdopen(move_fd(fd), "re"); + if (!f) + return NULL; + + while ((linelen = getline(&line, &len, f)) != -1) { + append_line(&buf, fulllen, line, linelen); + fulllen += linelen; + } + + if (buf) + drop_trailing_newlines(buf); + + return buf; +} + +bool mkdir_p(const char *dir, mode_t mode) +{ + const char *tmp = dir; + const char *orig = dir; + char *makeme; + + do { + dir = tmp + strspn(tmp, "/"); + tmp = dir + strcspn(dir, "/"); + makeme = strndup(orig, dir - orig); + if (!makeme) + return false; + if (mkdir(makeme, mode) && errno != EEXIST) { + lxcfs_error("Failed to create directory 
'%s': %s.\n", + makeme, strerror(errno)); + free(makeme); + return false; + } + free(makeme); + } while(tmp != dir); + + return true; +} diff --git a/cgroups/cgroup_utils.h b/cgroups/cgroup_utils.h new file mode 100644 index 0000000..d4df757 --- /dev/null +++ b/cgroups/cgroup_utils.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#ifndef __LXC_CGROUP_UTILS_H +#define __LXC_CGROUP_UTILS_H + +#include <stdbool.h> +#include <stdio.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/vfs.h> + +/* Retrieve the cgroup version of a given entry from /proc/<pid>/mountinfo. */ +extern int get_cgroup_version(char *line); + +/* Check if given entry from /proc/<pid>/mountinfo is a cgroupfs v1 mount. */ +extern bool is_cgroupfs_v1(char *line); + +/* Check if given entry from /proc/<pid>/mountinfo is a cgroupfs v2 mount. */ +extern bool is_cgroupfs_v2(char *line); + +/* Given a v1 hierarchy @mountpoint and base @path, verify that we can create + * directories underneath it. + */ +extern bool test_writeable_v1(char *mountpoint, char *path); + +/* Given a v2 hierarchy @mountpoint and base @path, verify that we can create + * directories underneath it and that we have write access to the cgroup's + * "cgroup.procs" file. + */ +extern bool test_writeable_v2(char *mountpoint, char *path); + +extern int unified_cgroup_hierarchy(void); + +extern void *must_realloc(void *orig, size_t sz); + +extern char *must_make_path(const char *first, ...); + +extern char *must_copy_string(const char *entry); + +/* __typeof__ should be safe to use with all compilers. 
*/ +typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; +extern bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val); + +extern char *lxc_string_join(const char *sep, const char **parts, + bool use_as_prefix); +extern int lxc_count_file_lines(const char *fn); + +extern bool dir_exists(const char *path); + +extern int safe_mount(const char *src, const char *dest, const char *fstype, + unsigned long flags, const void *data, const char *rootfs); + +#ifndef HAVE_STRLCPY +extern size_t strlcpy(char *, const char *, size_t); +#endif + +#ifndef HAVE_STRLCAT +extern size_t strlcat(char *d, const char *s, size_t n); +#endif + +extern FILE *fopen_cloexec(const char *path, const char *mode); +extern void append_line(char **dest, size_t oldlen, char *new, size_t newlen); +extern char *read_file(const char *fnam); +extern char *readat_file(int fd, const char *path); +extern char *read_file_strip_newline(const char *fnam); +extern char *cg_unified_get_current_cgroup(pid_t pid); +extern char *cg_hybrid_get_current_cgroup(char *basecginfo, + const char *controller, int type); +extern char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller); +extern bool mkdir_p(const char *dir, mode_t mode); + +#endif /* __LXC_CGROUP_UTILS_H */ diff --git a/configure.ac b/configure.ac index 81027cd..63cd934 100644 --- a/configure.ac +++ b/configure.ac @@ -162,4 +162,13 @@ AC_ARG_WITH([rootfs-path], AS_AC_EXPAND(LIBDIR, "$libdir") +AC_CHECK_FUNCS([strlcpy], + AM_CONDITIONAL(HAVE_STRLCPY, true) + AC_DEFINE(HAVE_STRLCPY,1,[Have strlcpy]), + AM_CONDITIONAL(HAVE_STRLCPY, false)) +AC_CHECK_FUNCS([strlcat], + AM_CONDITIONAL(HAVE_STRLCAT, true) + AC_DEFINE(HAVE_STRLCAT,1,[Have strlcat]), + AM_CONDITIONAL(HAVE_STRLCAT, false)) + AC_OUTPUT diff --git a/macro.h b/macro.h index 3e9ef82..4ec3876 100644 --- a/macro.h +++ b/macro.h @@ -1,9 +1,22 @@ #ifndef __LXCFS_MACRO_H #define __LXCFS_MACRO_H +#include <stdio.h> + +#define BATCH_SIZE 50 + +/* filesystem magic values */ 
+#ifndef CGROUP_SUPER_MAGIC +#define CGROUP_SUPER_MAGIC 0x27e0eb +#endif + +#ifndef CGROUP2_SUPER_MAGIC +#define CGROUP2_SUPER_MAGIC 0x63677270 +#endif + #define lxcfs_debug_stream(stream, format, ...) \ do { \ - fprintf(stream, "%s: %d: %s: " format, __FILE__, __LINE__, \ + fprintf(stream, "%s: %d: %s: " format "\n", __FILE__, __LINE__, \ __func__, ##__VA_ARGS__); \ } while (false) @@ -21,4 +34,45 @@ #define lxcfs_v(format, ...) #endif /* VERBOSE */ +#define log_error_errno(__ret__, __errno__, format, ...) \ + ({ \ + errno = __errno__; \ + lxcfs_error(format, ##__VA_ARGS__); \ + __ret__; \ + }) + +#define STRLITERALLEN(x) (sizeof(""x"") - 1) + +/* Calculate the number of chars needed to represent a given integer as a C + * string. Include room for '-' to indicate negative numbers and the \0 byte. + * This is based on systemd. + */ +#define INTTYPE_TO_STRLEN(type) \ + (2 + (sizeof(type) <= 1 \ + ? 3 \ + : sizeof(type) <= 2 \ + ? 5 \ + : sizeof(type) <= 4 \ + ? 10 \ + : sizeof(type) <= 8 \ + ? 
20 \ + : sizeof(int[-2 * (sizeof(type) > 8)]))) + +#define ret_errno(__errno__) \ + ({ \ + errno = __errno__; \ + -__errno__; \ + }) + +#define ret_set_errno(__ret__, __errno__) \ + ({ \ + errno = __errno__; \ + __ret__; \ + }) + +#define lxc_iterate_parts(__iterator, __splitme, __separators) \ + for (char *__p = NULL, *__it = strtok_r(__splitme, __separators, &__p); \ + (__iterator = __it); \ + __iterator = __it = strtok_r(NULL, __separators, &__p)) + #endif /* __LXCFS_MACRO_H */ diff --git a/memory_utils.h b/memory_utils.h index 73e04fc..ac00b10 100644 --- a/memory_utils.h +++ b/memory_utils.h @@ -67,4 +67,6 @@ static inline void __auto_close__(int *fd) __internal_fd__; \ }) +#define zalloc(__size__) (calloc(1, __size__)) + #endif /* __LXCFS_MEMORY_UTILS_H */ diff --git a/sysfs_fuse.c b/sysfs_fuse.c index 32a59b7..d2b187b 100644 --- a/sysfs_fuse.c +++ b/sysfs_fuse.c @@ -65,7 +65,7 @@ static int sys_devices_system_cpu_online_read(char *buf, size_t size, initpid = fc->pid; cg = get_pid_cgroup(initpid, "cpuset"); if (!cg) - return read_file("/sys/devices/system/cpu/online", buf, size, d); + return read_file_fuse("/sys/devices/system/cpu/online", buf, size, d); prune_init_slice(cg); cpuset = get_cpuset(cg); @@ -78,7 +78,7 @@ static int sys_devices_system_cpu_online_read(char *buf, size_t size, max_cpus = max_cpu_count(cg); if (max_cpus == 0) - return read_file("/sys/devices/system/cpu/online", buf, size, d); + return read_file_fuse("/sys/devices/system/cpu/online", buf, size, d); if (max_cpus > 1) total_len = snprintf(d->buf, d->buflen, "0-%d\n", max_cpus - 1); else
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel