[lxc-devel] [lxcfs/master] bindings: add infrastructure for cgroup2 support

brauner on Github Thu, 20 Feb 2020 07:32:17 -0800

The following pull request was submitted through Github.
It can be accessed and reviewed at: https://github.com/lxc/lxcfs/pull/318


This e-mail was sent by the LXC bot; direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===
Mostly based on code I've written for liblxc.

Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>

From 4f8198790acda0337010090255aac90b9f943902 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brau...@ubuntu.com>
Date: Thu, 20 Feb 2020 16:30:47 +0100
Subject: [PATCH] bindings: add infrastructure for cgroup2 support

Mostly based on code I've written for liblxc.

Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com>
---
 Makefile.am               |  10 +-
 bindings.c                | 497 ++++++------------------
 bindings.h                |   4 +-
 cgroups/cgfsng.c          | 787 ++++++++++++++++++++++++++++++++++++++
 cgroups/cgroup.c          |  79 ++++
 cgroups/cgroup.h          | 150 ++++++++
 cgroups/cgroup2_devices.c | 457 ++++++++++++++++++++++
 cgroups/cgroup2_devices.h | 154 ++++++++
 cgroups/cgroup_utils.c    | 726 +++++++++++++++++++++++++++++++++++
 cgroups/cgroup_utils.h    |  72 ++++
 configure.ac              |   9 +
 macro.h                   |  56 ++-
 memory_utils.h            |   2 +
 sysfs_fuse.c              |   4 +-
 14 files changed, 2618 insertions(+), 389 deletions(-)
 create mode 100644 cgroups/cgfsng.c
 create mode 100644 cgroups/cgroup.c
 create mode 100644 cgroups/cgroup.h
 create mode 100644 cgroups/cgroup2_devices.c
 create mode 100644 cgroups/cgroup2_devices.h
 create mode 100644 cgroups/cgroup_utils.c
 create mode 100644 cgroups/cgroup_utils.h

diff --git a/Makefile.am b/Makefile.am
index 13fb1e3..e783f29 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -13,6 +13,10 @@ AM_LDFLAGS = $(FUSE_LIBS) -pthread
 AM_CFLAGS += -DRUNTIME_PATH=\"$(RUNTIME_PATH)\"
 
 liblxcfs_la_SOURCES = bindings.c bindings.h \
+                     cgroups/cgfsng.c \
+                     cgroups/cgroup.c cgroups/cgroup.h \
+                     cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
+                     cgroups/cgroup_utils.c cgroups/cgroup_utils.h \
                      cpuset.c \
                      memory_utils.h \
                      sysfs_fuse.c sysfs_fuse.h
@@ -20,13 +24,17 @@ liblxcfs_la_CFLAGS = $(AM_CFLAGS)
 liblxcfs_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared
 
 liblxcfstest_la_SOURCES = bindings.c bindings.h \
+                         cgroups/cgfsng.c \
+                         cgroups/cgroup.c cgroups/cgroup.h \
+                         cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
+                         cgroups/cgroup_utils.c cgroups/cgroup_utils.h \
                          cpuset.c \
                          memory_utils.h \
                          sysfs_fuse.c sysfs_fuse.h
 liblxcfstest_la_CFLAGS = $(AM_CFLAGS) -DRELOADTEST
 liblxcfstest_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared
 
-noinst_HEADERS = bindings.h macro.h memory_utils.h sysfs_fuse.h
+noinst_HEADERS = bindings.h cgroups/cgroup.h cgroups/cgroup2_devices.h cgroups/cgroup_utils.h macro.h memory_utils.h sysfs_fuse.h
 
 sodir=$(libdir)
 lxcfs_LTLIBRARIES = liblxcfs.la
diff --git a/bindings.c b/bindings.c
index 4a8a421..ab0cd71 100644
--- a/bindings.c
+++ b/bindings.c
@@ -38,6 +38,8 @@
 #include <sys/vfs.h>
 
 #include "bindings.h"
+#include "cgroups/cgroup.h"
+#include "cgroups/cgroup_utils.h"
 #include "memory_utils.h"
 #include "config.h"
 
@@ -410,25 +412,8 @@ static void lock_mutex(pthread_mutex_t *l)
        }
 }
 
-/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
- * Number of hierarchies mounted. */
-static int num_hierarchies;
+static struct cgroup_ops *cgroup_ops;
 
-/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
- * Hierachies mounted {cpuset, blkio, ...}:
- * Initialized via __constructor__ collect_and_mount_subsystems(). */
-static char **hierarchies;
-
-/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
- * Open file descriptors:
- * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
- * private mount namespace.
- * Initialized via __constructor__ collect_and_mount_subsystems().
- * @fd_hierarchies[i] can be used to perform file operations on the cgroup
- * mounts and respective files in the private namespace even when located in
- * another namespace using the *at() family of functions
- * {openat(), fchownat(), ...}. */
-static int *fd_hierarchies;
 static int cgroup_mount_ns_fd = -1;
 
 static void unlock_mutex(pthread_mutex_t *l)
@@ -599,70 +584,6 @@ static int is_dir(const char *path, int fd)
        return 0;
 }
 
-static char *must_copy_string(const char *str)
-{
-       char *dup = NULL;
-       if (!str)
-               return NULL;
-       do {
-               dup = strdup(str);
-       } while (!dup);
-
-       return dup;
-}
-
-static inline void drop_trailing_newlines(char *s)
-{
-       int l;
-
-       for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
-               s[l-1] = '\0';
-}
-
-#define BATCH_SIZE 50
-static void dorealloc(char **mem, size_t oldlen, size_t newlen)
-{
-       int newbatches = (newlen / BATCH_SIZE) + 1;
-       int oldbatches = (oldlen / BATCH_SIZE) + 1;
-
-       if (!*mem || newbatches > oldbatches) {
-               char *tmp;
-               do {
-                       tmp = realloc(*mem, newbatches * BATCH_SIZE);
-               } while (!tmp);
-               *mem = tmp;
-       }
-}
-static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
-{
-       size_t newlen = *len + linelen;
-       dorealloc(contents, *len, newlen + 1);
-       memcpy(*contents + *len, line, linelen+1);
-       *len = newlen;
-}
-
-static char *slurp_file(const char *from, int fd)
-{
-       char *line = NULL;
-       char *contents = NULL;
-       FILE *f = fdopen(fd, "r");
-       size_t len = 0, fulllen = 0;
-       ssize_t linelen;
-
-       if (!f)
-               return NULL;
-
-       while ((linelen = getline(&line, &len, f)) != -1) {
-               append_line(&contents, &fulllen, line, linelen);
-       }
-       fclose(f);
-
-       if (contents)
-               drop_trailing_newlines(contents);
-       free(line);
-       return contents;
-}
-
 static int preserve_ns(const int pid, const char *ns)
 {
        int ret;
@@ -776,79 +697,29 @@ struct cgfs_files {
        uint32_t mode;
 };
 
-#define ALLOC_NUM 20
-static bool store_hierarchy(char *stridx, char *h)
-{
-       if (num_hierarchies % ALLOC_NUM == 0) {
-               size_t n = (num_hierarchies / ALLOC_NUM) + 1;
-               n *= ALLOC_NUM;
-               char **tmp = realloc(hierarchies, n * sizeof(char *));
-               if (!tmp) {
-                       lxcfs_error("%s\n", strerror(errno));
-                       exit(1);
-               }
-               hierarchies = tmp;
-       }
-
-       hierarchies[num_hierarchies++] = must_copy_string(h);
-       return true;
-}
-
 static void print_subsystems(void)
 {
-       int i;
+       int i = 0;
 
        fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
        fprintf(stderr, "hierarchies:\n");
-       for (i = 0; i < num_hierarchies; i++) {
-               if (hierarchies[i])
-                       fprintf(stderr, " %2d: fd: %3d: %s\n", i,
-                               fd_hierarchies[i], hierarchies[i]);
+       for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
+               __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
+               fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
        }
 }
 
-static bool in_comma_list(const char *needle, const char *haystack)
-{
-       const char *s = haystack, *e;
-       size_t nlen = strlen(needle);
-
-       while (*s && (e = strchr(s, ','))) {
-               if (nlen != e - s) {
-                       s = e + 1;
-                       continue;
-               }
-               if (strncmp(needle, s, nlen) == 0)
-                       return true;
-               s = e + 1;
-       }
-       if (strcmp(needle, s) == 0)
-               return true;
-       return false;
-}
-
 /* do we need to do any massaging here?  I'm not sure... */
 /* Return the mounted controller and store the corresponding open file descriptor
  * referring to the controller mountpoint in the private lxcfs namespace in
  * @cfd.
  */
-static char *find_mounted_controller(const char *controller, int *cfd)
+static int find_mounted_controller(const char *controller)
 {
-       int i;
-
-       for (i = 0; i < num_hierarchies; i++) {
-               if (!hierarchies[i])
-                       continue;
-               if (strcmp(hierarchies[i], controller) == 0) {
-                       *cfd = fd_hierarchies[i];
-                       return hierarchies[i];
-               }
-               if (in_comma_list(controller, hierarchies[i])) {
-                       *cfd = fd_hierarchies[i];
-                       return hierarchies[i];
-               }
-       }
+       struct hierarchy *h;
 
-       return NULL;
+       h = cgroup_ops->get_hierarchy(cgroup_ops, controller);
+       return h ? h->fd : -EBADF;
 }
 
 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
@@ -856,10 +727,10 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file
 {
        int ret, fd, cfd;
        size_t len;
-       char *fnam, *tmpc;
+       char *fnam;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return false;
 
        /* Make sure we pass a relative path to *at() family of functions.
@@ -922,10 +793,10 @@ int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 {
        int cfd;
        size_t len;
-       char *dirnam, *tmpc;
+       char *dirnam;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return -EINVAL;
 
        /* Make sure we pass a relative path to *at() family of functions.
@@ -1012,11 +883,11 @@ bool cgfs_remove(const char *controller, const char *cg)
 {
        int fd, cfd;
        size_t len;
-       char *dirnam, *tmpc;
+       char *dirnam;
        bool bret;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return false;
 
        /* Make sure we pass a relative path to *at() family of functions.
@@ -1039,10 +910,10 @@ bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 {
        int cfd;
        size_t len;
-       char *pathname, *tmpc;
+       char *pathname;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return false;
 
        /* Make sure we pass a relative path to *at() family of functions.
@@ -1076,11 +947,11 @@ int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t g
 {
        int cfd;
        size_t len;
-       char *pathname, *tmpc;
+       char *pathname;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return -EINVAL;
 
        /* Make sure we pass a relative path to *at() family of functions.
         * . + /file + \0
@@ -1102,11 +973,11 @@ FILE *open_pids_file(const char *controller, const char *cgroup)
 {
        int fd, cfd;
        size_t len;
-       char *pathname, *tmpc;
+       char *pathname;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return NULL;
 
        /* Make sure we pass a relative path to *at() family of functions.
         * . + /cgroup + / "cgroup.procs" + \0
@@ -1128,15 +999,15 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool
 {
        int cfd, fd, ret;
        size_t len;
-       char *cg, *tmpc;
+       char *cg;
        char pathname[MAXPATHLEN];
        size_t sz = 0, asz = 0;
        struct dirent *dirent;
        DIR *dir;
 
-       tmpc = find_mounted_controller(controller, &cfd);
+       cfd = find_mounted_controller(controller);
        *list = NULL;
-       if (!tmpc)
+       if (cfd < 0)
                return false;
 
        /* Make sure we pass a relative path to *at() family of functions. */
@@ -1233,12 +1104,12 @@ void free_keys(struct cgfs_files **keys)
 
 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
 {
-       int ret, fd, cfd;
+       int ret, cfd;
        size_t len;
-       char *fnam, *tmpc;
+       char *fnam;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return false;
 
        /* Make sure we pass a relative path to *at() family of functions.
@@ -1250,11 +1121,7 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file
        if (ret < 0 || (size_t)ret >= len)
                return false;
 
-       fd = openat(cfd, fnam, O_RDONLY);
-       if (fd < 0)
-               return false;
-
-       *value = slurp_file(fnam, fd);
+       *value = readat_file(cfd, fnam);
        return *value != NULL;
 }
 
@@ -1262,10 +1129,10 @@ bool cgfs_param_exist(const char *controller, const char *cgroup, const char *fi
 {
        int ret, cfd;
        size_t len;
-       char *fnam, *tmpc;
+       char *fnam;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return false;
 
        /* Make sure we pass a relative path to *at() family of functions.
@@ -1284,12 +1151,12 @@ struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, cons
 {
        int ret, cfd;
        size_t len;
-       char *fnam, *tmpc;
+       char *fnam;
        struct stat sb;
        struct cgfs_files *newkey;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return false;
 
        if (file && *file == '/')
@@ -1347,12 +1214,12 @@ bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 {
        int cfd;
        size_t len;
-       char *fnam, *tmpc;
+       char *fnam;
        int ret;
        struct stat sb;
 
-       tmpc = find_mounted_controller(controller, &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller(controller);
+       if (cfd < 0)
                return false;
 
        /* Make sure we pass a relative path to *at() family of functions.
@@ -1707,58 +1574,18 @@ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
        return start;
 }
 
-static void stripnewline(char *x)
-{
-       size_t l = strlen(x);
-       if (l && x[l-1] == '\n')
-               x[l-1] = '\0';
-}
-
 char *get_pid_cgroup(pid_t pid, const char *contrl)
 {
        int cfd;
-       char fnam[PROCLEN];
-       FILE *f;
-       char *answer = NULL;
-       char *line = NULL;
-       size_t len = 0;
-       int ret;
-       const char *h = find_mounted_controller(contrl, &cfd);
-       if (!h)
-               return NULL;
 
-       ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
-       if (ret < 0 || ret >= PROCLEN)
-               return NULL;
-       if (!(f = fopen(fnam, "r")))
-               return NULL;
+       cfd = find_mounted_controller(contrl);
+       if (cfd < 0)
+               return NULL;
 
-       while (getline(&line, &len, f) != -1) {
-               char *c1, *c2;
-               if (!line[0])
-                       continue;
-               c1 = strchr(line, ':');
-               if (!c1)
-                       goto out;
-               c1++;
-               c2 = strchr(c1, ':');
-               if (!c2)
-                       goto out;
-               *c2 = '\0';
-               if (strcmp(c1, h) != 0)
-                       continue;
-               c2++;
-               stripnewline(c2);
-               do {
-                       answer = strdup(c2);
-               } while (!answer);
-               break;
-       }
+       if (pure_unified_layout(cgroup_ops))
+               return cg_unified_get_current_cgroup(pid);
 
-out:
-       fclose(f);
-       free(line);
-       return answer;
+       return cg_legacy_get_current_cgroup(pid, contrl);
 }
 
 /*
 
 /*
@@ -1939,10 +1766,9 @@ static char *pick_controller_from_path(struct fuse_context *fc, const char *path
        if (slash)
                *slash = '\0';
 
-       int i;
-       for (i = 0; i < num_hierarchies; i++) {
-               if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
-                       return hierarchies[i];
+       for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+               if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
+                       return (*h)->__controllers;
        }
        errno = ENOENT;
        return NULL;
@@ -2005,7 +1831,7 @@ int cg_getattr(const char *path, struct stat *sb)
        int ret = -ENOENT;
 
 
-       if (!fc)
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
                return -EIO;
 
        memset(sb, 0, sizeof(struct stat));
@@ -2110,7 +1936,7 @@ int cg_opendir(const char *path, struct fuse_file_info *fi)
        struct file_info *dir_info;
        char *controller = NULL;
 
-       if (!fc)
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
                return -EIO;
 
        if (strcmp(path, "/cgroup") == 0) {
@@ -2164,6 +1990,9 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset
        struct fuse_context *fc = fuse_get_context();
        char **clist = NULL;
 
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
        if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
                return -EIO;
 
@@ -2172,14 +2001,18 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset
                return -EIO;
        }
        if (!d->cgroup && !d->controller) {
-               // ls /var/lib/lxcfs/cgroup - just show list of controllers
-               int i;
+               /*
+                * ls /var/lib/lxcfs/cgroup - just show list of controllers.
+                * This only works with the legacy hierarchy.
+                */
+               for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+                       if (is_unified_hierarchy(*h))
+                               continue;
 
-               for (i = 0;  i < num_hierarchies; i++) {
-                       if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
+                       if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
                                return -EIO;
-                       }
                }
+
                return 0;
        }
 
@@ -2274,7 +2107,7 @@ int cg_open(const char *path, struct fuse_file_info *fi)
        struct fuse_context *fc = fuse_get_context();
        int ret;
 
-       if (!fc)
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
                return -EIO;
 
        controller = pick_controller_from_path(fc, path);
@@ -2342,12 +2175,12 @@ int cg_access(const char *path, int mode)
        struct cgfs_files *k = NULL;
        struct fuse_context *fc = fuse_get_context();
 
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
        if (strcmp(path, "/cgroup") == 0)
                return 0;
 
-       if (!fc)
-               return -EIO;
-
        controller = pick_controller_from_path(fc, path);
        if (!controller)
                return -errno;
@@ -2758,6 +2591,9 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset,
        int ret, s;
        bool r;
 
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
        if (f->type != LXC_TYPE_CGFILE) {
                lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
                return -EIO;
@@ -2766,9 +2602,6 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset,
        if (offset)
                return 0;
 
-       if (!fc)
-               return -EIO;
-
        if (!f->controller)
                return -EINVAL;
 
@@ -3068,6 +2901,9 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset,
        struct file_info *f = (struct file_info *)fi->fh;
        bool r;
 
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+               return -EIO;
+
        if (f->type != LXC_TYPE_CGFILE) {
                lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
                return -EIO;
@@ -3076,9 +2912,6 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset,
        if (offset)
                return 0;
 
-       if (!fc)
-               return -EIO;
-
        localbuf = alloca(size+1);
        localbuf[size] = '\0';
        memcpy(localbuf, buf, size);
@@ -3118,7 +2951,7 @@ int cg_chown(const char *path, uid_t uid, gid_t gid)
        const char *cgroup;
        int ret;
 
-       if (!fc)
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
                return -EIO;
 
        if (strcmp(path, "/cgroup") == 0)
@@ -3184,7 +3017,7 @@ int cg_chmod(const char *path, mode_t mode)
        const char *cgroup;
        int ret;
 
-       if (!fc)
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
                return -EIO;
 
        if (strcmp(path, "/cgroup") == 0)
@@ -3252,7 +3085,7 @@ int cg_mkdir(const char *path, mode_t mode)
        const char *cgroup;
        int ret;
 
-       if (!fc)
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
                return -EIO;
 
        controller = pick_controller_from_path(fc, path);
@@ -3306,7 +3139,7 @@ int cg_rmdir(const char *path)
        const char *cgroup;
        int ret;
 
-       if (!fc)
+       if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
                return -EIO;
 
        controller = pick_controller_from_path(fc, path);
@@ -3427,7 +3260,7 @@ static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *
        }
 }
 
-int read_file(const char *path, char *buf, size_t size, struct file_info *d)
+int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
 {
        size_t linelen = 0, total_len = 0, rv = 0;
        char *line = NULL;
@@ -3538,7 +3371,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
                initpid = fc->pid;
        cg = get_pid_cgroup(initpid, "memory");
        if (!cg)
-               return read_file("/proc/meminfo", buf, size, d);
+               return read_file_fuse("/proc/meminfo", buf, size, d);
        prune_init_slice(cg);
 
        memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
@@ -3828,14 +3661,13 @@ static double exact_cpu_count(const char *cg)
 bool use_cpuview(const char *cg)
 {
        int cfd;
-       char *tmpc;
 
-       tmpc = find_mounted_controller("cpu", &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller("cpu");
+       if (cfd < 0)
                return false;
 
-       tmpc = find_mounted_controller("cpuacct", &cfd);
-       if (!tmpc)
+       cfd = find_mounted_controller("cpuacct");
+       if (cfd < 0)
                return false;
 
        return true;
@@ -3885,7 +3717,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
                initpid = fc->pid;
        cg = get_pid_cgroup(initpid, "cpuset");
        if (!cg)
-               return read_file("proc/cpuinfo", buf, size, d);
+               return read_file_fuse("proc/cpuinfo", buf, size, d);
        prune_init_slice(cg);
 
        cpuset = get_cpuset(cg);
@@ -4988,13 +4820,13 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
         * in some case cpuacct_usage.all in "/" will larger then /proc/stat
         */
        if (initpid == 1) {
-           return read_file("/proc/stat", buf, size, d);
+           return read_file_fuse("/proc/stat", buf, size, d);
        }
 
        cg = get_pid_cgroup(initpid, "cpuset");
        lxcfs_v("cg: %s\n", cg);
        if (!cg)
-               return read_file("/proc/stat", buf, size, d);
+               return read_file_fuse("/proc/stat", buf, size, d);
        prune_init_slice(cg);
 
        cpuset = get_cpuset(cg);
@@ -5333,7 +5165,7 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset,
                initpid = fc->pid;
        cg = get_pid_cgroup(initpid, "blkio");
        if (!cg)
-               return read_file("/proc/diskstats", buf, size, d);
+               return read_file_fuse("/proc/diskstats", buf, size, d);
        prune_init_slice(cg);
 
        if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
@@ -5455,7 +5287,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset,
                initpid = fc->pid;
        cg = get_pid_cgroup(initpid, "memory");
        if (!cg)
-               return read_file("/proc/swaps", buf, size, d);
+               return read_file_fuse("/proc/swaps", buf, size, d);
        prune_init_slice(cg);
 
        memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
@@ -5810,14 +5642,14 @@ static int proc_loadavg_read(char *buf, size_t size, off_t offset,
                return total_len;
        }
        if (!loadavg)
-               return read_file("/proc/loadavg", buf, size, d);
+               return read_file_fuse("/proc/loadavg", buf, size, d);
 
        initpid = lookup_initpid_in_store(fc->pid);
        if (initpid <= 1 || is_shared_pidns(initpid))
                initpid = fc->pid;
        cg = get_pid_cgroup(initpid, "cpu");
        if (!cg)
-               return read_file("/proc/loadavg", buf, size, d);
+               return read_file_fuse("/proc/loadavg", buf, size, d);
 
        prune_init_slice(cg);
        hash = calc_hash(cg) % LOAD_SIZE;
@@ -5825,7 +5657,8 @@ static int proc_loadavg_read(char *buf, size_t size, off_t offset,
 
        /* First time */
        if (n == NULL) {
-               if (!find_mounted_controller("cpu", &cfd)) {
+               cfd = find_mounted_controller("cpu");
+               if (cfd < 0) {
                        /*
                         * In locate_node() above, pthread_rwlock_unlock() isn't used
                         * because delete is not allowed before read has ended.
@@ -6069,30 +5902,6 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset,
  * Functions needed to setup cgroups in the __constructor__.
  */
 
-static bool mkdir_p(const char *dir, mode_t mode)
-{
-       const char *tmp = dir;
-       const char *orig = dir;
-       char *makeme;
-
-       do {
-               dir = tmp + strspn(tmp, "/");
-               tmp = dir + strcspn(dir, "/");
-               makeme = strndup(orig, dir - orig);
-               if (!makeme)
-                       return false;
-               if (mkdir(makeme, mode) && errno != EEXIST) {
-                       lxcfs_error("Failed to create directory '%s': %s.\n",
-                               makeme, strerror(errno));
-                       free(makeme);
-                       return false;
-               }
-               free(makeme);
-       } while(tmp != dir);
-
-       return true;
-}
-
 static bool umount_if_mounted(void)
 {
        if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
@@ -6345,45 +6154,19 @@ static bool cgfs_prepare_mounts(void)
 
 static bool cgfs_mount_hierarchies(void)
 {
-       char *target;
-       size_t clen, len;
-       int i, ret;
-
-       for (i = 0; i < num_hierarchies; i++) {
-               char *controller = hierarchies[i];
-
-               clen = strlen(controller);
-               len = strlen(BASEDIR) + clen + 2;
-               target = malloc(len);
-               if (!target)
-                       return false;
+       if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
+               return false;
 
-               ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
-               if (ret < 0 || ret >= len) {
-                       free(target);
-                       return false;
-               }
-               if (mkdir(target, 0755) < 0 && errno != EEXIST) {
-                       free(target);
-                       return false;
-               }
-               if (!strcmp(controller, "unified"))
-                       ret = mount("none", target, "cgroup2", 0, NULL);
-               else
-                       ret = mount(controller, target, "cgroup", 0, controller);
-               if (ret < 0) {
-                       lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
-                       free(target);
-                       return false;
-               }
+       if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
+               return false;
 
-               fd_hierarchies[i] = open(target, O_DIRECTORY);
-               if (fd_hierarchies[i] < 0) {
-                       free(target);
+       for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+               __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
+               (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
+               if ((*h)->fd < 0)
                        return false;
-               }
-               free(target);
        }
+
        return true;
 }
 
@@ -6405,45 +6188,13 @@ static bool cgfs_setup_controllers(void)
 
 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
 {
-       FILE *f;
-       char *cret, *line = NULL;
+       char *cret;
        char cwd[MAXPATHLEN];
-       size_t len = 0;
-       int i, init_ns = -1;
-       bool found_unified = false;
+       int init_ns = -1;
 
-       if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
-               lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
+       cgroup_ops = cgroup_init();
+       if (!cgroup_ops)
                return;
-       }
-
-       while (getline(&line, &len, f) != -1) {
-               char *idx, *p, *p2;
-
-               p = strchr(line, ':');
-               if (!p)
-                       goto out;
-               idx = line;
-               *(p++) = '\0';
-
-               p2 = strrchr(p, ':');
-               if (!p2)
-                       goto out;
-               *p2 = '\0';
-
-               /* With cgroupv2 /proc/self/cgroup can contain entries of the
-                * form: 0::/ This will cause lxcfs to fail the cgroup mounts
-                * because it parses out the empty string "" and later on passes
-                * it to mount(). Let's skip such entries.
-                */
-               if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
-                       found_unified = true;
-                       p = "unified";
-               }
-
-               if (!store_hierarchy(line, p))
-                       goto out;
-       }
 
        /* Preserve initial namespace. */
        init_ns = preserve_mnt_ns(getpid());
@@ -6452,15 +6203,6 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void)
                goto out;
        }
 
-       fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
-       if (!fd_hierarchies) {
-               lxcfs_error("%s\n", strerror(errno));
-               goto out;
-       }
-
-       for (i = 0; i < num_hierarchies; i++)
-               fd_hierarchies[i] = -1;
-
        cret = getcwd(cwd, MAXPATHLEN);
        if (!cret)
                lxcfs_debug("Could not retrieve current working directory: 
%s.\n", strerror(errno));
@@ -6488,26 +6230,15 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void)
        print_subsystems();
 
 out:
-       free(line);
-       fclose(f);
        if (init_ns >= 0)
                close(init_ns);
 }
 
 static void __attribute__((destructor)) free_subsystems(void)
 {
-       int i;
-
        lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
 
-       for (i = 0; i < num_hierarchies; i++) {
-               if (hierarchies[i])
-                       free(hierarchies[i]);
-               if (fd_hierarchies && fd_hierarchies[i] >= 0)
-                       close(fd_hierarchies[i]);
-       }
-       free(hierarchies);
-       free(fd_hierarchies);
+       cgroup_exit(cgroup_ops);
        free_cpuview();
 
        if (cgroup_mount_ns_fd >= 0)
diff --git a/bindings.h b/bindings.h
index 250bbac..229d64c 100644
--- a/bindings.h
+++ b/bindings.h
@@ -75,8 +75,8 @@ extern int stop_load_daemon(pthread_t pid);
 
 extern pid_t lookup_initpid_in_store(pid_t qpid);
 extern char *get_pid_cgroup(pid_t pid, const char *contrl);
-extern int read_file(const char *path, char *buf, size_t size,
-                    struct file_info *d);
+extern int read_file_fuse(const char *path, char *buf, size_t size,
+                         struct file_info *d);
 extern void prune_init_slice(char *cg);
 extern char *get_cpuset(const char *cg);
 extern bool use_cpuview(const char *cg);
diff --git a/cgroups/cgfsng.c b/cgroups/cgfsng.c
new file mode 100644
index 0000000..08b719d
--- /dev/null
+++ b/cgroups/cgfsng.c
@@ -0,0 +1,787 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/*
+ * cgfs-ng.c: this is a new, simplified implementation of a filesystem
+ * cgroup backend.  The original cgfs.c was designed to be as flexible
+ * as possible.  It would try to find cgroup filesystems no matter where
+ * or how you had them mounted, and deduce the most usable mount for
+ * each controller.
+ *
+ * This new implementation assumes that cgroup filesystems are mounted
+ * under /sys/fs/cgroup/clist where clist is either the controller, or
+ * a comma-separated list of controllers.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <grp.h>
+#include <linux/kdev_t.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup.h"
+#include "cgroup2_devices.h"
+#include "cgroup_utils.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+/* Release every string in the NULL-terminated array @clist and then the array
+ * itself. A NULL @clist is accepted and ignored.
+ */
+static void free_string_list(char **clist)
+{
+       if (clist) {
+               for (char **it = clist; *it; it++)
+                       free(*it);
+
+               free(clist);
+       }
+}
+
+/* Grow the NULL-terminated pointer array at @list by one slot. The array is
+ * resized through must_realloc() and its new final slot is set to NULL. The
+ * returned index is the slot just before that terminator, i.e. the one the
+ * caller is expected to fill in.
+ */
+static int append_null_to_list(void ***list)
+{
+       int used = 0;
+
+       while (*list && (*list)[used])
+               used++;
+
+       *list = must_realloc(*list, (used + 2) * sizeof(void **));
+       (*list)[used + 1] = NULL;
+       return used;
+}
+
+/* Report whether @entry compares equal (strcmp) to one of the strings in the
+ * NULL-terminated array @list. A NULL @list contains nothing.
+ */
+static bool string_in_list(char **list, const char *entry)
+{
+       for (char **it = list; it && *it; it++) {
+               if (!strcmp(*it, entry))
+                       return true;
+       }
+
+       return false;
+}
+
+/* Build a newly allocated "name=<entry>" string, e.g. "systemd" becomes
+ * "name=systemd". The buffer comes from must_realloc().
+ */
+static char *cg_legacy_must_prefix_named(char *entry)
+{
+       const size_t entry_len = strlen(entry);
+       char *out = must_realloc(NULL, entry_len + 6);
+
+       memcpy(out, "name=", STRLITERALLEN("name="));
+       memcpy(out + STRLITERALLEN("name="), entry, entry_len);
+       out[entry_len + 5] = '\0';
+
+       return out;
+}
+
+/* Append one controller name to the NULL-terminated list at @clist, which must
+ * be NULL on the first call.
+ *
+ * @klist and @nlist are the kernel and the named ("name=...") subsystem lists.
+ * - An @entry present in both lists is ambiguous and is silently skipped.
+ * - An @entry that already starts with "name=" or that is a kernel subsystem
+ *   is copied verbatim.
+ * - Anything else is stored with a "name=" prefix.
+ *
+ * The list stays NULL-terminated.
+ */
+static void must_append_controller(char **klist, char **nlist, char ***clist,
+                                  char *entry)
+{
+       bool is_kernel = string_in_list(klist, entry);
+       int slot;
+
+       if (is_kernel && string_in_list(nlist, entry))
+               return;
+
+       slot = append_null_to_list((void ***)clist);
+
+       if (is_kernel || strncmp(entry, "name=", 5) == 0)
+               (*clist)[slot] = must_copy_string(entry);
+       else
+               (*clist)[slot] = cg_legacy_must_prefix_named(entry);
+}
+
+/* Look up the hierarchy in @ops that provides @controller, or return NULL if
+ * there is none. errno is set to ENOENT on entry and is not reset on success.
+ *
+ * - A NULL @controller selects the hierarchy whose controller list exists but
+ *   is empty (the empty unified hierarchy).
+ * - On a pure unified layout, "devices" resolves to the unified hierarchy, and
+ *   only when its bpf_device_controller flag is set.
+ */
+static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops,
+                                             const char *controller)
+{
+       int i;
+
+       errno = ENOENT;
+
+       if (!ops->hierarchies)
+               return NULL;
+
+       for (i = 0; ops->hierarchies[i]; i++) {
+               struct hierarchy *h = ops->hierarchies[i];
+
+               if (!controller) {
+                       /* This is the empty unified hierarchy. */
+                       if (h->controllers && !h->controllers[0])
+                               return h;
+                       continue;
+               } else if (pure_unified_layout(ops) &&
+                          strcmp(controller, "devices") == 0) {
+                       if (ops->unified->bpf_device_controller)
+                               return ops->unified;
+                       break;
+               }
+
+               if (string_in_list(h->controllers, controller))
+                       return h;
+       }
+
+       return NULL;
+}
+
+/* Thin alias for cgfsng_get_hierarchy(); this is the function installed as the
+ * get_hierarchy callback in cgfsng_ops_init().
+ */
+static inline struct hierarchy *get_hierarchy(struct cgroup_ops *ops,
+                                             const char *controller)
+{
+       return cgfsng_get_hierarchy(ops, controller);
+}
+
+/* Return true when at least one string occurs in both NULL-terminated lists
+ * @l1 and @l2. If either list is NULL there is no intersection.
+ */
+static bool controller_lists_intersect(char **l1, char **l2)
+{
+       if (l1 && l2) {
+               for (char **it = l1; *it; it++)
+                       if (string_in_list(l2, *it))
+                               return true;
+       }
+
+       return false;
+}
+
+/* Return true if any controller in the NULL-terminated list @clist is already
+ * claimed by one of the hierarchies in the NULL-terminated list @hlist.
+ * Realistically, if one is present, all must be present.
+ */
+static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
+{
+       for (struct hierarchy **it = hlist; it && *it; it++) {
+               if (controller_lists_intersect((*it)->controllers, clist))
+                       return true;
+       }
+
+       return false;
+}
+
+/* Get the controllers from a mountinfo line. There are other ways we could get
+ * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
+ * could parse the mount options. But we simply assume that the mountpoint must
+ * be /sys/fs/cgroup/controller-list
+ *
+ * @klist/@nlist are the kernel and named subsystem lists handed through to
+ * must_append_controller(). @line is temporarily modified (a NUL is written
+ * after the mountpoint) and restored before returning successfully.
+ *
+ * Only for type == CGROUP_SUPER_MAGIC is a list built; in that case
+ * *@controllers receives the heap copy of the comma-separated controller
+ * string that was tokenized, and the caller owns it. For any other type NULL
+ * is returned and *@controllers is left untouched.
+ */
+static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
+                                       int type, char **controllers)
+{
+       /* After skipping four space-separated fields we are at the mountpoint,
+        * which is /sys/fs/cgroup/comma-delimited-controller-list for legacy
+        * hierarchies.
+        */
+       int i;
+       char *p2, *tok;
+       char *p = line, *sep = ",";
+       char **aret = NULL;
+
+       for (i = 0; i < 4; i++) {
+               p = strchr(p, ' ');
+               if (!p)
+                       return NULL;
+               p++;
+       }
+
+       /* Note, if we change how mountinfo works, then our caller will need to
+        * verify /sys/fs/cgroup/ in this field.
+        */
+       if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
+               return NULL;
+
+       /* 15 == strlen("/sys/fs/cgroup/"): step over the common prefix. */
+       p += 15;
+       p2 = strchr(p, ' ');
+       if (!p2)
+               return NULL;
+       *p2 = '\0';
+
+       if (type == CGROUP_SUPER_MAGIC) {
+               __do_free char *dup = NULL;
+
+               /* strdup() here for v1 hierarchies. Otherwise
+                * lxc_iterate_parts() will destroy mountpoints such as
+                * "/sys/fs/cgroup/cpu,cpuacct".
+                */
+               dup = must_copy_string(p);
+               if (!dup)
+                       return NULL;
+
+               lxc_iterate_parts (tok, dup, sep)
+                       must_append_controller(klist, nlist, &aret, tok);
+               *controllers = move_ptr(dup);
+       }
+       /* Undo the temporary termination so @line can be parsed again. */
+       *p2 = ' ';
+
+       return aret;
+}
+
+/* Return a freshly allocated controller list that contains no entries, i.e. an
+ * array whose first element is already the NULL terminator.
+ */
+static char **cg_unified_make_empty_controller(void)
+{
+       char **empty = NULL;
+       int slot = append_null_to_list((void ***)&empty);
+
+       empty[slot] = NULL;
+       return empty;
+}
+
+/* Read @file (expected to be a cgroup.controllers file) and return its
+ * whitespace-separated words as a NULL-terminated list of heap strings.
+ * Returns NULL when the file cannot be read or contains no words.
+ */
+static char **cg_unified_get_controllers(const char *file)
+{
+       __do_free char *content = NULL;
+       char *delim = " \t\n";
+       char **list = NULL;
+       char *word;
+
+       content = read_file(file);
+       if (!content)
+               return NULL;
+
+       lxc_iterate_parts(word, content, delim) {
+               int slot = append_null_to_list((void ***)&list);
+
+               list[slot] = must_copy_string(word);
+       }
+
+       return list;
+}
+
+/* Allocate a hierarchy descriptor, fill it in from the arguments and append it
+ * to the NULL-terminated array at @h. Returns the new descriptor.
+ *
+ * The descriptor stores @clist, @mountpoint and @container_base_path as given
+ * (no copies are made); cgroup_exit() later frees them.
+ */
+static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist,
+                                      char *mountpoint,
+                                      char *container_base_path, int type)
+{
+       struct hierarchy *new;
+       int newentry;
+
+       /* Like the other list helpers in this file, do not fail: every caller
+        * dereferences the result unconditionally.
+        */
+       new = must_realloc(NULL, sizeof(*new));
+       memset(new, 0, sizeof(*new));
+
+       new->controllers = clist;
+       new->mountpoint = mountpoint;
+       new->container_base_path = container_base_path;
+       new->version = type;
+       /* Zero is a valid descriptor. Mark the fd as "not opened" so that
+        * cgroup_exit(), which closes every fd >= 0, never closes fd 0 for a
+        * hierarchy whose directory was never opened.
+        */
+       new->fd = -1;
+
+       newentry = append_null_to_list((void ***)h);
+       (*h)[newentry] = new;
+       return new;
+}
+
+/* Extract the mountpoint from @line, a line of /proc/self/mountinfo, and
+ * return it as a newly allocated string. Returns NULL when the line has too
+ * few fields or the mountpoint does not start with "/sys/fs/cgroup/".
+ *
+ * Note that on success @line is left NUL-terminated right after the
+ * mountpoint.
+ */
+static char *cg_hybrid_get_mountpoint(char *line)
+{
+       char *field = line;
+       char *end, *copy;
+       size_t field_len;
+
+       for (int skipped = 0; skipped < 4; skipped++) {
+               field = strchr(field, ' ');
+               if (!field)
+                       return NULL;
+               field++;
+       }
+
+       if (strncmp(field, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
+               return NULL;
+
+       end = strchr(field + 15, ' ');
+       if (!end)
+               return NULL;
+       *end = '\0';
+
+       field_len = (size_t)(end - field);
+       copy = must_realloc(NULL, field_len + 1);
+       memcpy(copy, field, field_len);
+       copy[field_len] = '\0';
+       return copy;
+}
+
+/* Append a heap copy of @entry to the NULL-terminated string list at @list. */
+static void must_append_string(char ***list, char *entry)
+{
+       int slot = append_null_to_list((void ***)list);
+
+       (*list)[slot] = must_copy_string(entry);
+}
+
+/* Read /proc/self/cgroup and sort the controllers listed there into two
+ * NULL-terminated lists: entries starting with "name=" are appended to @nlist,
+ * everything else to @klist. A line with an empty controller field (the
+ * cgroup2 "0::/some/path" form) contributes the pseudo controller "cgroup2" to
+ * @klist. Lines without two colons are ignored.
+ *
+ * Returns 0 on success and -1 if the file cannot be opened.
+ */
+static int get_existing_subsystems(char ***klist, char ***nlist)
+{
+       __do_free char *buf = NULL;
+       __do_fclose FILE *fp = NULL;
+       size_t buflen = 0;
+
+       fp = fopen("/proc/self/cgroup", "r");
+       if (!fp)
+               return -1;
+
+       while (getline(&buf, &buflen, fp) != -1) {
+               char *ctrl, *ctrl_end, *tok;
+
+               /* The controller list sits between the first two colons. */
+               ctrl = strchr(buf, ':');
+               if (!ctrl)
+                       continue;
+               ctrl++;
+
+               ctrl_end = strchr(ctrl, ':');
+               if (!ctrl_end)
+                       continue;
+               *ctrl_end = '\0';
+
+               if (ctrl_end == ctrl) {
+                       must_append_string(klist, "cgroup2");
+                       continue;
+               }
+
+               lxc_iterate_parts(tok, ctrl, ",") {
+                       char ***dst = strncmp(tok, "name=", 5) == 0 ? nlist : klist;
+
+                       must_append_string(dst, tok);
+               }
+       }
+
+       return 0;
+}
+
+/* Strip trailing newline characters from @s in place. The first character of
+ * the string is never removed, even if it is a newline.
+ */
+static void trim(char *s)
+{
+       size_t n = strlen(s);
+
+       for (; n > 1 && s[n - 1] == '\n'; n--)
+               s[n - 1] = '\0';
+}
+
+/* __cg_mount_direct
+ *
+ * Mount the cgroup hierarchy @h directly (no bind-mount) at @controllerpath.
+ * A cgroup2 hierarchy is mounted as filesystem type "cgroup2" without mount
+ * data; any other hierarchy is mounted as type "cgroup" with its
+ * comma-joined controller list as mount data.
+ *
+ * Returns 0 on success, -ENOMEM if the controller list cannot be joined and -1
+ * if mount() fails.
+ */
+static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath)
+{
+       __do_free char *controllers = NULL;
+       const unsigned long flags = MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RELATIME;
+       char *fstype;
+
+       if (h->version == CGROUP2_SUPER_MAGIC) {
+               fstype = "cgroup2";
+       } else {
+               controllers = lxc_string_join(",",
+                                             (const char **)h->controllers,
+                                             false);
+               if (!controllers)
+                       return -ENOMEM;
+               fstype = "cgroup";
+       }
+
+       if (mount("cgroup", controllerpath, fstype, flags, controllers) < 0)
+               return -1;
+
+       return 0;
+}
+
+/* Mount hierarchy @h at @controllerpath; simply forwards to
+ * __cg_mount_direct() and returns its result.
+ */
+static inline int cg_mount_cgroup_full(struct hierarchy *h,
+                                      const char *controllerpath)
+{
+       return __cg_mount_direct(h, controllerpath);
+}
+
+/* Mount the cgroup hierarchies known to @ops below @root/sys/fs/cgroup.
+ *
+ * - With no hierarchies there is nothing to do and true is returned.
+ * - On a pure unified layout the unified hierarchy is mounted directly on the
+ *   cgroup root.
+ * - Otherwise a tmpfs is mounted on the cgroup root and every hierarchy gets
+ *   its own directory (named after the last component of its mountpoint) with
+ *   the hierarchy mounted on it. Directories that already exist are skipped.
+ *
+ * Returns true on success, false on the first failure.
+ */
+static bool cgfsng_mount(struct cgroup_ops *ops, const char *root)
+{
+       __do_free char *cgroup_root = NULL;
+       bool mounted = false;
+
+       if (!ops)
+               return ret_set_errno(false, ENOENT);
+
+       if (!ops->hierarchies)
+               return true;
+
+       cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
+       if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
+               return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0;
+
+       /* mount tmpfs */
+       if (safe_mount(NULL, cgroup_root, "tmpfs",
+                      MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
+                      "size=10240k,mode=755", root) < 0)
+               goto on_error;
+
+       for (struct hierarchy **it = ops->hierarchies; *it; it++) {
+               __do_free char *controllerpath = NULL;
+               char *name = strrchr((*it)->mountpoint, '/');
+
+               if (!name)
+                       continue;
+               name++;
+
+               controllerpath = must_make_path(cgroup_root, name, NULL);
+               if (dir_exists(controllerpath))
+                       continue;
+
+               if (mkdir(controllerpath, 0755) < 0)
+                       log_error_errno(goto on_error, errno,
+                                       "Error creating cgroup path: %s",
+                                       controllerpath);
+
+               if (cg_mount_cgroup_full(*it, controllerpath) < 0)
+                       goto on_error;
+       }
+       mounted = true;
+
+on_error:
+       return mounted;
+}
+
+/* Count the lines of every "cgroup.procs" file in @dirname and, recursively,
+ * in all of its subdirectories, and return the sum. A directory that cannot be
+ * opened contributes 0; a cgroup.procs file for which lxc_count_file_lines()
+ * reports -1 is ignored.
+ */
+static int recursive_count_nrtasks(char *dirname)
+{
+       __do_free char *procs_path = NULL;
+       __do_closedir DIR *dir = NULL;
+       struct dirent *direntp;
+       int count = 0, ret;
+
+       dir = opendir(dirname);
+       if (!dir)
+               return 0;
+
+       while ((direntp = readdir(dir))) {
+               /* Scoped to a single iteration so that every path built here
+                * is released again; a function-wide variable would leak all
+                * but the last allocation.
+                */
+               __do_free char *path = NULL;
+               struct stat mystat;
+
+               if (!strcmp(direntp->d_name, ".") ||
+                   !strcmp(direntp->d_name, ".."))
+                       continue;
+
+               path = must_make_path(dirname, direntp->d_name, NULL);
+
+               if (lstat(path, &mystat))
+                       continue;
+
+               if (!S_ISDIR(mystat.st_mode))
+                       continue;
+
+               count += recursive_count_nrtasks(path);
+       }
+
+       procs_path = must_make_path(dirname, "cgroup.procs", NULL);
+       ret = lxc_count_file_lines(procs_path);
+       if (ret != -1)
+               count += ret;
+
+       return count;
+}
+
+/* Return the number of tasks found below the container_full_path of the first
+ * hierarchy in @ops. Returns -1 with errno set to ENOENT when @ops is NULL and
+ * to EINVAL when no container cgroup or no hierarchies are recorded.
+ */
+static int cgfsng_nrtasks(struct cgroup_ops *ops)
+{
+       __do_free char *path = NULL;
+
+       if (!ops)
+               return ret_set_errno(-1, ENOENT);
+
+       if (!(ops->container_cgroup && ops->hierarchies))
+               return ret_set_errno(-1, EINVAL);
+
+       path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
+       return recursive_count_nrtasks(path);
+}
+
+/* Return how many hierarchies @ops holds: 0 when the list is unset, -1 with
+ * errno set to ENOENT when @ops itself is NULL.
+ */
+static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
+{
+       int count;
+
+       if (!ops)
+               return ret_set_errno(-1, ENOENT);
+
+       if (!ops->hierarchies)
+               return 0;
+
+       count = 0;
+       while (ops->hierarchies[count])
+               count++;
+
+       return count;
+}
+
+/* Store the controller list of the hierarchy at index @n in *@out.
+ *
+ * Returns true on success. Returns false when there are no hierarchies, and
+ * false with errno set to ENOENT when @ops is NULL or @n does not refer to an
+ * existing hierarchy.
+ */
+static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
+{
+       int i;
+
+       if (!ops)
+               return ret_set_errno(false, ENOENT);
+
+       if (!ops->hierarchies)
+               return false;
+
+       if (n < 0)
+               return ret_set_errno(false, ENOENT);
+
+       /* Sanity check n: walk up to and including index n so that the slot
+        * we are about to dereference is known not to be the NULL terminator.
+        */
+       for (i = 0; i <= n; i++)
+               if (!ops->hierarchies[i])
+                       return ret_set_errno(false, ENOENT);
+
+       *out = ops->hierarchies[n]->controllers;
+
+       return true;
+}
+
+/* Parse /proc/self/mountinfo and record every cgroup and cgroup2 mount found
+ * there as a hierarchy in @ops, updating ops->cgroup_layout along the way. The
+ * base cgroup of each hierarchy is looked up in /proc/1/cgroup.
+ *
+ * Mountinfo lines that are not cgroup mounts, that duplicate controllers of an
+ * already recorded hierarchy, or that cannot be parsed are skipped. Only the
+ * first cgroup2 mount is recorded.
+ *
+ * Returns 0 on success and -1 with errno set if one of the proc files cannot
+ * be read.
+ */
+static int cg_hybrid_init(struct cgroup_ops *ops)
+{
+       __do_free char *basecginfo = NULL;
+       __do_free char *line = NULL;
+       __do_fclose FILE *f = NULL;
+       int ret;
+       size_t len = 0;
+       char **klist = NULL, **nlist = NULL;
+
+       /* Root spawned containers escape the current cgroup, so use init's
+        * cgroups as our base in that case.
+        */
+       basecginfo = read_file("/proc/1/cgroup");
+       if (!basecginfo)
+               return ret_set_errno(-1, ENOMEM);
+
+       ret = get_existing_subsystems(&klist, &nlist);
+       if (ret < 0)
+               return log_error_errno(-1, errno,
+                                      "Failed to retrieve available legacy cgroup controllers");
+
+       f = fopen("/proc/self/mountinfo", "r");
+       if (!f) {
+               int saved_errno = errno;
+
+               /* Do not leak the subsystem lists collected above. */
+               free_string_list(klist);
+               free_string_list(nlist);
+               return log_error_errno(-1, saved_errno,
+                                      "Failed to open \"/proc/self/mountinfo\"");
+       }
+
+       while (getline(&line, &len, f) != -1) {
+               int type;
+               struct hierarchy *new;
+               char *base_cgroup = NULL, *mountpoint = NULL;
+               char **controller_list = NULL;
+               __do_free char *controllers = NULL;
+
+               type = get_cgroup_version(line);
+               if (type == 0)
+                       continue;
+
+               if (type == CGROUP2_SUPER_MAGIC && ops->unified)
+                       continue;
+
+               /* Seeing both cgroup and cgroup2 mounts makes the layout
+                * hybrid; seeing only one kind keeps it legacy or unified.
+                */
+               if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
+                       if (type == CGROUP2_SUPER_MAGIC)
+                               ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+                       else if (type == CGROUP_SUPER_MAGIC)
+                               ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
+               } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
+                       if (type == CGROUP_SUPER_MAGIC)
+                               ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+               } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
+                       if (type == CGROUP2_SUPER_MAGIC)
+                               ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+               }
+
+               controller_list = cg_hybrid_get_controllers(klist, nlist, line,
+                                                           type, &controllers);
+               if (!controller_list && type == CGROUP_SUPER_MAGIC)
+                       continue;
+
+               if (type == CGROUP_SUPER_MAGIC)
+                       if (controller_list_is_dup(ops->hierarchies,
+                                                  controller_list))
+                               ret_set_errno(goto next, EEXIST);
+
+               mountpoint = cg_hybrid_get_mountpoint(line);
+               if (!mountpoint)
+                       log_error_errno(goto next, EINVAL,
+                                       "Failed parsing mountpoint from \"%s\"",
+                                       line);
+
+               if (type == CGROUP_SUPER_MAGIC) {
+                       base_cgroup = cg_hybrid_get_current_cgroup(basecginfo,
+                                                                  controller_list[0],
+                                                                  CGROUP_SUPER_MAGIC);
+               } else {
+                       base_cgroup = cg_hybrid_get_current_cgroup(basecginfo,
+                                                                  NULL,
+                                                                  CGROUP2_SUPER_MAGIC);
+               }
+               if (!base_cgroup)
+                       log_error_errno(goto next, EINVAL,
+                                       "Failed to find current cgroup %s",
+                                       mountpoint);
+
+               trim(base_cgroup);
+               prune_init_scope(base_cgroup);
+
+               if (type == CGROUP2_SUPER_MAGIC) {
+                       char *cgv2_ctrl_path;
+
+                       cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
+                                                       "cgroup.controllers",
+                                                       NULL);
+
+                       controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
+                       free(cgv2_ctrl_path);
+                       if (!controller_list)
+                               controller_list = cg_unified_make_empty_controller();
+               }
+
+               /* The hierarchy takes over controller_list, mountpoint and
+                * base_cgroup; they must not be freed here anymore.
+                */
+               new = add_hierarchy(&ops->hierarchies, controller_list,
+                                   mountpoint, base_cgroup, type);
+               new->__controllers = move_ptr(controllers);
+               if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
+                       ops->unified = new;
+
+               continue;
+
+       next:
+               free_string_list(controller_list);
+               free(mountpoint);
+               free(base_cgroup);
+       }
+
+       free_string_list(klist);
+       free_string_list(nlist);
+
+       return 0;
+}
+
+/* Detect a pure cgroup2 setup and, if there is one, register the single
+ * unified hierarchy (mounted at DEFAULT_CGROUP_MOUNTPOINT) in @ops.
+ *
+ * Returns CGROUP2_SUPER_MAGIC when the unified hierarchy was set up and 0 when
+ * unified_cgroup_hierarchy() reports something other than a cgroup2 layout.
+ * If that helper returns -ENOMEDIUM, or the current cgroup of pid 1 cannot be
+ * determined, the value of ret_errno() is returned; cg_init() treats a
+ * negative result as failure.
+ */
+static int cg_unified_init(struct cgroup_ops *ops)
+{
+       __do_free char *subtree_path = NULL;
+       int ret;
+       char *mountpoint;
+       char **delegatable;
+       struct hierarchy *new;
+       char *base_cgroup = NULL;
+
+       ret = unified_cgroup_hierarchy();
+       if (ret == -ENOMEDIUM)
+               return ret_errno(ENOMEDIUM);
+
+       if (ret != CGROUP2_SUPER_MAGIC)
+               return 0;
+
+       base_cgroup = cg_unified_get_current_cgroup(1);
+       if (!base_cgroup)
+               return ret_errno(EINVAL);
+       prune_init_scope(base_cgroup);
+
+       /*
+        * We assume that the cgroup we're currently in has been delegated to
+        * us and we are free to further delegate all of the controllers listed
+        * in cgroup.controllers further down the hierarchy.
+        */
+       mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
+       subtree_path = must_make_path(mountpoint, base_cgroup,
+                                     "cgroup.controllers", NULL);
+       delegatable = cg_unified_get_controllers(subtree_path);
+       if (!delegatable)
+               delegatable = cg_unified_make_empty_controller();
+
+       /* TODO: If the user requested specific controllers via lxc.cgroup.use
+        * we should verify here. The reason I'm not doing it right is that I'm
+        * not convinced that lxc.cgroup.use will be the future since it is a
+        * global property. I much rather have an option that lets you request
+        * controllers per container.
+        */
+
+       new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint,
+                           base_cgroup, CGROUP2_SUPER_MAGIC);
+
+       if (bpf_devices_cgroup_supported())
+               new->bpf_device_controller = 1;
+
+       ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+       ops->unified = new;
+       return CGROUP2_SUPER_MAGIC;
+}
+
+/* Populate @ops: try the pure unified layout first and fall back to scanning
+ * for legacy/hybrid mounts when no pure cgroup2 setup is detected. Returns 0
+ * on success, -1 if unified detection fails, otherwise whatever
+ * cg_hybrid_init() returns.
+ */
+static int cg_init(struct cgroup_ops *ops)
+{
+       int unified = cg_unified_init(ops);
+
+       if (unified < 0)
+               return -1;
+
+       return unified == CGROUP2_SUPER_MAGIC ? 0 : cg_hybrid_init(ops);
+}
+
+/* Allocate and initialise the cgfsng cgroup driver: detect the cgroup layout
+ * and hierarchies via cg_init() and install the driver callbacks. Returns the
+ * new ops structure, or NULL if allocation (errno = ENOMEM) or detection
+ * fails.
+ */
+struct cgroup_ops *cgfsng_ops_init(void)
+{
+       __do_free struct cgroup_ops *ops = NULL;
+
+       ops = malloc(sizeof(struct cgroup_ops));
+       if (!ops)
+               return ret_set_errno(NULL, ENOMEM);
+
+       memset(ops, 0, sizeof(struct cgroup_ops));
+       ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
+
+       if (cg_init(ops) != 0)
+               return NULL;
+
+       ops->driver = "cgfsng";
+       ops->version = "1.0.0";
+
+       ops->mount = cgfsng_mount;
+       ops->nrtasks = cgfsng_nrtasks;
+       ops->num_hierarchies = cgfsng_num_hierarchies;
+       ops->get_hierarchies = cgfsng_get_hierarchies;
+       ops->get_hierarchy = get_hierarchy;
+
+       return move_ptr(ops);
+}
diff --git a/cgroups/cgroup.c b/cgroups/cgroup.c
new file mode 100644
index 0000000..aebafbd
--- /dev/null
+++ b/cgroups/cgroup.c
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup.h"
+#include "cgroup2_devices.h"
+
+extern struct cgroup_ops *cgfsng_ops_init(void);
+
+/* Initialise the cgroup driver (currently always cgfsng). Returns the driver's
+ * ops structure, or NULL after logging an error if initialisation fails. The
+ * result is released with cgroup_exit().
+ */
+struct cgroup_ops *cgroup_init(void)
+{
+       struct cgroup_ops *cgroup_ops;
+
+       cgroup_ops = cgfsng_ops_init();
+       if (!cgroup_ops)
+               return log_error_errno(NULL, errno,
+                                      "Failed to initialize cgroup driver");
+
+       return cgroup_ops;
+}
+
+/* Tear down @ops: free the cgroup names, every hierarchy with all strings it
+ * owns (closing its directory fd if it is >= 0), the hierarchy array and
+ * finally @ops itself. A NULL @ops is ignored.
+ */
+void cgroup_exit(struct cgroup_ops *ops)
+{
+       if (!ops)
+               return;
+
+       free(ops->container_cgroup);
+       free(ops->monitor_cgroup);
+
+       for (struct hierarchy **it = ops->hierarchies; it && *it; it++) {
+               struct hierarchy *h = *it;
+
+               for (char **ctrl = h->controllers; ctrl && *ctrl; ctrl++)
+                       free(*ctrl);
+               free(h->controllers);
+               free(h->__controllers);
+
+               if (h->fd >= 0)
+                       close(h->fd);
+
+               free(h->mountpoint);
+               free(h->container_base_path);
+               free(h->container_full_path);
+               free(h->monitor_full_path);
+               free(h);
+       }
+       free(ops->hierarchies);
+
+       free(ops);
+}
+
+#define INIT_SCOPE "/init.scope"
+/* Strip a trailing "/init.scope" component from the cgroup path @cg in place.
+ * A path that consists of nothing but "/init.scope" becomes "/". Paths without
+ * that suffix, and a NULL @cg, are left untouched.
+ */
+void prune_init_scope(char *cg)
+{
+       size_t len, scope_len;
+       char *point;
+
+       if (!cg)
+               return;
+
+       len = strlen(cg);
+       scope_len = strlen(INIT_SCOPE);
+
+       /* Compare lengths before doing any pointer arithmetic: computing a
+        * pointer in front of @cg for short strings is undefined behaviour.
+        */
+       if (len < scope_len)
+               return;
+
+       point = cg + (len - scope_len);
+       if (strcmp(point, INIT_SCOPE) == 0) {
+               if (point == cg)
+                       *(point + 1) = '\0';
+               else
+                       *point = '\0';
+       }
+}
diff --git a/cgroups/cgroup.h b/cgroups/cgroup.h
new file mode 100644
index 0000000..8895533
--- /dev/null
+++ b/cgroups/cgroup.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef __LXC_CGROUP_H
+#define __LXC_CGROUP_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#include "macro.h"
+
+#define DEFAULT_CGROUP_MOUNTPOINT "/sys/fs/cgroup"
+
+/* Overall cgroup layout of the system; the meaning of each value is described
+ * on the cgroup_layout member of struct cgroup_ops.
+ */
+typedef enum {
+        CGROUP_LAYOUT_UNKNOWN = -1,
+        CGROUP_LAYOUT_LEGACY  =  0,
+        CGROUP_LAYOUT_HYBRID  =  1,
+        CGROUP_LAYOUT_UNIFIED =  2,
+} cgroup_layout_t;
+
+/* A descriptor for a mounted hierarchy
+ *
+ * @controllers
+ * - legacy hierarchy
+ *   Either NULL, or a null-terminated list of all the co-mounted controllers.
+ * - unified hierarchy
+ *   Either NULL, or a null-terminated list of all enabled controllers.
+ *
+ * @mountpoint
+ * - The mountpoint we will use.
+ * - legacy hierarchy
+ *   It will be either /sys/fs/cgroup/controller or
+ *   /sys/fs/cgroup/controllerlist.
+ * - unified hierarchy
+ *   It will either be /sys/fs/cgroup or /sys/fs/cgroup/<mountpoint-name>
+ *   depending on whether this is a hybrid cgroup layout (mix of legacy and
+ *   unified hierarchies) or a pure unified cgroup layout.
+ *
+ * @container_base_path
+ * - The cgroup under which the container cgroup path
+ *   is created. This will be either the caller's cgroup (if not root), or
+ *   init's cgroup (if root).
+ *
+ * @container_full_path
+ * - The full path to the containers cgroup.
+ *
+ * @monitor_full_path
+ * - The full path to the monitor's cgroup.
+ *
+ * @version
+ * - legacy hierarchy
+ *   If the hierarchy is a legacy hierarchy this will be set to
+ *   CGROUP_SUPER_MAGIC.
+ * - unified hierarchy
+ *   If the hierarchy is a unified hierarchy this will be set to
+ *   CGROUP2_SUPER_MAGIC.
+ */
+struct hierarchy {
+       /*
+        * NULL-terminated list of controller names for this hierarchy.
+        * NOTE(review): an earlier comment here talked about files to chown
+        * for cgroup2 delegation; no field in this struct matches that.
+        */
+       char **controllers;
+       /* Heap string handed over by cg_hybrid_init(); freed in cgroup_exit(). */
+       char *__controllers;
+       char *mountpoint;
+       char *container_base_path;
+       char *container_full_path;
+       char *monitor_full_path;
+       /* CGROUP_SUPER_MAGIC or CGROUP2_SUPER_MAGIC. */
+       int version;
+
+       /* cgroup2 only */
+       unsigned int bpf_device_controller:1;
+       /* Directory fd for the hierarchy; cgroup_exit() closes it if >= 0. */
+       int fd;
+};
+
+/* State and callbacks of a cgroup driver. Instances are created by
+ * cgroup_init() and destroyed by cgroup_exit().
+ */
+struct cgroup_ops {
+       /* string constant */
+       const char *driver;
+
+       /* string constant */
+       const char *version;
+
+       /* What controllers is the container supposed to use. */
+       char *container_cgroup;
+       char *monitor_cgroup;
+
+       /* @hierarchies
+        * - A NULL-terminated array of struct hierarchy, one per legacy
+        *   hierarchy. No duplicates. First sufficient, writeable mounted
+        *   hierarchy wins.
+        */
+       struct hierarchy **hierarchies;
+       /* Pointer to the unified hierarchy. Do not free! */
+       struct hierarchy *unified;
+
+       /*
+        * @cgroup_layout
+        * - What cgroup layout the container is running with.
+        *   - CGROUP_LAYOUT_UNKNOWN
+        *     The cgroup layout could not be determined. This should be treated
+        *     as an error condition.
+        *   - CGROUP_LAYOUT_LEGACY
+        *     The container is running with all controllers mounted into legacy
+        *     cgroup hierarchies.
+        *   - CGROUP_LAYOUT_HYBRID
+        *     The container is running with at least one controller mounted
+        *     into a legacy cgroup hierarchy and a mountpoint for the unified
+        *     hierarchy. The unified hierarchy can be empty (no controllers
+        *     enabled) or non-empty (controllers enabled).
+        *   - CGROUP_LAYOUT_UNIFIED
+        *     The container is running on a pure unified cgroup hierarchy. The
+        *     unified hierarchy can be empty (no controllers enabled) or
+        *     non-empty (controllers enabled).
+        */
+       cgroup_layout_t cgroup_layout;
+
+       /* Number of entries in @hierarchies. */
+       int (*num_hierarchies)(struct cgroup_ops *ops);
+       /* Fetch the controller list of hierarchy number @n into @out. */
+       bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out);
+       /* Mount the hierarchies below @root. */
+       bool (*mount)(struct cgroup_ops *ops, const char *root);
+       /* Count tasks in the container's cgroup. */
+       int (*nrtasks)(struct cgroup_ops *ops);
+       /* Find the hierarchy that provides @controller, or NULL. */
+       struct hierarchy *(*get_hierarchy)(struct cgroup_ops *ops,
+                                          const char *controller);
+};
+
+extern struct cgroup_ops *cgroup_init(void);
+extern void cgroup_exit(struct cgroup_ops *ops);
+
+extern void prune_init_scope(char *cg);
+
+/* Cleanup handler behind __do_cgroup_exit: calls cgroup_exit() on the pointed
+ * to ops structure unless it is NULL.
+ */
+static inline void __auto_cgroup_exit__(struct cgroup_ops **ops)
+{
+       if (*ops)
+               cgroup_exit(*ops);
+}
+
+extern int cgroup_attach(const char *name, const char *lxcpath, int64_t pid);
+
+#define __do_cgroup_exit __attribute__((__cleanup__(__auto_cgroup_exit__)))
+
+/* True when @ops was detected as a pure cgroup2 (unified) layout. */
+static inline bool pure_unified_layout(const struct cgroup_ops *ops)
+{
+       return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
+}
+
+/* True when hierarchy @h is a cgroup2 hierarchy, judged by its version. */
+static inline bool is_unified_hierarchy(const struct hierarchy *h)
+{
+       return h->version == CGROUP2_SUPER_MAGIC;
+}
+
+#endif
diff --git a/cgroups/cgroup2_devices.c b/cgroups/cgroup2_devices.c
new file mode 100644
index 0000000..92df160
--- /dev/null
+++ b/cgroups/cgroup2_devices.c
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/* Parts of this taken from systemd's implementation. */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup2_devices.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/*
+ * Append @count instructions taken from @instructions to the end of @prog.
+ *
+ * Returns 0 on success. Returns -1 with errno set to EBUSY when the program
+ * has already been loaded into the kernel (kernel_fd >= 0) and to ENOMEM when
+ * the instruction buffer cannot be grown. @prog is left unmodified on failure.
+ */
+static int bpf_program_add_instructions(struct bpf_program *prog,
+                                       const struct bpf_insn *instructions,
+                                       size_t count)
+{
+       struct bpf_insn *new_insn;
+       size_t total;
+
+       if (prog->kernel_fd >= 0)
+               return log_error_errno(-1, EBUSY, "Refusing to update bpf cgroup program that's already loaded");
+
+       /*
+        * Nothing to append. Returning early also avoids realloc(ptr, 0),
+        * which may free the buffer and hand back NULL.
+        */
+       if (count == 0)
+               return 0;
+
+       /* Guard both the addition and the byte-size multiplication. */
+       total = prog->n_instructions + count;
+       if (total < count || total > SIZE_MAX / sizeof(struct bpf_insn))
+               return log_error_errno(-1, ENOMEM, "Failed to reallocate bpf cgroup program");
+
+       new_insn = realloc(prog->instructions, sizeof(struct bpf_insn) * total);
+       if (!new_insn)
+               return log_error_errno(-1, ENOMEM, "Failed to reallocate bpf cgroup program");
+
+       prog->instructions = new_insn;
+       memcpy(prog->instructions + prog->n_instructions, instructions,
+              sizeof(struct bpf_insn) * count);
+       prog->n_instructions = total;
+
+       return 0;
+}
+
+/*
+ * Release @prog and everything it owns. The program is first detached from
+ * its cgroup (the result is ignored), an open kernel fd is closed, and the
+ * instruction buffer, attached path and the structure itself are freed.
+ * A NULL @prog is ignored.
+ */
+void bpf_program_free(struct bpf_program *prog)
+{
+       if (prog != NULL) {
+               (void)bpf_program_cgroup_detach(prog);
+
+               if (prog->kernel_fd >= 0)
+                       close(prog->kernel_fd);
+
+               free(prog->instructions);
+               free(prog->attached_path);
+               free(prog);
+       }
+}
+
+/*
+ * Helpers that build a single "struct bpf_insn" as a compound literal.
+ * Arguments are parenthesized so that arbitrary expressions can be passed.
+ */
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                               \
+       ((struct bpf_insn){.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+                          .dst_reg = (DST),                           \
+                          .src_reg = (SRC),                           \
+                          .off = (OFF),                               \
+                          .imm = 0})
+
+/* 32-bit ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+#define BPF_ALU32_IMM(OP, DST, IMM)                              \
+       ((struct bpf_insn){.code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+                          .dst_reg = (DST),                     \
+                          .src_reg = 0,                         \
+                          .off = 0,                             \
+                          .imm = (IMM)})
+
+/* 64-bit move of an immediate, dst_reg = imm32 */
+#define BPF_MOV64_IMM(DST, IMM)                                 \
+       ((struct bpf_insn){.code = BPF_ALU64 | BPF_MOV | BPF_K, \
+                          .dst_reg = (DST),                    \
+                          .src_reg = 0,                        \
+                          .off = 0,                            \
+                          .imm = (IMM)})
+
+/* 32-bit register move, dst_reg = src_reg */
+#define BPF_MOV32_REG(DST, SRC)                               \
+       ((struct bpf_insn){.code = BPF_ALU | BPF_MOV | BPF_X, \
+                          .dst_reg = (DST),                  \
+                          .src_reg = (SRC),                  \
+                          .off = 0,                          \
+                          .imm = 0})
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+#define BPF_JMP_REG(OP, DST, SRC, OFF)                           \
+       ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+                          .dst_reg = (DST),                     \
+                          .src_reg = (SRC),                     \
+                          .off = (OFF),                         \
+                          .imm = 0})
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+#define BPF_JMP_IMM(OP, DST, IMM, OFF)                           \
+       ((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+                          .dst_reg = (DST),                     \
+                          .src_reg = 0,                         \
+                          .off = (OFF),                         \
+                          .imm = (IMM)})
+
+/* Program exit */
+#define BPF_EXIT_INSN()                                \
+       ((struct bpf_insn){.code = BPF_JMP | BPF_EXIT, \
+                          .dst_reg = 0,               \
+                          .src_reg = 0,               \
+                          .off = 0,                   \
+                          .imm = 0})
+
+/*
+ * Translate an access string made of the letters 'r', 'w' and 'm' into the
+ * matching BPF_DEVCG_ACC_* bit mask. A NULL or empty string yields 0; any
+ * other character yields -EINVAL.
+ */
+static int bpf_access_mask(const char *acc)
+{
+       int bits = 0;
+       const char *cur;
+
+       if (acc == NULL)
+               return bits;
+
+       for (cur = acc; *cur != '\0'; cur++) {
+               if (*cur == 'r')
+                       bits |= BPF_DEVCG_ACC_READ;
+               else if (*cur == 'w')
+                       bits |= BPF_DEVCG_ACC_WRITE;
+               else if (*cur == 'm')
+                       bits |= BPF_DEVCG_ACC_MKNOD;
+               else
+                       return -EINVAL;
+       }
+
+       return bits;
+}
+
+/*
+ * Map a device type letter to its bpf value: 'a' gives 0, 'b' gives
+ * BPF_DEVCG_DEV_BLOCK and 'c' gives BPF_DEVCG_DEV_CHAR. Anything else
+ * gives -1.
+ */
+static int bpf_device_type(char type)
+{
+       if (type == 'a')
+               return 0;
+
+       if (type == 'b')
+               return BPF_DEVCG_DEV_BLOCK;
+
+       if (type == 'c')
+               return BPF_DEVCG_DEV_CHAR;
+
+       return -1;
+}
+
+/* True when @access_mask is exactly read, write and mknod combined. */
+static inline bool bpf_device_all_access(int access_mask)
+{
+       const int everything = BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE |
+                              BPF_DEVCG_ACC_MKNOD;
+
+       return access_mask == everything;
+}
+
+/*
+ * Allocate a zeroed bpf_program of type @prog_type that is not yet loaded
+ * (kernel_fd is set to -EBADF) and that defaults to whitelist semantics.
+ * Returns NULL when the allocation fails.
+ */
+struct bpf_program *bpf_program_new(uint32_t prog_type)
+{
+       __do_free struct bpf_program *fresh = NULL;
+
+       fresh = calloc(1, sizeof(struct bpf_program));
+       if (fresh == NULL)
+               return NULL;
+
+       fresh->kernel_fd = -EBADF;
+       fresh->prog_type = prog_type;
+
+       /* A whitelist is the default unless the user tells us otherwise. */
+       fresh->device_list_type = LXC_BPF_DEVICE_CGROUP_WHITELIST;
+
+       return move_ptr(fresh);
+}
+
+/*
+ * Append the common prologue to @prog. It unpacks the bpf_cgroup_dev_ctx the
+ * program receives in r1 into the registers the device rules compare against:
+ * r2 = low 16 bits of access_type, r3 = access_type >> 16, r4 = major and
+ * r5 = minor.
+ *
+ * Returns -1 with errno set to EINVAL when @prog is NULL, otherwise the
+ * result of appending the instructions.
+ */
+int bpf_program_init(struct bpf_program *prog)
+{
+       if (prog == NULL)
+               return ret_set_errno(-1, EINVAL);
+
+       const struct bpf_insn prologue[] = {
+           /* r2: device type, kept in the lower half of access_type */
+           BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+           BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+
+           /* r3: access type, kept in the upper half of access_type */
+           BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+           BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+           /* r4: major number */
+           BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, major)),
+
+           /* r5: minor number */
+           BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, minor)),
+       };
+
+       return bpf_program_add_instructions(prog, prologue, ARRAY_SIZE(prologue));
+}
+
+/*
+ * Append the instructions implementing one device rule to @prog.
+ *
+ * A local rule becomes a chain of conditional jumps comparing the registers
+ * set up by bpf_program_init() (r2 type, r3 access, r4 major, r5 minor)
+ * followed by "r0 = device->allow; exit". Checks for wildcards (type 'a',
+ * full access, negative major/minor) are omitted. Every check jumps right
+ * behind the rule's exit instruction when it does not match.
+ *
+ * A global rule (global_rule > LXC_BPF_DEVICE_CGROUP_LOCAL_RULE) appends
+ * nothing and only records the list type in @prog.
+ *
+ * Returns 0 on success, -1 with errno set on failure (EINVAL for NULL
+ * arguments, an unknown device type or an invalid access string).
+ */
+int bpf_program_append_device(struct bpf_program *prog, struct device_item *device)
+{
+       int ret;
+       /*
+        * Jump offset a single-instruction check has to use to land behind
+        * this rule. Starts at 1 for the "mov" of the decision pair and grows
+        * with every check that will be emitted.
+        */
+       int jump_nr = 1;
+       int access_mask;
+       int device_type;
+
+       /* Validate before touching @device; it is dereferenced below. */
+       if (!prog || !device)
+               return ret_set_errno(-1, EINVAL);
+
+       /* This is a global rule so no need to append anything. */
+       if (device->global_rule > LXC_BPF_DEVICE_CGROUP_LOCAL_RULE) {
+               prog->device_list_type = device->global_rule;
+               return 0;
+       }
+
+       device_type = bpf_device_type(device->type);
+       if (device_type < 0)
+               return log_error_errno(-1, EINVAL, "Invalid bpf cgroup device type %c", device->type);
+
+       access_mask = bpf_access_mask(device->access);
+       if (access_mask < 0)
+               return log_error_errno(-1, EINVAL, "Invalid bpf cgroup device access mask");
+
+       if (device_type > 0)
+               jump_nr++;
+
+       if (!bpf_device_all_access(access_mask))
+               jump_nr += 3;
+
+       /* Count exactly the checks that are emitted below. */
+       if (device->major >= 0)
+               jump_nr++;
+
+       if (device->minor >= 0)
+               jump_nr++;
+
+       if (device_type > 0) {
+               struct bpf_insn ins[] = {
+                   BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--),
+               };
+
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       if (!bpf_device_all_access(access_mask)) {
+               /*
+                * The jump is the last of three instructions, so two of the
+                * instructions accounted for in jump_nr already precede it.
+                */
+               struct bpf_insn ins[] = {
+                   BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+                   BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask),
+                   BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr - 2),
+               };
+
+               jump_nr -= 3;
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       if (device->major >= 0) {
+               struct bpf_insn ins[] = {
+                   BPF_JMP_IMM(BPF_JNE, BPF_REG_4, device->major, jump_nr--),
+               };
+
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       if (device->minor >= 0) {
+               struct bpf_insn ins[] = {
+                   BPF_JMP_IMM(BPF_JNE, BPF_REG_5, device->minor, jump_nr--),
+               };
+
+               ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+               if (ret)
+                       return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       {
+               /* All checks matched: return the rule's verdict. */
+               const struct bpf_insn bpf_access_decision[] = {
+                   BPF_MOV64_IMM(BPF_REG_0, device->allow),
+                   BPF_EXIT_INSN(),
+               };
+
+               ret = bpf_program_add_instructions(prog, bpf_access_decision,
+                                                  ARRAY_SIZE(bpf_access_decision));
+               if (ret)
+                       return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+       }
+
+       return 0;
+}
+
+/*
+ * Terminate @prog with the fallback verdict used when no device rule
+ * matched: "r0 = prog->device_list_type; exit".
+ *
+ * Returns -1 with errno set to EINVAL when @prog is NULL, otherwise the
+ * result of appending the two instructions.
+ */
+int bpf_program_finalize(struct bpf_program *prog)
+{
+       /* Check first: the instruction template below dereferences @prog. */
+       if (!prog)
+               return ret_set_errno(-1, EINVAL);
+
+       const struct bpf_insn ins[] = {
+           BPF_MOV64_IMM(BPF_REG_0, prog->device_list_type),
+           BPF_EXIT_INSN(),
+       };
+
+       TRACE("Implementing %s bpf device cgroup program",
+             prog->device_list_type == LXC_BPF_DEVICE_CGROUP_BLACKLIST
+                 ? "blacklist"
+                 : "whitelist");
+       return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+}
+
+/*
+ * Load @prog into the kernel with BPF_PROG_LOAD and store the resulting fd
+ * in prog->kernel_fd. If the program is already loaded nothing is loaded
+ * again and a supplied log buffer is cleared.
+ *
+ * @log_buf may be NULL (with @log_size 0); verifier logging is then off.
+ *
+ * Returns 0 on success, -1 with errno set when the bpf() call fails.
+ */
+static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf,
+                                  size_t log_size)
+{
+       union bpf_attr attr;
+
+       if (prog->kernel_fd >= 0) {
+               /* Callers may pass no buffer at all; never memset() NULL. */
+               if (log_buf && log_size > 0)
+                       memset(log_buf, 0, log_size);
+               return 0;
+       }
+
+       attr = (union bpf_attr){
+           .prog_type  = prog->prog_type,
+           .insns      = PTR_TO_UINT64(prog->instructions),
+           .insn_cnt   = prog->n_instructions,
+           .license    = PTR_TO_UINT64("GPL"),
+           .log_buf    = PTR_TO_UINT64(log_buf),
+           .log_level  = !!log_buf,
+           .log_size   = log_size,
+       };
+
+       prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+       if (prog->kernel_fd < 0)
+               return log_error_errno(-1, errno, "Failed to load bpf program");
+
+       return 0;
+}
+
+/*
+ * Load @prog (if necessary) and attach it to the cgroup directory @path
+ * using attach type @type and the BPF_F_ALLOW_* flags in @flags.
+ *
+ * If the program is already attached, @type and @flags must match the
+ * recorded attachment (EBUSY otherwise); unless @flags is exactly
+ * BPF_F_ALLOW_OVERRIDE nothing further is done. On a successful attach the
+ * path, type and flags are recorded in @prog.
+ *
+ * Returns 0 on success and -1 with errno set on failure.
+ */
+int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+                             const char *path, uint32_t flags)
+{
+       __do_free char *copy = NULL;
+       __do_close_prot_errno int fd = -EBADF;
+       union bpf_attr attr;
+       int ret;
+
+       if (!prog || !path)
+               return ret_set_errno(-1, EINVAL);
+
+       /* Reject every bit outside the two supported flags. */
+       if (flags & ~(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI))
+               return log_error_errno(-1, EINVAL, "Invalid flags for bpf program");
+
+       if (prog->attached_path) {
+               if (prog->attached_type != type)
+                       return log_error_errno(-1, EBUSY, "Wrong type for bpf program");
+
+               if (prog->attached_flags != flags)
+                       return log_error_errno(-1, EBUSY, "Wrong flags for bpf program");
+
+               /* Already attached: report success, not a truthy int. */
+               if (flags != BPF_F_ALLOW_OVERRIDE)
+                       return 0;
+       }
+
+       ret = bpf_program_load_kernel(prog, NULL, 0);
+       if (ret < 0)
+               return log_error_errno(-1, errno, "Failed to load bpf program");
+
+       copy = strdup(path);
+       if (!copy)
+               return log_error_errno(-1, ENOMEM, "Failed to duplicate cgroup path %s", path);
+
+       fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+       if (fd < 0)
+               return log_error_errno(-1, errno, "Failed to open cgroup path %s", path);
+
+       attr = (union bpf_attr){
+           .attach_type        = type,
+           .target_fd          = fd,
+           .attach_bpf_fd      = prog->kernel_fd,
+           .attach_flags       = flags,
+       };
+
+       ret = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
+       if (ret < 0)
+               return log_error_errno(-1, errno, "Failed to attach bpf program");
+
+       free_replace_move_ptr(prog->attached_path, copy);
+       prog->attached_type = type;
+       prog->attached_flags = flags;
+
+       TRACE("Loaded and attached bpf program to cgroup %s", prog->attached_path);
+       return 0;
+}
+
+/*
+ * Detach @prog from the cgroup it was attached to and forget the recorded
+ * path. A NULL or unattached program is a no-op. A cgroup directory that no
+ * longer exists (ENOENT) is not treated as an error.
+ *
+ * Returns 0 on success, -1 with errno set when the cgroup cannot be opened
+ * for another reason or the BPF_PROG_DETACH call fails.
+ */
+int bpf_program_cgroup_detach(struct bpf_program *prog)
+{
+       int ret;
+       __do_close_prot_errno int fd = -EBADF;
+
+       if (prog == NULL || prog->attached_path == NULL)
+               return 0;
+
+       fd = open(prog->attached_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+       if (fd >= 0) {
+               union bpf_attr attr;
+
+               attr = (union bpf_attr){
+                   .attach_type        = prog->attached_type,
+                   .target_fd          = fd,
+                   .attach_bpf_fd      = prog->kernel_fd,
+               };
+
+               ret = bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+               if (ret < 0)
+                       return log_error_errno(-1, errno, "Failed to detach bpf program from cgroup %s",
+                                              prog->attached_path);
+       } else if (errno != ENOENT) {
+               return log_error_errno(-1, errno, "Failed to open attach cgroup %s",
+                                      prog->attached_path);
+       }
+
+       free(prog->attached_path);
+       prog->attached_path = NULL;
+
+       return 0;
+}
+
+/*
+ * Probe whether the running kernel accepts BPF_PROG_TYPE_CGROUP_DEVICE
+ * programs by loading a minimal "r0 = 1; exit" program. Requires an
+ * effective uid of 0. The probe program is released on return.
+ */
+bool bpf_devices_cgroup_supported(void)
+{
+       const struct bpf_insn dummy[] = {
+           BPF_MOV64_IMM(BPF_REG_0, 1),
+           BPF_EXIT_INSN(),
+       };
+
+       __do_bpf_program_free struct bpf_program *prog = NULL;
+       int ret;
+
+       if (geteuid() != 0)
+               return log_trace(false,
+                                "The bpf device cgroup requires real root");
+
+       prog = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
+       /* bpf_program_new() signals failure with NULL, not a negative value. */
+       if (!prog)
+               return log_trace(false, "Failed to allocate new bpf device cgroup program");
+
+       ret = bpf_program_add_instructions(prog, dummy, ARRAY_SIZE(dummy));
+       if (ret < 0)
+               return log_trace(false, "Failed to add new instructions to bpf device cgroup program");
+
+       ret = bpf_program_load_kernel(prog, NULL, 0);
+       if (ret < 0)
+               return log_trace(false, "Failed to load new bpf device cgroup program");
+
+       return log_trace(true, "The bpf device cgroup is supported");
+}
+#endif
diff --git a/cgroups/cgroup2_devices.h b/cgroups/cgroup2_devices.h
new file mode 100644
index 0000000..4fee779
--- /dev/null
+++ b/cgroups/cgroup2_devices.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/* Parts of this taken from systemd's implementation. */
+
+#ifndef __LXC_CGROUP2_DEVICES_H
+#define __LXC_CGROUP2_DEVICES_H
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#endif
+
+#if !HAVE_BPF
+#if !(defined __NR_bpf && __NR_bpf > 0)
+#if defined __NR_bpf
+#undef __NR_bpf
+#endif
+#if defined __i386__
+#define __NR_bpf 357
+#elif defined __x86_64__
+#define __NR_bpf 321
+#elif defined __aarch64__
+#define __NR_bpf 280
+#elif defined __arm__
+#define __NR_bpf 386
+#elif defined __sparc__
+#define __NR_bpf 349
+#elif defined __s390__
+#define __NR_bpf 351
+#elif defined __tilegx__
+#define __NR_bpf 280
+#else
+#warning "__NR_bpf not defined for your architecture"
+#endif
+#endif
+
+union bpf_attr;
+
+/*
+ * Fallback wrapper for the bpf() system call. Issues the raw syscall when
+ * __NR_bpf is known for this architecture, otherwise fails with ENOSYS.
+ */
+static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size)
+{
+#ifndef __NR_bpf
+       errno = ENOSYS;
+       return -1;
+#else
+       return (int)syscall(__NR_bpf, cmd, attr, size);
+#endif
+}
+
+#define bpf missing_bpf
+#endif
+
+/* State of one bpf device cgroup program while it is built, loaded and attached. */
+struct bpf_program {
+       /* Verdict emitted when no rule matches; whitelist by default. */
+       int device_list_type;
+       /* Fd returned by BPF_PROG_LOAD; negative while not loaded. */
+       int kernel_fd;
+       /* Program type passed to BPF_PROG_LOAD. */
+       uint32_t prog_type;
+
+       /* Number of entries in @instructions. */
+       size_t n_instructions;
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+       /* Heap-allocated instruction array, grown as rules are appended. */
+       struct bpf_insn *instructions;
+#endif
+
+       /* Heap copy of the cgroup path the program is attached to, or NULL. */
+       char *attached_path;
+       /* Attach type and flags recorded by the last successful attach. */
+       int attached_type;
+       uint32_t attached_flags;
+};
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+/*
+ * bpf device cgroup API, implemented in cgroup2_devices.c. Functions returning
+ * int report failure as -1 with errno set.
+ */
+/* Allocate an empty, not yet loaded program; NULL on allocation failure. */
+struct bpf_program *bpf_program_new(uint32_t prog_type);
+/* Append the prologue that loads type/access/major/minor into r2-r5. */
+int bpf_program_init(struct bpf_program *prog);
+/* Append the instructions for one device rule. */
+int bpf_program_append_device(struct bpf_program *prog,
+                             struct device_item *device);
+/* Append the fallback verdict that ends the program. */
+int bpf_program_finalize(struct bpf_program *prog);
+/* Load the program if needed and attach it to the cgroup at @path. */
+int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+                             const char *path, uint32_t flags);
+/* Detach the program from the cgroup it is attached to, if any. */
+int bpf_program_cgroup_detach(struct bpf_program *prog);
+/* Detach, close and free the program; NULL is ignored. */
+void bpf_program_free(struct bpf_program *prog);
+/* Probe whether the kernel accepts bpf device cgroup programs. */
+bool bpf_devices_cgroup_supported(void);
+/*
+ * Cleanup handler used by __do_bpf_program_free: frees the program stored in
+ * *@prog, if any, and resets the variable to NULL.
+ */
+static inline void __auto_bpf_program_free__(struct bpf_program **prog)
+{
+       if (*prog == NULL)
+               return;
+
+       bpf_program_free(*prog);
+       *prog = NULL;
+}
+#else
+/* Stub for builds without bpf device cgroup support: always ENOSYS/NULL. */
+static inline struct bpf_program *bpf_program_new(uint32_t prog_type)
+{
+       (void)prog_type;
+
+       errno = ENOSYS;
+       return NULL;
+}
+
+/* Stub for builds without bpf device cgroup support: always fails, ENOSYS. */
+static inline int bpf_program_init(struct bpf_program *prog)
+{
+       (void)prog;
+
+       errno = ENOSYS;
+       return -1;
+}
+
+struct device_item;
+
+/*
+ * Stub for builds without bpf device cgroup support: always fails with
+ * ENOSYS. Takes the same arguments as the real implementation so that
+ * callers compile unchanged in both configurations.
+ */
+static inline int bpf_program_append_device(struct bpf_program *prog,
+                                           struct device_item *device)
+{
+       (void)prog;
+       (void)device;
+
+       errno = ENOSYS;
+       return -1;
+}
+
+/* Stub for builds without bpf device cgroup support: always fails, ENOSYS. */
+static inline int bpf_program_finalize(struct bpf_program *prog)
+{
+       (void)prog;
+
+       errno = ENOSYS;
+       return -1;
+}
+
+/* Stub for builds without bpf device cgroup support: always fails, ENOSYS. */
+static inline int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+                                           const char *path, uint32_t flags)
+{
+       (void)prog;
+       (void)type;
+       (void)path;
+       (void)flags;
+
+       errno = ENOSYS;
+       return -1;
+}
+
+/* Stub for builds without bpf device cgroup support: always fails, ENOSYS. */
+static inline int bpf_program_cgroup_detach(struct bpf_program *prog)
+{
+       (void)prog;
+
+       errno = ENOSYS;
+       return -1;
+}
+
+/* Stub for builds without bpf device cgroup support: nothing to release. */
+static inline void bpf_program_free(struct bpf_program *prog)
+{
+       (void)prog;
+}
+
+
+/* Stub for builds without bpf device cgroup support: never supported. */
+static inline bool bpf_devices_cgroup_supported(void)
+{
+       const bool supported = false;
+
+       return supported;
+}
+
+/* Stub cleanup handler: without bpf support there is never anything to free. */
+static inline void __auto_bpf_program_free__(struct bpf_program **prog)
+{
+       (void)prog;
+}
+
+#endif
+
+/*
+ * Variable attribute: a "struct bpf_program *" declared with it is passed to
+ * __auto_bpf_program_free__() automatically when it goes out of scope.
+ */
+#define __do_bpf_program_free \
+       __attribute__((__cleanup__(__auto_bpf_program_free__)))
+
+#endif /* __LXC_CGROUP2_DEVICES_H */
diff --git a/cgroups/cgroup_utils.c b/cgroups/cgroup_utils.c
new file mode 100644
index 0000000..26e7438
--- /dev/null
+++ b/cgroups/cgroup_utils.c
@@ -0,0 +1,726 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+
+#include "cgroup.h"
+#include "cgroup_utils.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+/*
+ * Classify a mountinfo-style @line: CGROUP_SUPER_MAGIC for a cgroup (v1)
+ * entry, CGROUP2_SUPER_MAGIC for a cgroup2 entry and 0 for anything else.
+ */
+int get_cgroup_version(char *line)
+{
+       int version = 0;
+
+       if (is_cgroupfs_v1(line))
+               version = CGROUP_SUPER_MAGIC;
+       else if (is_cgroupfs_v2(line))
+               version = CGROUP2_SUPER_MAGIC;
+
+       return version;
+}
+
+/*
+ * Check whether the first " - " separator in @line is followed by the
+ * filesystem type "cgroup".
+ */
+bool is_cgroupfs_v1(char *line)
+{
+       static const char marker[] = " - cgroup ";
+       const char *sep = strstr(line, " - ");
+
+       if (sep == NULL)
+               return false;
+
+       return strncmp(sep, marker, sizeof(marker) - 1) == 0;
+}
+
+/*
+ * Check whether the first " - " separator in @line is followed by the
+ * filesystem type "cgroup2".
+ */
+bool is_cgroupfs_v2(char *line)
+{
+       static const char marker[] = " - cgroup2 ";
+       const char *sep = strstr(line, " - ");
+
+       if (sep == NULL)
+               return false;
+
+       return strncmp(sep, marker, sizeof(marker) - 1) == 0;
+}
+
+/*
+ * Inspect the filesystem mounted at DEFAULT_CGROUP_MOUNTPOINT. Returns
+ * CGROUP2_SUPER_MAGIC when it is a cgroup2 filesystem, 0 when it is something
+ * else and -ENOMEDIUM when statfs() fails.
+ */
+int unified_cgroup_hierarchy(void)
+{
+       struct statfs fs;
+
+       if (statfs(DEFAULT_CGROUP_MOUNTPOINT, &fs) < 0)
+               return -ENOMEDIUM;
+
+       return is_fs_type(&fs, CGROUP2_SUPER_MAGIC) ? CGROUP2_SUPER_MAGIC : 0;
+}
+
+/*
+ * realloc() that never reports failure: the call is repeated until it
+ * returns a non-NULL pointer.
+ */
+void *must_realloc(void *orig, size_t sz)
+{
+       void *grown = realloc(orig, sz);
+
+       while (grown == NULL)
+               grown = realloc(orig, sz);
+
+       return grown;
+}
+
+/*
+ * Build a path from @first and the following NULL-terminated list of string
+ * arguments. A '/' is inserted before every further component that does not
+ * already start with one. The result is heap-allocated and owned by the
+ * caller.
+ */
+char *must_make_path(const char *first, ...)
+{
+       va_list ap;
+       char *part;
+       size_t used = strlen(first);
+       size_t needed = used;
+       char *path = must_copy_string(first);
+
+       va_start(ap, first);
+       for (part = va_arg(ap, char *); part != NULL; part = va_arg(ap, char *)) {
+               const size_t part_len = strlen(part);
+               const bool add_slash = (part[0] != '/');
+
+               needed += part_len;
+               if (add_slash)
+                       needed++;
+
+               path = must_realloc(path, needed + 1);
+
+               if (add_slash)
+                       path[used++] = '/';
+
+               memcpy(path + used, part, part_len);
+               used += part_len;
+       }
+       va_end(ap);
+
+       path[used] = '\0';
+       return path;
+}
+
+/* Compare the filesystem type recorded in @fs against @magic_val. */
+bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val)
+{
+       if (fs->f_type == (fs_type_magic)magic_val)
+               return true;
+
+       return false;
+}
+
+/*
+ * strdup() that never reports failure: the call is repeated until it
+ * succeeds. A NULL @entry yields NULL.
+ */
+char *must_copy_string(const char *entry)
+{
+       char *dup = NULL;
+
+       if (entry == NULL)
+               return NULL;
+
+       while (dup == NULL)
+               dup = strdup(entry);
+
+       return dup;
+}
+
+/*
+ * Join the NULL-terminated string array @parts with @sep between the
+ * elements. With @use_as_prefix the result additionally starts with @sep.
+ * Returns a heap-allocated string, or NULL when the allocation fails.
+ */
+char *lxc_string_join(const char *sep, const char **parts, bool use_as_prefix)
+{
+       const size_t sep_len = strlen(sep);
+       size_t total = use_as_prefix ? sep_len : 0;
+       size_t cap;
+       size_t idx;
+       char *joined;
+
+       /* First pass: work out how long the result will be. */
+       for (idx = 0; parts[idx] != NULL; idx++) {
+               if (idx > 0)
+                       total += sep_len;
+
+               total += strlen(parts[idx]);
+       }
+
+       cap = total + 1;
+       joined = calloc(cap, 1);
+       if (joined == NULL)
+               return NULL;
+
+       if (use_as_prefix)
+               (void)strlcpy(joined, sep, cap);
+
+       /* Second pass: concatenate. */
+       for (idx = 0; parts[idx] != NULL; idx++) {
+               if (idx > 0)
+                       (void)strlcat(joined, sep, cap);
+
+               (void)strlcat(joined, parts[idx], cap);
+       }
+
+       return joined;
+}
+
+/*
+ * Count the lines getline() can read from file @fn. Returns -1 when the file
+ * cannot be opened.
+ */
+int lxc_count_file_lines(const char *fn)
+{
+       char *buf = NULL;
+       size_t cap = 0;
+       int count = 0;
+       FILE *fp = fopen_cloexec(fn, "r");
+
+       if (fp == NULL)
+               return -1;
+
+       for (;;) {
+               if (getline(&buf, &cap, fp) == -1)
+                       break;
+
+               count++;
+       }
+
+       free(buf);
+       fclose(fp);
+       return count;
+}
+
+/* Tell whether @path can be stat()ed and refers to a directory. */
+bool dir_exists(const char *path)
+{
+       struct stat info;
+
+       /* Any stat() failure, not only ENOENT, is simply reported as "no". */
+       if (stat(path, &info) < 0)
+               return false;
+
+       return S_ISDIR(info.st_mode);
+}
+
+/*
+ * Advance to the next segment of a tokenized path.
+ *
+ * @path:    a pathname in which every '/' has been replaced with '\0'.
+ * @offsetp: offset of the segment seen last; updated to the offset of the
+ *           next segment unless it already was at or past @fulllen.
+ * @fulllen: length of the original path.
+ *
+ * Returns a pointer to the next path segment, or NULL if there is none.
+ */
+static char *get_nextpath(char *path, int *offsetp, int fulllen)
+{
+       int pos = *offsetp;
+
+       if (pos >= fulllen)
+               return NULL;
+
+       /* Skip what is left of the current segment ... */
+       for (; pos < fulllen && path[pos] != '\0'; pos++)
+               ;
+
+       /* ... and then the separator(s) following it. */
+       for (; pos < fulllen && path[pos] == '\0'; pos++)
+               ;
+
+       *offsetp = pos;
+
+       if (pos >= fulllen)
+               return NULL;
+
+       return path + pos;
+}
+
+/*
+ * Check that @subdir is @dir itself or lies below it. @len is the length of
+ * @dir (passed in to avoid recalculating it).
+ *
+ * A prefix match only counts when it ends on a path component boundary, so
+ * "/ab" is not considered to be below "/a". An empty @dir (@len == 0) is
+ * handled without reading in front of the string: only an empty or absolute
+ * @subdir matches it.
+ */
+static bool is_subdir(const char *subdir, const char *dir, size_t len)
+{
+       size_t subdirlen = strlen(subdir);
+
+       if (subdirlen < len)
+               return false;
+
+       if (strncmp(subdir, dir, len) != 0)
+               return false;
+
+       /* @dir ends in '/', so the prefix already stops at a boundary. */
+       if (len > 0 && dir[len - 1] == '/')
+               return true;
+
+       if (subdir[len] == '/' || subdirlen == len)
+               return true;
+
+       return false;
+}
+
+/*
+ * Check whether the open @fd refers to a symlink. Returns -ELOOP if it does,
+ * -ENOENT if fstat() fails and 0 if the fd is fine.
+ */
+static int check_symlink(int fd)
+{
+       struct stat info;
+
+       if (fstat(fd, &info) < 0)
+               return -ENOENT;
+
+       return S_ISLNK(info.st_mode) ? -ELOOP : 0;
+}
+
+/*
+ * Open @nextpath relative to @dirfd, refusing to follow a symlink in the
+ * final component.
+ *
+ * Returns the new fd on success. On failure returns the negative result of
+ * openat() (errno set) or the negative value reported by check_symlink().
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+static int open_if_safe(int dirfd, const char *nextpath)
+{
+       int verdict;
+       int fd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
+
+       /* Either it opened (so it was no symlink) or it is a symlink. */
+       if (fd >= 0 || errno == ELOOP)
+               return fd;
+
+       if (errno != EPERM && errno != EACCES)
+               return fd;
+
+       /* We lack the permission to read it, so retry with O_PATH. */
+       fd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
+       if (fd < 0)
+               return fd;
+
+       /*
+        * O_PATH returns an fd even for symlinks. The first openat() did not
+        * report a symlink, so if the fd is one now something fishy is going
+        * on.
+        */
+       verdict = check_symlink(fd);
+       if (verdict < 0) {
+               close(fd);
+               return verdict;
+       }
+
+       return fd;
+}
+
+/*
+ * Open a path intended for mounting, walking it one component at a time so
+ * that no symlink below @prefix_skip is followed.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @target: path to be opened
+ * @prefix_skip: leading part of @target in which symbolic links are ignored;
+ * it is opened in one go. This would be the container's rootfs. NULL or ""
+ * means "/".
+ *
+ * Returns an open fd for the path, or <0 on error: -EINVAL when @target is
+ * not below @prefix_skip, -ENOMEM when the working copy cannot be allocated,
+ * otherwise the negative value from open() or open_if_safe() with errno
+ * describing the failed open.
+ */
+static int open_without_symlink(const char *target, const char *prefix_skip)
+{
+       int curlen = 0, dirfd, fulllen, i;
+       char *dup;
+
+       fulllen = strlen(target);
+
+       /* Make sure prefix_skip makes sense. */
+       if (prefix_skip && strlen(prefix_skip) > 0) {
+               curlen = strlen(prefix_skip);
+               if (!is_subdir(target, prefix_skip, curlen))
+                       return -EINVAL;
+
+               /*
+                * get_nextpath() expects the offset to sit on a '/' (turned
+                * into '\0') or before it, so decrement curlen to make sure
+                * that happens.
+                */
+               if (curlen)
+                       curlen--;
+       } else {
+               prefix_skip = "/";
+               curlen = 0;
+       }
+
+       /* Make a copy of target which we can hack up, and tokenize it. */
+       if ((dup = strdup(target)) == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < fulllen; i++) {
+               if (dup[i] == '/')
+                       dup[i] = '\0';
+       }
+
+       /* The prefix itself is trusted and opened without symlink checks. */
+       dirfd = open(prefix_skip, O_RDONLY);
+       if (dirfd < 0)
+               goto out;
+
+       for (;;) {
+               int newfd, saved_errno;
+               char *nextpath;
+
+               /* No component left: dirfd now refers to @target. */
+               if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
+                       goto out;
+
+               newfd = open_if_safe(dirfd, nextpath);
+               /* close() below must not clobber the errno of a failed open. */
+               saved_errno = errno;
+               close(dirfd);
+
+               dirfd = newfd;
+               if (newfd < 0) {
+                       errno = saved_errno;
+                       goto out;
+               }
+       }
+
+out:
+       free(dup);
+       return dirfd;
+}
+
+/*
+ * Safely mount a path into a container, ensuring that the mount target
+ * is under the container's @rootfs. (If @rootfs is NULL, then the container
+ * uses the host's /)
+ *
+ * The target (and, for bind mounts with a relative source, the source) is
+ * opened without following symlinks and the mount is performed through the
+ * matching /proc/self/fd/<fd> paths.
+ *
+ * Returns 0 on success and a negative value on failure.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+int safe_mount(const char *src, const char *dest, const char *fstype,
+               unsigned long flags, const void *data, const char *rootfs)
+{
+       /* Only needs enough for /proc/self/fd/<fd>. */
+       char srcbuf[50], destbuf[50];
+       const char *mntsrc = src;
+       int srcfd = -1;
+       int destfd, len, rc, saved_errno;
+
+       if (rootfs == NULL)
+               rootfs = "";
+
+       /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
+       if ((flags & MS_BIND) && src != NULL && src[0] != '/') {
+               srcfd = open_without_symlink(src, NULL);
+               if (srcfd < 0)
+                       return srcfd;
+
+               len = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
+               if (len < 0 || len >= (int)sizeof(srcbuf)) {
+                       close(srcfd);
+                       return -EINVAL;
+               }
+
+               mntsrc = srcbuf;
+       }
+
+       destfd = open_without_symlink(dest, rootfs);
+       if (destfd < 0) {
+               if (srcfd != -1) {
+                       /* Keep the errno of the failed open for the caller. */
+                       saved_errno = errno;
+                       close(srcfd);
+                       errno = saved_errno;
+               }
+
+               return destfd;
+       }
+
+       len = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
+       if (len < 0 || len >= (int)sizeof(destbuf)) {
+               if (srcfd != -1)
+                       close(srcfd);
+
+               close(destfd);
+               return -EINVAL;
+       }
+
+       rc = mount(mntsrc, destbuf, fstype, flags, data);
+       saved_errno = errno;
+
+       if (srcfd != -1)
+               close(srcfd);
+
+       close(destfd);
+
+       if (rc >= 0)
+               return 0;
+
+       errno = saved_errno;
+       return rc;
+}
+
+#ifndef HAVE_STRLCPY
+/*
+ * Copy @src into the @size byte buffer @dest, truncating if necessary and
+ * always NUL-terminating when @size is non-zero. Returns strlen(@src).
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+       const size_t src_len = strlen(src);
+
+       if (size != 0) {
+               size_t ncopy = src_len;
+
+               if (ncopy >= size)
+                       ncopy = size - 1;
+
+               memcpy(dest, src, ncopy);
+               dest[ncopy] = '\0';
+       }
+
+       return src_len;
+}
+#endif
+
+#ifndef HAVE_STRLCAT
+/*
+ * Append @s to the string in the @n byte buffer @d, truncating if necessary.
+ * Returns the length the combined string would have had without truncation;
+ * if @d holds no NUL within @n bytes nothing is written.
+ */
+size_t strlcat(char *d, const char *s, size_t n)
+{
+       const size_t used = strnlen(d, n);
+
+       if (used != n)
+               return used + strlcpy(d + used, s, n - used);
+
+       return used + strlen(s);
+}
+#endif
+
+/* fopen() replacement that opens @path with O_CLOEXEC set.
+ *
+ * The leading "r", "r+", "w", "w+", "a" or "a+" of @mode is translated into
+ * the corresponding open(2) flags, and an 'x' anywhere after that prefix adds
+ * O_EXCL. Files that get created use permission bits 0660. The stream itself
+ * is produced by fdopen() with the unmodified @mode.
+ *
+ * Returns the stream, or NULL if open() or fdopen() fails; errno is left as
+ * set by the failing call.
+ */
+FILE *fopen_cloexec(const char *path, const char *mode)
+{
+       int open_mode = 0;
+       int step = 0;
+       int fd;
+       int saved_errno = 0;
+       FILE *ret;
+
+       /* Two-character prefixes are tested before their one-character forms. */
+       if (!strncmp(mode, "r+", 2)) {
+               open_mode = O_RDWR;
+               step = 2;
+       } else if (!strncmp(mode, "r", 1)) {
+               open_mode = O_RDONLY;
+               step = 1;
+       } else if (!strncmp(mode, "w+", 2)) {
+               open_mode = O_RDWR | O_TRUNC | O_CREAT;
+               step = 2;
+       } else if (!strncmp(mode, "w", 1)) {
+               open_mode = O_WRONLY | O_TRUNC | O_CREAT;
+               step = 1;
+       } else if (!strncmp(mode, "a+", 2)) {
+               open_mode = O_RDWR | O_CREAT | O_APPEND;
+               step = 2;
+       } else if (!strncmp(mode, "a", 1)) {
+               open_mode = O_WRONLY | O_CREAT | O_APPEND;
+               step = 1;
+       }
+       /* Scan whatever follows the prefix for the exclusive-create flag. */
+       for (; mode[step]; step++)
+               if (mode[step] == 'x')
+                       open_mode |= O_EXCL;
+       open_mode |= O_CLOEXEC;
+
+       fd = open(path, open_mode, 0660);
+       if (fd < 0)
+               return NULL;
+
+       ret = fdopen(fd, mode);
+       /* Keep fdopen()'s errno: the close() below may overwrite it. */
+       saved_errno = errno;
+       if (!ret)
+               close(fd);
+       errno = saved_errno;
+       return ret;
+}
+
+/* Return a NUL-terminated copy, obtained from must_realloc(), of the text
+ * from @p up to (but excluding) the next '\n'. Returns NULL when no newline
+ * follows @p.
+ */
+static char *copy_to_eol(char *p)
+{
+       char *nl, *copy;
+       size_t linelen;
+
+       nl = strchr(p, '\n');
+       if (!nl)
+               return NULL;
+
+       linelen = nl - p;
+       copy = must_realloc(NULL, linelen + 1);
+       memcpy(copy, p, linelen);
+       copy[linelen] = '\0';
+       return copy;
+}
+
+/* Grow *@mem in BATCH_SIZE steps: reallocate only when *@mem is still NULL or
+ * when @newlen falls into a later batch than @oldlen.
+ */
+static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
+{
+       int batches_needed = (newlen / BATCH_SIZE) + 1;
+       int batches_have = (oldlen / BATCH_SIZE) + 1;
+
+       /* Existing buffer already covers the batch @newlen lands in. */
+       if (*mem && batches_needed <= batches_have)
+               return;
+
+       *mem = must_realloc(*mem, batches_needed * BATCH_SIZE);
+}
+
+/* Append @new to the buffer *@dest, which currently holds @oldlen bytes of
+ * text, growing the buffer through batch_realloc() when necessary.
+ */
+void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
+{
+       size_t total = oldlen + newlen;
+
+       batch_realloc(dest, oldlen, total + 1);
+
+       /* Copies @newlen + 1 bytes, i.e. one byte past @newlen; for the
+        * getline() output passed in by the callers in this file that byte
+        * is the terminating NUL.
+        */
+       memcpy(*dest + oldlen, new, newlen + 1);
+}
+
+/* Overwrite every '\n' at the end of @s with a NUL byte, in place. */
+static inline void drop_trailing_newlines(char *s)
+{
+       int end = strlen(s);
+
+       while (end > 0 && s[end - 1] == '\n')
+               s[--end] = '\0';
+}
+
+/* Slurp in a whole file.
+ *
+ * Returns a NUL-terminated buffer (allocated through append_line()) holding
+ * the file's contents, or NULL if the file cannot be opened or no line could
+ * be read from it (e.g. the file is empty). The caller owns the buffer.
+ */
+char *read_file(const char *fnam)
+{
+       __do_free char *line = NULL;
+       __do_fclose FILE *f = NULL;
+       /* getline() returns ssize_t; an int could truncate the length. */
+       ssize_t linelen;
+       char *buf = NULL;
+       size_t len = 0, fulllen = 0;
+
+       /* "e" requests O_CLOEXEC, in line with readat_file(). */
+       f = fopen(fnam, "re");
+       if (!f)
+               return NULL;
+
+       while ((linelen = getline(&line, &len, f)) != -1) {
+               append_line(&buf, fulllen, line, linelen);
+               fulllen += linelen;
+       }
+
+       return buf;
+}
+
+/* Like read_file(), but with all trailing '\n' characters removed from the
+ * returned buffer. Returns NULL whenever read_file() does.
+ */
+char *read_file_strip_newline(const char *fnam)
+{
+       char *contents = read_file(fnam);
+
+       if (!contents)
+               return NULL;
+
+       drop_trailing_newlines(contents);
+       return contents;
+}
+
+/* Get the cgroup of @pid in the cgroup2 (unified) hierarchy by looking for
+ * the "0::/some/path" entry in /proc/<pid>/cgroup. A @pid <= 0 falls back to
+ * pid 1.
+ *
+ * Returns an allocated copy of the path (starting at its leading '/'), or
+ * NULL if the file cannot be read, has no such entry, or the entry is not
+ * newline-terminated.
+ */
+char *cg_unified_get_current_cgroup(pid_t pid)
+{
+       __do_free char *basecginfo = NULL;
+       char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1];
+       char *base_cgroup;
+
+       /* "pid > 0 ?: 1" would yield the result of the comparison (always 1)
+        * rather than the pid itself, so spell the conditional out in full.
+        */
+       snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ? pid : 1);
+       basecginfo = read_file(path);
+       if (!basecginfo)
+               return NULL;
+
+       base_cgroup = strstr(basecginfo, "0::/");
+       if (!base_cgroup)
+               return NULL;
+
+       /* Skip the "0::" prefix so the copy starts at the '/'. */
+       base_cgroup = base_cgroup + 3;
+       return copy_to_eol(base_cgroup);
+}
+
+/* Check whether controller @c is listed in one line of a /proc/<pid>/cgroup
+ * file.
+ *
+ * @cgline points at the character after the first ':' of that line, i.e. at
+ * the start of the comma-separated controller list, which is terminated by
+ * the next ':'. Returns false if no further ':' is found.
+ */
+static bool controller_in_clist(char *cgline, const char *c)
+{
+       __do_free char *tmp = NULL;
+       char *tok, *eol;
+       size_t len;
+
+       /* The controller list ends at the next ':'. */
+       eol = strchr(cgline, ':');
+       if (!eol)
+               return false;
+
+       /* Work on a private copy: lxc_iterate_parts() tokenizes with
+        * strtok_r(), which writes into the string it splits.
+        */
+       len = eol - cgline;
+       tmp = must_realloc(NULL, len + 1);
+       memcpy(tmp, cgline, len);
+       tmp[len] = '\0';
+
+       lxc_iterate_parts(tok, tmp, ",")
+               if (strcmp(tok, c) == 0)
+                       return true;
+
+       return false;
+}
+
+/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
+ * @controller.
+ *
+ * The buffer is scanned line by line; each line is expected to look like
+ * "<id>:<controller list>:<path>". When @type is CGROUP2_SUPER_MAGIC, a line
+ * starting with '0' is accepted as the cgroup2 entry without consulting
+ * @controller. Otherwise the line must list @controller (a NULL @controller
+ * never matches).
+ *
+ * Returns the result of copy_to_eol() on the path of the first matching line,
+ * or NULL if the input runs out before a match is found.
+ */
+char *cg_hybrid_get_current_cgroup(char *basecginfo, const char *controller, 
int type)
+{
+       char *p = basecginfo;
+
+       for (;;) {
+               bool is_cgv2_base_cgroup = false;
+
+               /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
+               if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
+                       is_cgv2_base_cgroup = true;
+
+               /* Step over the hierarchy id to the controller list. */
+               p = strchr(p, ':');
+               if (!p)
+                       return NULL;
+               p++;
+
+               if (is_cgv2_base_cgroup || (controller && 
controller_in_clist(p, controller))) {
+                       /* Step over the controller list to the path. */
+                       p = strchr(p, ':');
+                       if (!p)
+                               return NULL;
+                       p++;
+                       return copy_to_eol(p);
+               }
+
+               /* No match: continue with the line after the next '\n'. */
+               p = strchr(p, '\n');
+               if (!p)
+                       return NULL;
+               p++;
+       }
+}
+
+/* Read /proc/<pid>/cgroup and return the cgroup recorded there for the
+ * legacy (cgroup v1) @controller. A @pid <= 0 falls back to pid 1.
+ *
+ * Returns NULL with errno set to ENOMEM if the file cannot be read; otherwise
+ * returns whatever cg_hybrid_get_current_cgroup() yields for
+ * CGROUP_SUPER_MAGIC.
+ */
+char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller)
+{
+       __do_free char *basecginfo = NULL;
+       char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1];
+
+       /* "pid > 0 ?: 1" would yield the result of the comparison (always 1)
+        * rather than the pid itself, so spell the conditional out in full.
+        */
+       snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ? pid : 1);
+       basecginfo = read_file(path);
+       if (!basecginfo)
+               return ret_set_errno(NULL, ENOMEM);
+
+       return cg_hybrid_get_current_cgroup(basecginfo, controller,
+                                           CGROUP_SUPER_MAGIC);
+}
+
+
+/* Read the file @path, resolved relative to the directory fd @dirfd, into an
+ * allocated string with all trailing newlines removed.
+ *
+ * The file is opened with O_NOFOLLOW, so a symlink as the final path
+ * component is refused. Returns NULL if the file cannot be opened or no line
+ * could be read from it.
+ */
+char *readat_file(int dirfd, const char *path)
+{
+       __do_close_prot_errno int fd = -EBADF;
+       __do_free char *line = NULL;
+       __do_fclose FILE *f = NULL;
+       char *buf = NULL;
+       size_t len = 0, fulllen = 0;
+       ssize_t linelen;
+
+       fd = openat(dirfd, path, O_NOFOLLOW | O_RDONLY | O_CLOEXEC);
+       if (fd < 0)
+               return NULL;
+
+       /* Only give up ownership of fd once fdopen() has succeeded. Calling
+        * move_fd() inside the fdopen() argument list would disarm the
+        * automatic close first and leak the descriptor if fdopen() fails.
+        */
+       f = fdopen(fd, "re");
+       if (!f)
+               return NULL;
+       move_fd(fd);
+
+       while ((linelen = getline(&line, &len, f)) != -1) {
+               append_line(&buf, fulllen, line, linelen);
+               fulllen += linelen;
+       }
+
+       if (buf)
+               drop_trailing_newlines(buf);
+
+       return buf;
+}
+
+/* Create @dir and every leading directory of it with permissions @mode (the
+ * equivalent of "mkdir -p"). EEXIST from mkdir() is ignored.
+ *
+ * Returns true on success, false if an allocation or a mkdir() call fails.
+ */
+bool mkdir_p(const char *dir, mode_t mode)
+{
+       const char *tmp = dir;
+       const char *orig = dir;
+       char *makeme;
+
+       do {
+               /* dir: start of the next component, tmp: the '/' (or NUL)
+                * ending it. makeme is everything in front of that component.
+                */
+               dir = tmp + strspn(tmp, "/");
+               tmp = dir + strcspn(dir, "/");
+               makeme = strndup(orig, dir - orig);
+               if (!makeme)
+                       return false;
+
+               /* For a relative path the first prefix is the empty string;
+                * mkdir("") fails with ENOENT, so skip it instead of failing.
+                */
+               if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
+                       lxcfs_error("Failed to create directory '%s': %s.\n",
+                               makeme, strerror(errno));
+                       free(makeme);
+                       return false;
+               }
+               free(makeme);
+       } while(tmp != dir);
+
+       return true;
+}
diff --git a/cgroups/cgroup_utils.h b/cgroups/cgroup_utils.h
new file mode 100644
index 0000000..d4df757
--- /dev/null
+++ b/cgroups/cgroup_utils.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef __LXC_CGROUP_UTILS_H
+#define __LXC_CGROUP_UTILS_H
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+
+/* Retrieve the cgroup version of a given entry from /proc/<pid>/mountinfo. */
+extern int get_cgroup_version(char *line);
+
+/* Check if given entry from /proc/<pid>/mountinfo is a cgroupfs v1 mount. */
+extern bool is_cgroupfs_v1(char *line);
+
+/* Check if given entry from /proc/<pid>/mountinfo is a cgroupfs v2 mount. */
+extern bool is_cgroupfs_v2(char *line);
+
+/* Given a v1 hierarchy @mountpoint and base @path, verify that we can create
+ * directories underneath it.
+ */
+extern bool test_writeable_v1(char *mountpoint, char *path);
+
+/* Given a v2 hierarchy @mountpoint and base @path, verify that we can create
+ * directories underneath it and that we have write access to the cgroup's
+ * "cgroup.procs" file.
+ */
+extern bool test_writeable_v2(char *mountpoint, char *path);
+
+extern int unified_cgroup_hierarchy(void);
+
+extern void *must_realloc(void *orig, size_t sz);
+
+extern char *must_make_path(const char *first, ...);
+
+extern char *must_copy_string(const char *entry);
+
+/* __typeof__ should be safe to use with all compilers. */
+typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
+extern bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val);
+
+extern char *lxc_string_join(const char *sep, const char **parts,
+                            bool use_as_prefix);
+extern int lxc_count_file_lines(const char *fn);
+
+extern bool dir_exists(const char *path);
+
+extern int safe_mount(const char *src, const char *dest, const char *fstype,
+                     unsigned long flags, const void *data, const char 
*rootfs);
+
+#ifndef HAVE_STRLCPY
+extern size_t strlcpy(char *, const char *, size_t);
+#endif
+
+#ifndef HAVE_STRLCAT
+extern size_t strlcat(char *d, const char *s, size_t n);
+#endif
+
+/* fopen() variant that opens @path with O_CLOEXEC set. */
+extern FILE *fopen_cloexec(const char *path, const char *mode);
+/* Copy @new to offset @oldlen of *@dest, growing *@dest as needed. */
+extern void append_line(char **dest, size_t oldlen, char *new, size_t newlen);
+/* Read a whole file into an allocated string; NULL if nothing was read. */
+extern char *read_file(const char *fnam);
+/* Read @path relative to directory fd @fd (opened with O_NOFOLLOW) into an
+ * allocated string with trailing newlines removed.
+ */
+extern char *readat_file(int fd, const char *path);
+/* read_file() with trailing newlines removed. */
+extern char *read_file_strip_newline(const char *fnam);
+/* Return the path of the "0::/" (cgroup2) entry of a /proc/<pid>/cgroup file. */
+extern char *cg_unified_get_current_cgroup(pid_t pid);
+/* Return the cgroup recorded for @controller in @basecginfo, the contents of
+ * a /proc/<pid>/cgroup file.
+ */
+extern char *cg_hybrid_get_current_cgroup(char *basecginfo,
+                                         const char *controller, int type);
+/* Return the cgroup v1 path for @controller from a /proc/<pid>/cgroup file. */
+extern char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller);
+/* Create @dir including its leading directories; true on success. */
+extern bool mkdir_p(const char *dir, mode_t mode);
+
+#endif /* __LXC_CGROUP_UTILS_H */
diff --git a/configure.ac b/configure.ac
index 81027cd..63cd934 100644
--- a/configure.ac
+++ b/configure.ac
@@ -162,4 +162,13 @@ AC_ARG_WITH([rootfs-path],
 
 AS_AC_EXPAND(LIBDIR, "$libdir")
 
+AC_CHECK_FUNCS([strlcpy],
+       AM_CONDITIONAL(HAVE_STRLCPY, true)
+       AC_DEFINE(HAVE_STRLCPY,1,[Have strlcpy]),
+       AM_CONDITIONAL(HAVE_STRLCPY, false))
+AC_CHECK_FUNCS([strlcat],
+       AM_CONDITIONAL(HAVE_STRLCAT, true)
+       AC_DEFINE(HAVE_STRLCAT,1,[Have strlcat]),
+       AM_CONDITIONAL(HAVE_STRLCAT, false))
+
 AC_OUTPUT
diff --git a/macro.h b/macro.h
index 3e9ef82..4ec3876 100644
--- a/macro.h
+++ b/macro.h
@@ -1,9 +1,22 @@
 #ifndef __LXCFS_MACRO_H
 #define __LXCFS_MACRO_H
 
+#include <stdio.h>
+
+#define BATCH_SIZE 50
+
+/* filesystem magic values */
+#ifndef CGROUP_SUPER_MAGIC
+#define CGROUP_SUPER_MAGIC 0x27e0eb
+#endif
+
+#ifndef CGROUP2_SUPER_MAGIC
+#define CGROUP2_SUPER_MAGIC 0x63677270
+#endif
+
 #define lxcfs_debug_stream(stream, format, ...)                                
\
        do {                                                                   \
-               fprintf(stream, "%s: %d: %s: " format, __FILE__, __LINE__,     \
+               fprintf(stream, "%s: %d: %s: " format "\n", __FILE__, __LINE__, 
    \
                        __func__, ##__VA_ARGS__);                               
 \
        } while (false)
 
@@ -21,4 +34,45 @@
 #define lxcfs_v(format, ...)
 #endif /* VERBOSE */
 
+#define log_error_errno(__ret__, __errno__, format, ...) \
+       ({                                               \
+               errno = __errno__;                       \
+               lxcfs_error(format, ##__VA_ARGS__);      \
+               __ret__;                                 \
+       })
+
+#define STRLITERALLEN(x) (sizeof(""x"") - 1)
+
+/* Calculate the number of chars needed to represent a given integer as a C
+ * string. Include room for '-' to indicate negative numbers and the \0 byte.
+ * This is based on systemd.
+ */
+#define INTTYPE_TO_STRLEN(type)                   \
+       (2 + (sizeof(type) <= 1                   \
+                 ? 3                             \
+                 : sizeof(type) <= 2             \
+                       ? 5                       \
+                       : sizeof(type) <= 4       \
+                             ? 10                \
+                             : sizeof(type) <= 8 \
+                                   ? 20          \
+                                   : sizeof(int[-2 * (sizeof(type) > 8)])))
+
+/* Set errno to @__errno__ and evaluate to its negation, e.g.
+ * "return ret_errno(EINVAL);" sets errno = EINVAL and returns -EINVAL.
+ * The argument is parenthesized so that an expression such as "a + b" is
+ * negated as a whole. Note that the argument is evaluated twice.
+ */
+#define ret_errno(__errno__)         \
+       ({                           \
+               errno = (__errno__); \
+               -(__errno__);        \
+       })
+
+/* Set errno to @__errno__ and evaluate to @__ret__. */
+#define ret_set_errno(__ret__, __errno__) \
+       ({                                \
+               errno = (__errno__);      \
+               (__ret__);                \
+       })
+
+#define lxc_iterate_parts(__iterator, __splitme, __separators)                 
 \
+       for (char *__p = NULL, *__it = strtok_r(__splitme, __separators, &__p); 
\
+            (__iterator = __it);                                               
\
+            __iterator = __it = strtok_r(NULL, __separators, &__p))
+
 #endif /* __LXCFS_MACRO_H */
diff --git a/memory_utils.h b/memory_utils.h
index 73e04fc..ac00b10 100644
--- a/memory_utils.h
+++ b/memory_utils.h
@@ -67,4 +67,6 @@ static inline void __auto_close__(int *fd)
                __internal_fd__;            \
        })
 
+#define zalloc(__size__) (calloc(1, __size__))
+
 #endif /* __LXCFS_MEMORY_UTILS_H */
diff --git a/sysfs_fuse.c b/sysfs_fuse.c
index 32a59b7..d2b187b 100644
--- a/sysfs_fuse.c
+++ b/sysfs_fuse.c
@@ -65,7 +65,7 @@ static int sys_devices_system_cpu_online_read(char *buf, 
size_t size,
                initpid = fc->pid;
        cg = get_pid_cgroup(initpid, "cpuset");
        if (!cg)
-               return read_file("/sys/devices/system/cpu/online", buf, size, 
d);
+               return read_file_fuse("/sys/devices/system/cpu/online", buf, 
size, d);
        prune_init_slice(cg);
 
        cpuset = get_cpuset(cg);
@@ -78,7 +78,7 @@ static int sys_devices_system_cpu_online_read(char *buf, 
size_t size,
                max_cpus = max_cpu_count(cg);
 
        if (max_cpus == 0)
-               return read_file("/sys/devices/system/cpu/online", buf, size, 
d);
+               return read_file_fuse("/sys/devices/system/cpu/online", buf, 
size, d);
        if (max_cpus > 1)
                total_len = snprintf(d->buf, d->buflen, "0-%d\n", max_cpus - 1);
        else

_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel

[lxc-devel] [lxcfs/master] bindings: add infrastructure for cgroup2 support

Reply via email to