(Apologies for top posting; this phone doesn't support inline replies.)

Where are you preventing less privileged tasks from limiting the caps of a more 
privileged task?  It looks like you are relying on the cgroupfs for that?

Overall I'm not a fan of this for several reasons.  Can you tell us precisely 
what your use case is?
On 6/18/16 14:31 Topi Miettinen wrote:
Add a new cgroup controller for enforcing and monitoring
capabilities in the cgroup.

Test case (boot to rdshell):
BusyBox v1.22.1 (Debian 1:1.22.0-19) built-in shell (ash)
Enter 'help' for a list of built-in commands.

(initramfs) cd /sys/fs
(initramfs) mount -t cgroup2 cgroup cgroup
(initramfs) cd cgroup
(initramfs) echo +capability > cgroup.subtree_control
(initramfs) mkdir test; cd test
(initramfs) ls
capability.bounding_set  cgroup.controllers       cgroup.procs
capability.used          cgroup.events            cgroup.subtree_control
(initramfs) sh

BusyBox v1.22.1 (Debian 1:1.22.0-19) built-in shell (ash)
Enter 'help' for a list of built-in commands.

(initramfs) echo $$ >cgroup.procs
(initramfs) cat capability.used
0000000000000000
(initramfs) mknod /dev/z1 c 1 2
(initramfs) cat capability.used
0000000008000000
(initramfs) exit
(initramfs) echo 0000000000000000 > capability.bounding_set
(initramfs) sh

BusyBox v1.22.1 (Debian 1:1.22.0-19) built-in shell (ash)
Enter 'help' for a list of built-in commands.

(initramfs) echo $$ >cgroup.procs
(initramfs) mknod /dev/z2 c 1 2
mknod: /dev/z2: Operation not permitted
(initramfs) exit

Signed-off-by: Topi Miettinen <toiwo...@gmail.com>
---
 include/linux/capability_cgroup.h |   7 ++
 include/linux/cgroup_subsys.h     |   4 +
 init/Kconfig                      |   6 ++
 kernel/capability.c               |   2 +
 security/Makefile                 |   1 +
 security/capability_cgroup.c      | 216 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 236 insertions(+)
 create mode 100644 include/linux/capability_cgroup.h
 create mode 100644 security/capability_cgroup.c

diff --git a/include/linux/capability_cgroup.h 
b/include/linux/capability_cgroup.h
new file mode 100644
index 0000000..c03b58d
--- /dev/null
+++ b/include/linux/capability_cgroup.h
@@ -0,0 +1,7 @@
+#ifndef _LINUX_CAPABILITY_CGROUP_H
+#define _LINUX_CAPABILITY_CGROUP_H
+
+/*
+ * capability_cgroup_update_used() is called with a capability number once a
+ * capability check has succeeded (see ns_capable()).  It compiles to an empty
+ * inline stub when CONFIG_CGROUP_CAPABILITY is disabled.
+ *
+ * The include guard keeps the inline stub from being defined twice when this
+ * header is pulled in more than once.
+ */
+#ifdef CONFIG_CGROUP_CAPABILITY
+void capability_cgroup_update_used(int cap);
+#else
+static inline void capability_cgroup_update_used(int cap)
+{
+}
+#endif
+
+#endif /* _LINUX_CAPABILITY_CGROUP_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..a5161d0 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_CAPABILITY)
+SUBSYS(capability)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/init/Kconfig b/init/Kconfig
index f755a60..098ce66 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1141,6 +1141,12 @@ config CGROUP_PERF
 
          Say N if unsure.
 
+config CGROUP_CAPABILITY
+       bool "Capability controller"
+       help
+         Provides a simple controller for enforcement of and monitoring of
+         capabilities in the cgroup.
+
 config CGROUP_DEBUG
        bool "Example controller"
        default n
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b5..b57d7f9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -17,6 +17,7 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
+#include <linux/capability_cgroup.h>
 #include <asm/uaccess.h>
 
 /*
@@ -380,6 +381,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
        }
 
        if (security_capable(current_cred(), ns, cap) == 0) {
+               capability_cgroup_update_used(cap);
                current->flags |= PF_SUPERPRIV;
                return true;
        }
diff --git a/security/Makefile b/security/Makefile
index f2d71cd..2bb04f1 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_SECURITY_APPARMOR)               += apparmor/
 obj-$(CONFIG_SECURITY_YAMA)            += yama/
 obj-$(CONFIG_SECURITY_LOADPIN)         += loadpin/
 obj-$(CONFIG_CGROUP_DEVICE)            += device_cgroup.o
+obj-$(CONFIG_CGROUP_CAPABILITY)                += capability_cgroup.o
 
 # Object integrity file lists
 subdir-$(CONFIG_INTEGRITY)             += integrity
diff --git a/security/capability_cgroup.c b/security/capability_cgroup.c
new file mode 100644
index 0000000..6e03fce
--- /dev/null
+++ b/security/capability_cgroup.c
@@ -0,0 +1,216 @@
+/*
+ * Capability cgroup
+ *
+ * Copyright 2016 Topi Miettinen
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/capability_cgroup.h>
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/security.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+/* Held by the writers of the cap_bset and cap_used fields in this file */
+static DEFINE_MUTEX(capcg_mutex);
+
+/* Per-cgroup state of the capability controller */
+struct capcg_cgroup {
+       struct cgroup_subsys_state css; /* embedded css, see css_to_capcg() */
+       kernel_cap_t cap_bset; /* Capability bounding set */
+       kernel_cap_t cap_used; /* Capabilities actually used */
+};
+
+/* Map a css to the capcg_cgroup that embeds it; a NULL css maps to NULL */
+static inline struct capcg_cgroup *css_to_capcg(struct cgroup_subsys_state *s)
+{
+       if (!s)
+               return NULL;
+
+       return container_of(s, struct capcg_cgroup, css);
+}
+
+/* Look up the capability controller state for @task's cgroup */
+static inline struct capcg_cgroup *task_to_capcg(struct task_struct *task)
+{
+       struct cgroup_subsys_state *css = task_css(task, capability_cgrp_id);
+
+       return css_to_capcg(css);
+}
+
+/*
+ * Allocate the controller state for a new cgroup.  The bounding set starts
+ * out full and the used set starts out empty; @parent is not consulted.
+ * Returns the embedded css, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct cgroup_subsys_state *capcg_css_alloc(struct cgroup_subsys_state
+                                                  *parent)
+{
+       struct capcg_cgroup *capcg = kzalloc(sizeof(*capcg), GFP_KERNEL);
+
+       if (!capcg)
+               return ERR_PTR(-ENOMEM);
+
+       cap_clear(capcg->cap_used);
+       capcg->cap_bset = CAP_FULL_SET;
+
+       return &capcg->css;
+}
+
+/* Free the controller state that embeds @css */
+static void capcg_css_free(struct cgroup_subsys_state *css)
+{
+       struct capcg_cgroup *capcg = css_to_capcg(css);
+
+       kfree(capcg);
+}
+
+/**
+ * capcg_task_apply_bset - intersect a task's capability sets with @bset
+ * @task: task whose capability sets are read
+ * @bset: mask to intersect with
+ *
+ * Reads @task's effective, inheritable and permitted sets through
+ * security_capget() and its bounding set through get_task_cred(), masks
+ * all four with @bset and commits the result as new credentials.
+ *
+ * NOTE(review): prepare_creds() and commit_creds() are not passed @task, so
+ * the new credentials are presumably those of the calling task; confirm
+ * that every caller only passes task == current.
+ *
+ * Return: the value of commit_creds() when the credentials were committed,
+ * -ENOMEM if prepare_creds() failed, or the negative error returned by
+ * security_capget() or security_capset().
+ */
+static int capcg_task_apply_bset(struct task_struct *task, kernel_cap_t bset)
+{
+       struct cred *new;
+       const struct cred *old;
+       kernel_cap_t bounding, effective, inheritable, permitted;
+       int ret;
+
+       new = prepare_creds();
+       if (!new)
+               return -ENOMEM;
+
+       ret = security_capget(task, 
+                             &effective, &inheritable, &permitted);
+       if (ret < 0)
+               goto abort_cred;
+
+       /* paired with the put_cred(old) below */
+       old = get_task_cred(task);
+       bounding = cap_intersect(bset, old->cap_bset);
+       effective = cap_intersect(bset, effective);
+       inheritable = cap_intersect(bset, inheritable);
+       permitted = cap_intersect(bset, permitted);
+
+       /* security_capset() also updates ambient capabilities */
+       ret = security_capset(new, old,
+                             &effective, &inheritable, &permitted);
+       /* assigned before ret is checked; on failure new is aborted anyway */
+       new->cap_bset = bounding;
+               
+       put_cred(old);
+       if (ret < 0)
+               goto abort_cred;
+
+       ret = commit_creds(new);
+       return ret;
+
+ abort_cred:
+       /* discard the credentials that were prepared but never committed */
+       abort_creds(new);
+       return ret;
+}
+
+/*
+ * capcg_attach - apply a cgroup's bounding set to the tasks being attached
+ *
+ * For every (task, css) pair that cgroup_taskset_for_each() yields from
+ * @tset, the bounding set stored for that css is applied to the task.
+ *
+ * capcg_task_apply_bset() allocates credentials with prepare_creds() and may
+ * therefore sleep, so it must not run inside an RCU read-side critical
+ * section; the taskset is walked without rcu_read_lock().
+ *
+ * This callback returns void, so a failure to update a task's credentials
+ * cannot be reported to the caller and is dropped here.
+ */
+static void capcg_attach(struct cgroup_taskset *tset)
+{
+       struct task_struct *task;
+       struct cgroup_subsys_state *css;
+
+       cgroup_taskset_for_each(task, css, tset) {
+               struct capcg_cgroup *caps = css_to_capcg(css);
+
+               capcg_task_apply_bset(task, caps->cap_bset);
+       }
+}
+
+/**
+ * capcg_write_bset - handler for writes to the bounding_set file
+ * @of: open kernfs file; identifies the cgroup being written
+ * @buf: input, 8 hex digits per 32-bit capability word, highest word first
+ * @nbytes: length of the write
+ * @off: unused
+ *
+ * The cgroup's bounding set can only shrink: the stored set becomes the
+ * intersection of its previous value and the written mask.  The written
+ * mask is then applied to every task in the direct child cgroups.
+ *
+ * Return: @nbytes on success, -EINVAL if the input is too short, or the
+ * error from kstrtou32() if a word is not valid hexadecimal.
+ */
+static ssize_t capcg_write_bset(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off)
+{
+       struct cgroup_subsys_state *css = of_css(of), *pos;
+       struct capcg_cgroup *caps = css_to_capcg(css);
+       u32 capi;
+       int err;
+       kernel_cap_t new_bset;
+
+       buf = strstrip(buf);
+
+       /*
+        * The loop below copies 8 characters per word unconditionally, so a
+        * shorter string would be read past its terminating NUL.
+        */
+       if (strlen(buf) < 8 * (CAP_LAST_U32 + 1))
+               return -EINVAL;
+
+       CAP_FOR_EACH_U32(capi) {
+               char buf2[9]; /* for each 32 bit block */
+               u32 capv;
+
+               memcpy(buf2, &buf[capi * 8], 8);
+               buf2[8] = '\0';
+               err = kstrtou32(buf2, 16, &capv);
+               if (err)
+                       return err;
+               new_bset.cap[CAP_LAST_U32 - capi] = capv;
+       }
+
+       mutex_lock(&capcg_mutex);
+       caps->cap_bset = cap_intersect(caps->cap_bset, new_bset);
+       mutex_unlock(&capcg_mutex);
+
+       rcu_read_lock();
+       css_for_each_child(pos, css) {
+               struct css_task_iter it;
+               struct task_struct *task;
+
+               /*
+                * capcg_task_apply_bset() may sleep in prepare_creds(), so
+                * pin the child and leave the RCU read-side section while
+                * its tasks are updated.
+                */
+               if (!css_tryget_online(pos))
+                       continue;
+               rcu_read_unlock();
+
+               css_task_iter_start(pos, &it);
+               while ((task = css_task_iter_next(&it)))
+                       capcg_task_apply_bset(task, new_bset);
+               /* every css_task_iter_start() needs a matching end */
+               css_task_iter_end(&it);
+
+               rcu_read_lock();
+               css_put(pos);
+       }
+       rcu_read_unlock();
+
+       return nbytes;
+}
+
+/*
+ * Print @cap to @m as one line of zero-padded hex, 8 digits per 32-bit
+ * word, starting with the highest-indexed word.  Always returns 0.
+ */
+static int capcg_seq_show_cap(struct seq_file *m, kernel_cap_t *cap)
+{
+       u32 i;
+
+       rcu_read_lock();
+       CAP_FOR_EACH_U32(i) {
+               u32 word = cap->cap[CAP_LAST_U32 - i];
+
+               seq_printf(m, "%08x", word);
+       }
+       seq_putc(m, '\n');
+       rcu_read_unlock();
+
+       return 0;
+}
+
+/* seq_show handler: print the bounding set of the cgroup behind @m */
+static int capcg_seq_show_bset(struct seq_file *m, void *v)
+{
+       struct cgroup_subsys_state *css = seq_css(m);
+
+       return capcg_seq_show_cap(m, &css_to_capcg(css)->cap_bset);
+}
+
+/* seq_show handler: print the used-capabilities set of the cgroup behind @m */
+static int capcg_seq_show_used(struct seq_file *m, void *v)
+{
+       struct cgroup_subsys_state *css = seq_css(m);
+
+       return capcg_seq_show_cap(m, &css_to_capcg(css)->cap_used);
+}
+
+/* Control files of the controller; both are flagged CFTYPE_NOT_ON_ROOT */
+static struct cftype capcg_files[] = {
+       {       /* readable and writable: the cgroup's bounding set */
+               .name = "bounding_set",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .write = capcg_write_bset,
+               .seq_show = capcg_seq_show_bset,
+       },
+       {       /* read-only: capabilities recorded as used */
+               .name = "used",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = capcg_seq_show_used,
+       },
+       { }     /* terminate */
+};
+
+/* Capability controller: its callbacks and its default-hierarchy files */
+struct cgroup_subsys capability_cgrp_subsys = {
+       .dfl_cftypes = capcg_files,
+       .attach = capcg_attach,
+       .css_free = capcg_css_free,
+       .css_alloc = capcg_css_alloc,
+};
+
+/*
+ * capability_cgroup_update_used - record @cap in the cap_used set of the
+ * current task's cgroup.
+ *
+ * NOTE(review): this takes capcg_mutex; confirm that no caller reaches it
+ * from a context that is not allowed to sleep.
+ */
+void capability_cgroup_update_used(int cap)
+{
+       struct capcg_cgroup *capcg;
+
+       capcg = css_to_capcg(task_css(current, capability_cgrp_id));
+
+       mutex_lock(&capcg_mutex);
+       cap_raise(capcg->cap_used, cap);
+       mutex_unlock(&capcg_mutex);
+}
-- 
2.8.1

Reply via email to