On 11/24/25 12:20, Pavel Tikhomirov wrote:
...
+static struct ve_namespace *clone_ve_ns(struct user_namespace *user_ns,
+ struct ve_namespace *old_ns)
+{
+ struct ve_namespace *ns;
+ struct ucounts *ucounts;
+ int err;
+
+ ucounts = inc_ve_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
+ err = -ENOMEM;
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ if (!ns)
+ goto err_dec_ucount;
+
+ refcount_set(&ns->ns.count, 1);
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto err_free_ns;
+
+ ns->ucounts = ucounts;
+ ns->ns.ops = &ve_ns_operations;
+ ns->user_ns = get_user_ns(user_ns);
+
+ /*
+ * VE namespace links to current ve cgroup
+ * FIXME it should be a 1:1 link
+ */
+ ns->ve = get_ve(css_to_ve(current->cgroups->subsys[ve_cgrp_id]));
Complete Scenario: When `current->cgroups->subsys[ve_cgrp_id]` Can Be NULL
Step 1: Creating a cgroup hierarchy WITHOUT VE subsystem
1 │# Administrator creates a new cgroup hierarchy without VE subsystem
2 │mkdir /sys/fs/cgroup/myhierarchy
3 │mount -t cgroup2 none /sys/fs/cgroup/myhierarchy
4 │# VE subsystem is NOT enabled in this hierarchy
Result: root->subsys_mask & (1UL << ve_cgrp_id) == 0
Step 2: Process migrates into this cgroup
1 │// kernel/cgroup/cgroup.c:find_existing_css_set()
2 │static struct css_set *find_existing_css_set(...)
3 │{
4 │ for_each_subsys(ss, i) {
5 │ if (root->subsys_mask & (1UL << i)) {
6 │ // VE subsystem is NOT enabled, this block does NOT execute
7 │ template[i] = cgroup_e_css_by_mask(cgrp, ss);
8 │ } else {
9 │ // Uses old css from old_cset
10 │ template[i] = old_cset->subsys[i];
11 │ }
12 │ }
13 │}
If old_cset->subsys[ve_cgrp_id] was already NULL (e.g., process was created
in this hierarchy), then
template[ve_cgrp_id] = NULL.
Step 3: Creating a new css_set
1 │// kernel/cgroup/cgroup.c:find_css_set() (line 1257)
2 │cset = kzalloc(sizeof(*cset), GFP_KERNEL); // Zeroes memory
3 │// ...
4 │memcpy(cset->subsys, template, sizeof(cset->subsys)); // Copies
template
Result: cset->subsys[ve_cgrp_id] = NULL
Step 4: Process receives this css_set
1 │// Process now has:
2 │current->cgroups = cset; // where cset->subsys[ve_cgrp_id] == NULL
Step 5: Process calls `clone()` with `CLONE_NEWVE`
1 │// Userspace:
2 │pid = clone(child_func, stack, CLONE_NEWVE | SIGCHLD, NULL);
Step 6: Kernel calls `copy_ve_ns()`
1 │// kernel/fork.c:copy_process() (line 2391)
2 │retval = copy_ve_ns(clone_flags, p);
1 │// kernel/ve/ve_namespace.c:copy_ve_ns() (line 67)
2 │int copy_ve_ns(unsigned long flags, struct task_struct *p)
3 │{
4 │ // ...
5 │ if (!(flags & CLONE_NEWVE)) {
6 │ get_ve_ns(old_ve_ns);
7 │ return 0;
8 │ }
9 │
10 │ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
11 │ return -EPERM;
12 │
13 │ new_ve_ns = clone_ve_ns(user_ns, p->ve_ns); // <-- Call
14 │ // ...
15 │}
Step 7: `clone_ve_ns()` called with NULL
1 │// kernel/ve/ve_namespace.c:clone_ve_ns() (line 57)
2 │ns->ve = get_ve(css_to_ve(current->cgroups->subsys[ve_cgrp_id]));
3 │// ^^^^^^^^^^^^^^^^^^^^^^^^^
4 │// THIS IS NULL!
Step 8: Call breakdown
1 │// include/linux/ve.h:css_to_ve() (line 198)
2 │static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state
*css)
3 │{
4 │ return css ? container_of(css, struct ve_struct, css) : NULL;
5 │ // ^^^^
6 │ // css == NULL, returns NULL
7 │}
8 │
9 │// kernel/ve/ve.c:get_ve() (line 115)
10 │struct ve_struct *get_ve(struct ve_struct *ve)
11 │{
12 │ if (ve) // ve == NULL, this block does NOT execute
13 │ css_get(&ve->css);
14 │ return ve; // Returns NULL
15 │}
Result: ns->ve = NULL — this is a problem.
---
When This Can Happen
1. Process created in a cgroup hierarchy without VE subsystem:
1 │ // When creating a process in such hierarchy:
2 │ // init_css_set may not have ve_cgrp_id if VE subsystem was not
initialized
3 │ // or was disabled in this hierarchy
2. Process migrated to a cgroup without VE subsystem:
1 │ # Process was in VE cgroup, then migrated:
2 │ echo $PID > /sys/fs/cgroup/myhierarchy/cgroup.procs
3 │ # where myhierarchy does not have VE subsystem
3. VE subsystem disabled in hierarchy:
1 │ // kernel/cgroup/cgroup.c:cgroup_apply_control()
2 │ // VE subsystem can be disabled dynamically
3 │ root->subsys_mask &= ~(1 << ve_cgrp_id);
---
Why This Is a Problem
1. ns->ve = NULL — violates structure invariant.
2. Subsequent accesses to ns->ve can cause NULL pointer dereference.
3. Inconsistency: process has task_ve (via get_exec_env()), but namespace is
not linked to a VE.
---
+
+ return ns;
+err_free_ns:
+ kfree(ns);
+err_dec_ucount:
+ dec_ve_namespaces(ucounts);
+ return ERR_PTR(err);
+}
...
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel