From: Hui Zhu <[email protected]>

To allow for more flexible attachment policies in nested cgroup
hierarchies, this patch introduces support for the
`BPF_F_ALLOW_OVERRIDE` flag for `memcg_bpf_ops`.

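When a `memcg_bpf_ops` is attached to a cgroup with this flag, child
cgroups may attach their own, different `memcg_bpf_ops`, overriding
the program they inherited from the parent. Without this flag,
attaching a BPF program to a cgroup that already has one (either
directly or via inheritance) fails with -EBUSY. Even with the flag,
only an inherited program can be overridden: attaching to a cgroup
whose current program differs from its parent's still fails.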

The implementation involves:
- Adding a `bpf_ops_flags` field to `struct mem_cgroup`.
- During registration (`bpf_memcg_ops_reg`), checking for existing
  programs and the `BPF_F_ALLOW_OVERRIDE` flag.
- During unregistration (`bpf_memcg_ops_unreg`), correctly restoring
  the parent's BPF program to the cgroup hierarchy.
- Ensuring flags are inherited by child cgroups during online events.

This change enables complex, multi-level policy enforcement where
different subtrees of the cgroup hierarchy can have distinct memory
management BPF programs.

Signed-off-by: Geliang Tang <[email protected]>
Signed-off-by: Hui Zhu <[email protected]>
---
 include/linux/memcontrol.h |  1 +
 mm/bpf_memcontrol.c        | 83 ++++++++++++++++++++++++--------------
 2 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1083be5d0362..6e15da44ba35 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -354,6 +354,7 @@ struct mem_cgroup {
 
 #ifdef CONFIG_BPF_SYSCALL
        struct memcg_bpf_ops *bpf_ops;
+       u32 bpf_ops_flags;
 #endif
 
        struct mem_cgroup_per_node *nodeinfo[];
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
index 20c5c3552ce3..756a7d4eb4e3 100644
--- a/mm/bpf_memcontrol.c
+++ b/mm/bpf_memcontrol.c
@@ -213,6 +213,7 @@ void memcontrol_bpf_online(struct mem_cgroup *memcg)
                goto out;
 
        WRITE_ONCE(memcg->bpf_ops, ops);
+       memcg->bpf_ops_flags = parent_memcg->bpf_ops_flags;
 
        /*
         * If the BPF program implements it, call the online handler to
@@ -340,52 +341,54 @@ static int bpf_memcg_ops_init_member(const struct btf_type *t,
        return 0;
 }
 
-/**
- * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy.
- * @memcg: The root of the cgroup hierarchy to clean.
- * @ops:   The specific ops struct to detach. If NULL, detach any ops.
- *
- * Iterates through all descendant cgroups of @memcg (including itself)
- * and clears their bpf_ops pointer. This is used when a BPF program
- * is detached or if attachment fails midway.
- */
-static void clean_memcg_bpf_ops(struct mem_cgroup *memcg,
-                               struct memcg_bpf_ops *ops)
-{
-       struct mem_cgroup *iter = NULL;
-
-       while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
-               if (ops) {
-                       if (!WARN_ON(READ_ONCE(iter->bpf_ops) != ops))
-                               WRITE_ONCE(iter->bpf_ops, NULL);
-               } else
-                       WRITE_ONCE(iter->bpf_ops, NULL);
-       }
-}
-
 static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
 {
        struct bpf_struct_ops_link *ops_link
                = container_of(link, struct bpf_struct_ops_link, link);
-       struct memcg_bpf_ops *ops = kdata;
+       struct memcg_bpf_ops *ops = kdata, *parent_ops = NULL;
        struct mem_cgroup *memcg, *iter = NULL;
        int err = 0;
 
+       if (ops_link->flags & ~BPF_F_ALLOW_OVERRIDE) {
+               pr_err("attach only support BPF_F_ALLOW_OVERRIDE\n");
+               return -EOPNOTSUPP;
+       }
+
        memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
        if (IS_ERR_OR_NULL(memcg))
                return PTR_ERR(memcg);
 
        cgroup_lock();
-       while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
-               if (READ_ONCE(iter->bpf_ops)) {
-                       mem_cgroup_iter_break(memcg, iter);
+
+       if (READ_ONCE(memcg->bpf_ops)) {
+               /* Check if bpf_ops of the parent is BPF_F_ALLOW_OVERRIDE. */
+               if (memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) {
+                       iter = parent_mem_cgroup(memcg);
+                       if (!iter || READ_ONCE(iter->bpf_ops) !=
+                                    READ_ONCE(memcg->bpf_ops))
+                               goto busy_out;
+
+                       parent_ops = READ_ONCE(memcg->bpf_ops);
+               } else {
+busy_out:
                        err = -EBUSY;
-                       break;
+                       goto unlock_out;
+               }
+       }
+
+       iter = NULL;
+       while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
+               struct memcg_bpf_ops *iter_ops = READ_ONCE(iter->bpf_ops);
+
+               if (iter_ops && iter_ops != parent_ops) {
+                       /* cannot override existing bpf_ops of sub-cgroup. */
+                       continue;
                }
                WRITE_ONCE(iter->bpf_ops, ops);
+               iter->bpf_ops_flags = ops_link->flags;
        }
-       if (err)
-               clean_memcg_bpf_ops(memcg, NULL);
+
+unlock_out:
        cgroup_unlock();
 
        mem_cgroup_put(memcg);
@@ -399,13 +402,31 @@ static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link)
                = container_of(link, struct bpf_struct_ops_link, link);
        struct memcg_bpf_ops *ops = kdata;
        struct mem_cgroup *memcg;
+       struct mem_cgroup *iter;
+       struct memcg_bpf_ops *parent_bpf_ops = NULL;
+       u32 parent_bpf_ops_flags = 0;
 
        memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
        if (IS_ERR_OR_NULL(memcg))
                goto out;
 
        cgroup_lock();
-       clean_memcg_bpf_ops(memcg, ops);
+
+       /* Get the parent bpf_ops and bpf_ops_flags */
+       iter = parent_mem_cgroup(memcg);
+       if (iter) {
+               parent_bpf_ops = READ_ONCE(iter->bpf_ops);
+               parent_bpf_ops_flags = iter->bpf_ops_flags;
+       }
+
+       iter = NULL;
+       while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
+               if (READ_ONCE(iter->bpf_ops) == ops) {
+                       WRITE_ONCE(iter->bpf_ops, parent_bpf_ops);
+                       iter->bpf_ops_flags = parent_bpf_ops_flags;
+               }
+       }
+
        cgroup_unlock();
 
        mem_cgroup_put(memcg);
-- 
2.43.0


Reply via email to