[Devel] [PATCH 2/3] cgroup : make the mount options parsing more accurate
The current code does not detect 'all' combined with a subsystem name, which are IMHO mutually exclusive. Also, when an option is specified, even if it is not a subsystem name, we have to specify the 'all' option along with the other option. eg: not detected : mount -t cgroup -o all,freezer cgroup /cgroup not flexible : mount -t cgroup -o noprefix,all cgroup /cgroup This patch fixes this and makes the code a bit clearer by replacing the 'else if' chain with 'continue' blocks in the loop. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. Biederman Cc: Paul Menage Reviewed-by: Li Zefan --- kernel/cgroup.c | 91 +-- 1 files changed, 61 insertions(+), 30 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0473a9a..ca2314f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1074,7 +1074,8 @@ struct cgroup_sb_opts { */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { - char *token, *o = data ?: "all"; + char *token, *o = data; + bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; int i; bool module_pin_failed = false; @@ -1088,26 +1089,30 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { + if (!*token) return -EINVAL; - if (!strcmp(token, "all")) { - /* Add all non-disabled subsystems */ - opts->subsys_bits = 0; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (!ss->disabled) - opts->subsys_bits |= 1ul << i; - } - } else if (!strcmp(token, "none")) { + if (!strcmp(token, "none")) { /* Explicitly have no subsystems */ opts->none = true; - } else if (!strcmp(token, "noprefix")) { + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); - } else 
if (!strcmp(token, "clone_children")) { + continue; + } + if (!strcmp(token, "clone_children")) { set_bit(ROOT_CLONE_CHILDREN, &opts->flags); - } else if (!strncmp(token, "release_agent=", 14)) { + continue; + } + if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; @@ -1115,7 +1120,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - } else if (!strncmp(token, "name=", 5)) { + continue; + } + if (!strncmp(token, "name=", 5)) { const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1137,20 +1144,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) GFP_KERNEL); if (!opts->name) return -ENOMEM; - } else { - struct cgroup_subsys *ss; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - ss = subsys[i]; - if (ss == NULL) - continue; - if (!strcmp(token, ss->name)) { - if (!ss->disabled) - set_bit(i, &opts->subsys_bits); - break; - } - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + + continue; + } + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { +
[Devel] [PATCH 3/3] cgroup : remove the ns_cgroup
The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier. For example, a single process cannot handle a large number of namespaces without interacting with this cgroup and running into an exponential creation time due to the nested cgroup directory depth (eg. /cgroup/<pid>/.../<pid>/...). That was spotted when creating a single process using multiple network namespaces: the objective was 4096 network namespaces, but at 820 netns the creation became dramatically slow, and the creation time for a namespace increased from 10msec to 10sec. After five hours, the expected number of netns was still not reached. Without the ns_cgroup interaction, 4K netns are created in 2 minutes. In order to solve that, we have to mount the cgroup with all the subsystems except the ns_cgroup. That is a little weird and hard to manage from an administration pov, because we have to know which cgroups are available on the system and we can't do a simple 'mount -t cgroup cgroup /cgroup'. With the previous patch, which adds a 'clone_children' parameter to a cgroup, we should be able to remove the ns_cgroup and manually manage the creation of a cgroup and the adding of a task to it, consistently with the rest of the subsystems. This patch removes the ns_cgroup as suggested in the following thread: https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html The 'cgroup_clone' function is removed because it is no longer used. Changelog: * Sep 1 (dle): refreshed CONFIG_CGROUP_NS references * Jul 29 (seh): remove references to ns_cgroup_clone(), fix up some documentation, and remove CONFIG_CGROUP_NS references. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. 
Biederman Cc: Jamal Hadi Salim Reviewed-by: Li Zefan Acked-by: Paul Menage Acked-by: Matt Helsley --- Documentation/cgroups/cgroups.txt |2 +- arch/mips/configs/bcm47xx_defconfig|1 - arch/powerpc/configs/ppc6xx_defconfig |1 - arch/powerpc/configs/pseries_defconfig |1 - arch/s390/defconfig|1 - arch/sh/configs/sdk7786_defconfig |1 - arch/sh/configs/se7206_defconfig |1 - arch/sh/configs/shx3_defconfig |1 - arch/sh/configs/urquell_defconfig |1 - arch/x86/configs/i386_defconfig|1 - arch/x86/configs/x86_64_defconfig |1 - include/linux/cgroup.h |3 - include/linux/cgroup_subsys.h |6 -- include/linux/nsproxy.h|9 --- init/Kconfig |9 --- kernel/Makefile|1 - kernel/cgroup.c| 116 kernel/cpuset.c|7 +- kernel/fork.c |6 -- kernel/ns_cgroup.c | 110 -- kernel/nsproxy.c |4 - 21 files changed, 4 insertions(+), 279 deletions(-) delete mode 100644 kernel/ns_cgroup.c diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 190018b..6a5ba63 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -618,7 +618,7 @@ always handled well. void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) (cgroup_mutex held by caller) -Called at the end of cgroup_clone() to do any parameter +Called during cgroup_create() to do any parameter initialization which might be required before a task could attach. For example in cpusets, no task may attach before 'cpus' and 'mems' are set up. 
diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index 927d58b..c4338e0 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -16,7 +16,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_TINY_RCU=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 9d64a68..9b253f6 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -10,7 +10,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index f87f0e1..972587f 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -15,7 +15,6 @@ CONFIG_AUDITSYSCALL=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y diff --git a/arch/s390/defconfig b/arch/s390/defconfig index e40ac6e..4b6d1a1 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -5,7 +5,6 @@ CONFIG_AUDIT=y CONFIG_IKCONFIG=y CONFIG
[Devel] [PATCH 1/3] cgroup : add clone_children control file
This patch is sent as an answer to a previous thread about the ns_cgroup. https://lists.linux-foundation.org/pipermail/containers/2009-June/018627.html It adds a control file 'clone_children' for a cgroup. This control file is a boolean specifying whether the child cgroup should be a clone of the parent cgroup or not. The default value is 'false'. This flag makes the child cgroup call the post_clone callback of every subsystem, if it is available. At present, cpuset is the only subsystem which has implemented the post_clone callback. The option can be set at mount time by specifying the 'clone_children' mount option. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. Biederman Cc: Paul Menage Reviewed-by: Li Zefan --- Documentation/cgroups/cgroups.txt | 14 +++- include/linux/cgroup.h|4 +++ kernel/cgroup.c | 39 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index b34823f..190018b 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -18,7 +18,8 @@ CONTENTS: 1.2 Why are cgroups needed ? 1.3 How are cgroups implemented ? 1.4 What does notify_on_release do ? - 1.5 How do I use cgroups ? + 1.5 What does clone_children do ? + 1.6 How do I use cgroups ? 2. Usage Examples and Syntax 2.1 Basic Usage 2.2 Attaching processes @@ -293,7 +294,16 @@ notify_on_release in the root cgroup at system boot is disabled value of their parents notify_on_release setting. The default value of a cgroup hierarchy's release_agent path is empty. -1.5 How do I use cgroups ? +1.5 What does clone_children do ? +- + +If the clone_children flag is enabled (1) in a cgroup, then all +cgroups created beneath will call the post_clone callbacks for each +subsystem of the newly created cgroup. Usually when this callback is +implemented for a subsystem, it copies the values of the parent +subsystem, this is the case for the cpuset. + +1.6 How do I use cgroups ? 
-- To start a new job that is to be contained within a cgroup, using diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3cb7d04..d01543b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -154,6 +154,10 @@ enum { * A thread in rmdir() is wating for this cgroup. */ CGRP_WAIT_ON_RMDIR, + /* +* Clone cgroup values when creating a new child cgroup +*/ + CGRP_CLONE_CHILDREN, }; /* which pidlist file are we talking about? */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5c5497..0473a9a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -229,6 +229,7 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) /* bits in struct cgroupfs_root flags field */ enum { ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_CLONE_CHILDREN, /* mounted subsystems will inherit from parent */ }; static int cgroup_is_releasable(const struct cgroup *cgrp) @@ -244,6 +245,11 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +static int clone_children(const struct cgroup *cgrp) +{ + return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +} + /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -1038,6 +1044,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); + if (test_bit(ROOT_CLONE_CHILDREN, &root->flags)) + seq_puts(seq, ",clone_children"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); if (strlen(root->name)) @@ -1097,6 +1105,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); + } else if (!strcmp(token, "clone_children")) { + set_bit(ROOT_CLONE_CHILDREN, &opts->flags); } else if (!strncmp(token, "release_agent=", 
14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) @@ -1357,6 +1367,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); + if (test_bit(ROOT_CLONE_CHILDREN, &opts->flags)) + set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);