Re: [Devel] [PATCH vz8 v5 2/5] trusted/ve/fs/exec: Don't allow a privileged user to execute untrusted files
On Tue, Jun 08, 2021 at 07:31:27PM +0300, Konstantin Khorenko wrote: > From: Pavel Tikhomirov > > If we run some binary (exploit) from CT on host, it can easily give a > user in these CT an ability to do anything on host sending commands > through unix socket to the exploit. Such an exploit can mimic to bash, > ip, systemd, ping or some other "trusted" utility. > > I've tested with these patch that we don't call from VE0 any binaries > from CT-fs on start, stop, enter, suspend, resume or migration. But to > be on the safe side, so that in future we don't become affected, lets > prohibit running any binary from ploop disks and from CT mounts if the > caller is from VE0. > > Also we protect admins of our customer from unintentionally calling such > an exploit: > > [root@kuchy ~]# strace -e trace=execve /vz/root/58a2c524-b486-42c8-849\ > b-c659bf165a91/bin/ls > execve("/vz/root/58a2c524-b486-42c8-849b-c659bf165a91/bin/ls",\ > ["/vz/root/58a2c524-b486-42c8-849b"...], [/* 27 vars */]) = -1\ > EACCES (Permission denied) > strace: exec: Permission denied > +++ exited with 1 +++ > > We need same check in sys_uselib as process from host can also try to > load shared library from the file in CT's ploop, which cannot be trusted > too. 
> > https://jira.sw.ru/browse/PSBM-98094 > > Signed-off-by: Pavel Tikhomirov > Acked-by: Konstantin Khorenko > > https://jira.sw.ru/browse/PSBM-129741 > > Based on vz7 commit 29154b5e5af9 ("ve/fs/exec: don't allow a privileged > user to execute untrusted files") > > Signed-off-by: Valeriy Vdovin > Reviewed-by: Pavel Tikhomirov > Reviewed-by: Konstantin Khorenko > --- > fs/exec.c | 16 ++-- > include/linux/ve.h | 1 + > kernel/ve/ve.c | 47 ++ > 3 files changed, 62 insertions(+), 2 deletions(-) > > diff --git a/fs/exec.c b/fs/exec.c > index c036db0323e0..0f4c741e19db 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -62,6 +62,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -134,10 +135,9 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) > goto out; > > file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); > - putname(tmp); > error = PTR_ERR(file); > if (IS_ERR(file)) > - goto out; > + goto put; > > error = -EINVAL; > if (!S_ISREG(file_inode(file)->i_mode)) > @@ -147,6 +147,12 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) > if (path_noexec(&file->f_path)) > goto exit; > > + if (!ve_check_trusted_exec(file, tmp)) > + goto exit; > + > + putname(tmp); > + tmp = NULL; > + > fsnotify_open(file); > > error = -ENOEXEC; > @@ -167,6 +173,9 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) > read_unlock(&binfmt_lock); > exit: > fput(file); > +put: > + if (tmp) > + putname(tmp); > out: > return error; > } > @@ -861,6 +870,9 @@ static struct file *do_open_execat(int fd, struct > filename *name, int flags) > if (path_noexec(&file->f_path)) > goto exit; > > + if (!ve_check_trusted_exec(file, name)) > + goto exit; > + > err = deny_write_access(file); > if (err) > goto exit; > diff --git a/include/linux/ve.h b/include/linux/ve.h > index 9c553ac96072..edf4e95d97e7 100644 > --- a/include/linux/ve.h > +++ b/include/linux/ve.h > @@ -162,6 +162,7 @@ extern void put_ve(struct ve_struct *ve); > > void ve_stop_ns(struct pid_namespace *ns); > 
void ve_exit_ns(struct pid_namespace *ns); > +bool ve_check_trusted_exec(struct file *file, struct filename *name); > > #ifdef CONFIG_TTY > #define MAX_NR_VTTY_CONSOLES (12) > diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c > index 12e91e6ee1a1..6594772a10dd 100644 > --- a/kernel/ve/ve.c > +++ b/kernel/ve/ve.c > @@ -28,10 +28,12 @@ > #include > #include > #include > +#include > > #include > #include > > +#include "../fs/mount.h" > #include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */ > > struct per_cgroot_data { > @@ -1781,6 +1783,51 @@ static int __init ve_subsys_init(void) > } > late_initcall(ve_subsys_init); > > +static bool ve_check_trusted_file(struct file *file) > +{ > + struct block_device *bdev; > + bool exec_from_ct; > + bool file_on_host_mount; > + > + /* The current process does
Re: [Devel] [VZ8 PATCH] lib/radix-tree: fixup for BUG_ON in __radix_tree_insert()
Looks like it's not needed on its own, because the patch that it fixes, named in the commit message as "Fixes: c68ade41961d4 ("radix-tree: save previous gfp_t tags in radix tree for dirty memory accounting")", is marked as dropped in the table. So I guess we can drop this one too. From: Konstantin Khorenko Sent: Monday, June 7, 2021 7:14 PM To: Valeriy Vdovin Cc: devel@openvz.org Subject: Re: [VZ8 PATCH] lib/radix-tree: fixup for BUG_ON in __radix_tree_insert() Valera, please recheck if we need it in vz8. i guess - no. -- Best regards, Konstantin Khorenko, Virtuozzo Linux Kernel Team On 06/04/2021 04:01 PM, Valeriy Vdovin wrote: > From: Vasily Averin > > __radix_tree_insert() triggers BUG_ON if root have set any prev tag > bits. Let's ignore prev tag bits. > > Fixes: c68ade41961d4 ("radix-tree: save previous gfp_t tags in radix tree > for dirty memory accounting") > > https://jira.sw.ru/browse/PSBM-100579 > Signed-off-by: Vasily Averin > (cherry-picked from 5ddc0f15746712d643895dd4756e330f4a26ffeb) > https://jira.sw.ru/browse/PSBM-127849 > Signed-off-by: Valeriy Vdovin > --- > include/linux/radix-tree.h | 2 ++ > lib/radix-tree.c | 2 +- > 2 files changed, 3 insertions(+), 1 deletion(-) > > diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h > index d9f3cf0a4c4d..fe681dd5c06b 100644 > --- a/include/linux/radix-tree.h > +++ b/include/linux/radix-tree.h > @@ -75,6 +75,8 @@ static inline bool radix_tree_is_internal_node(void *ptr) > #define RADIX_TREE_TAG_LONGS XA_MARK_LONGS > #endif > > +#define RADIX_ROOT_TAG_MASK(((1< __GFP_BITS_SHIFT) > + > #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) > #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ > RADIX_TREE_MAP_SHIFT)) > diff --git a/lib/radix-tree.c b/lib/radix-tree.c > index 310e89cc0f92..28591214be53 100644 > --- a/lib/radix-tree.c > +++ b/lib/radix-tree.c > @@ -167,7 +167,7 @@ static inline int root_tag_get(const struct > radix_tree_root *root, unsigned tag) > > static inline unsigned 
root_tags_get(const struct radix_tree_root *root) > { > - return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; > + return (__force unsigned)(root->xa_flags & RADIX_ROOT_TAG_MASK) >> > __GFP_BITS_SHIFT; > } > > static inline bool is_idr(const struct radix_tree_root *root) > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 3/3] cgroup: fix potential deadlock
In cgroup_mark_ve_roots, one error path returns without releasing css_set_lock. Release the lock before returning. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 500da91..4bd2401 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1996,6 +1996,7 @@ int cgroup_mark_ve_roots(struct ve_struct *ve) cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset; if (WARN_ON(!cset)) { rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); return -ENODEV; } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 2/3] cgroup: dont check debug cgroup at container start
Fixes: 1d668375f702847d11301882cb36ddc750226ed2 A follow-up fix for the cherry-picked cset validation code that runs at container start. The validation code consists of 2 functions, is_virtualized_cgroup and css_has_host_cgroups. Both check that cgroup_mark_ve_roots is safe to proceed. If a container is started with an invalid configuration, they will forbid further ve root marking. The fix is needed due to the new debug cgroup which appeared in VZ8. - vzctl doesn't know about debug cgroup and does not create a subfolder for it. - The validation code detects it and forces cgroup_mark_ve_roots to return with -EINVAL. - debug cgroup is only present in kernel debug configurations, so it only plays a role in development builds. - also debug cgroup does not have any value for virtualization. - That is why we can just skip its validation and ignore it totally in the VE_ROOT marking procedure. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ae8c876..500da91 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1928,6 +1928,10 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, #ifdef CONFIG_VE static inline bool is_virtualized_cgroup(struct cgroup *cgrp) { +#if IS_ENABLED(CONFIG_CGROUP_DEBUG) + if (cgrp->subsys[debug_cgrp_id]) + return false; +#endif if (cgrp->root->subsys_mask) return true; -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 1/3] ve/cgroup: At container start check ve's css_set for host-level cgroups.
cgroup_mark_ve_roots is not protected against cases when a container is started in an invalid cgroup set configuration. The officially supported way of doing that from cgroups point of view is as follows: 1. Create a child cgroup in "ve" cgroup hierarchy. 2. Along with "ve" create child cgroups in all other major cgroup subsystems, mounted on the system (cpuset, blkio, etc). 3. Create a child cgroup in a special cgroup hierarchy named "systemd". 4. Add a task, that will start a container to each of the newly created cgroups from above. 5. Now this task should write "START" to "ve.state" property of the relevant ve cgroup. From userspace it's possible to ignore the supported way and proceed to starting a container skipping steps 2-4. In kernel code, this results in ve receiving a root css_set which includes host-level cgroups, which in turn leads to a variety of problems like trying to add a "release_agent" file to a host-level cgroup which already has one, as well as trying to remove it from host-level cgroup at container stop. Prior to performing actions on cgroups, we should first run a quick check that none of the host-level cgroups are present in the ve's css_set. In the check while iterating ve's css_set we skip rootnode cgroup because it's a special case cgroup that is present in each css_set and will always give a false positive. https://jira.sw.ru/browse/PSBM-123506 Signed-off-by: Valeriy Vdovin (Cherry-picked from 2a53a1a966f467dd0e015badbd15b4d18faf75bd) +++ cgroup/ve: at container start only check virtualizable cgroups. The above commit prevented the situation where a task tried to start a container without first creating the right cgroups context for that. The logic behind that check was: - there is a set of cgroups that will be virtualized during container start. - for that these cgroups will be modified. - the cgroups that will be chosen for modification are in the starting task's css set. 
- it is invalid and forbidden to modify cgroups that a located in the root of each cgroup hierarchy. - therefore we have to check all the css set to see if it has cgroups with no parent (indication of root) and forbid the whole procedure if at least some cgroup matches. The bug in this behaviour was: - there are cases when there are non-virtualizable cgroup mounts. - these are named cgroups which do not have a bound cgroup subsystems on them. - there is one exception which is a named cgroup "systemd". - therefore container starters do not have to make nested cgroups for these type of non-virtualizable cgroup hierarchies. - therefore there can be named cgroups with parent == NULL in css set of a starting task and they will not pass the check and container start will fail. We fix the bug to only check those cgroups in css set, that are virtualizable. We already have the check helper that is used a bit later in cgroup_mark_ve_roots, so let's use it. Fixes 105332edc47c ("ve/cgroup: At container start check ve's css_set for host-level cgroups.") https://jira.sw.ru/browse/PSBM-125040 Signed-off-by: Valeriy Vdovin (Cherry-picked from 4ee3ba25bc0970c6ff659fff9362d70f1affa699) Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 43 +-- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b02c880..ae8c876 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1928,12 +1928,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, #ifdef CONFIG_VE static inline bool is_virtualized_cgroup(struct cgroup *cgrp) { - /* -* no parent means this is the host cgroup -*/ - if (!cgrp->kn->parent) - return false; - if (cgrp->root->subsys_mask) return true; @@ -1943,6 +1937,31 @@ static inline bool is_virtualized_cgroup(struct cgroup *cgrp) return false; } +/* + * Iterate all cgroups in a given css_set and check if it is a top cgroup + * of it's hierarchy. 
+ * rootnode should be ignored as it is always present in each css set as + * a placeholder for any unmounted subsystem and will give false positive. + */ +static inline bool css_has_host_cgroups(struct css_set *cset) +{ + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + if (link->cgrp->root == &cgrp_dfl_root) + continue; + + if (!is_virtualized_cgroup(link->cgrp)) + continue; + + if (!link->cgrp->kn->parent) + return true; + } + return false; +} + int cgroup_mark_ve_roots(struct ve_struct *ve) { int err; @@ -1976,6 +1995,18 @@ int
[Devel] [PATCH 1/2] ve/cgroup: At container start check ve's css_set for host-level cgroups.
cgroup_mark_ve_roots is not protected against cases when a container is started in an invalid cgroup set configuration. The officially supported way of doing that from cgroups point of view is as follows: 1. Create a child cgroup in "ve" cgroup hierarchy. 2. Along with "ve" create child cgroups in all other major cgroup subsystems, mounted on the system (cpuset, blkio, etc). 3. Create a child cgroup in a special cgroup hierarchy named "systemd". 4. Add a task, that will start a container to each of the newly created cgroups from above. 5. Now this task should write "START" to "ve.state" property of the relevant ve cgroup. From userspace it's possible to ignore the supported way and proceed to starting a container skipping steps 2-4. In kernel code, this results in ve receiving a root css_set which includes host-level cgroups, which in turn leads to a variety of problems like trying to add a "release_agent" file to a host-level cgroup which already has one, as well as trying to remove it from host-level cgroup at container stop. Prior to performing actions on cgroups, we should first run a quick check that none of the host-level cgroups are present in the ve's css_set. In the check while iterating ve's css_set we skip rootnode cgroup because it's a special case cgroup that is present in each css_set and will always give a false positive. https://jira.sw.ru/browse/PSBM-123506 Signed-off-by: Valeriy Vdovin (Cherry-picked from 2a53a1a966f467dd0e015badbd15b4d18faf75bd) +++ cgroup/ve: at container start only check virtualizable cgroups. The above commit prevented the situation where a task tried to start a container without first creating the right cgroups context for that. The logic behind that check was: - there is a set of cgroups that will be virtualized during container start. - for that these cgroups will be modified. - the cgroups that will be chosen for modification are in the starting task's css set. 
- it is invalid and forbidden to modify cgroups that a located in the root of each cgroup hierarchy. - therefore we have to check all the css set to see if it has cgroups with no parent (indication of root) and forbid the whole procedure if at least some cgroup matches. The bug in this behaviour was: - there are cases when there are non-virtualizable cgroup mounts. - these are named cgroups which do not have a bound cgroup subsystems on them. - there is one exception which is a named cgroup "systemd". - therefore container starters do not have to make nested cgroups for these type of non-virtualizable cgroup hierarchies. - therefore there can be named cgroups with parent == NULL in css set of a starting task and they will not pass the check and container start will fail. We fix the bug to only check those cgroups in css set, that are virtualizable. We already have the check helper that is used a bit later in cgroup_mark_ve_roots, so let's use it. Fixes 105332edc47c ("ve/cgroup: At container start check ve's css_set for host-level cgroups.") https://jira.sw.ru/browse/PSBM-125040 Signed-off-by: Valeriy Vdovin (Cherry-picked from 4ee3ba25bc0970c6ff659fff9362d70f1affa699) --- kernel/cgroup/cgroup.c | 42 -- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b02c88063a27..75447685f258 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1928,12 +1928,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, #ifdef CONFIG_VE static inline bool is_virtualized_cgroup(struct cgroup *cgrp) { - /* -* no parent means this is the host cgroup -*/ - if (!cgrp->kn->parent) - return false; - if (cgrp->root->subsys_mask) return true; @@ -1943,6 +1937,30 @@ static inline bool is_virtualized_cgroup(struct cgroup *cgrp) return false; } +/* + * Iterate all cgroups in a given css_set and check if it is a top cgroup + * of it's hierarchy. 
+ * rootnode should be ignored as it is always present in each css set as + * a placeholder for any unmounted subsystem and will give false positive. + */ +static inline bool css_has_host_cgroups(struct css_set *cset) +{ + struct cgrp_cset_link *link; + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + if (link->cgrp->root == &cgrp_dfl_root) + continue; + + if (!is_virtualized_cgroup(link->cgrp)) + continue; + + if (!link->cgrp->kn->parent) + return true; + } + return false; +} + int cgroup_mark_ve_roots(struct ve_struct *ve) { int err; @@ -1976,6 +1994,18 @@ int cgroup
[Devel] [PATCH 2/2] cgroup: dont check debug cgroup at container start
Fixes: 1d668375f702847d11301882cb36ddc750226ed2 A follow-up fix for the cherry-picked cset validation code that runs at container start. The validation code consists of 2 functions, is_virtualized_cgroup and css_has_host_cgroups. Both check that cgroup_mark_ve_roots is safe to proceed. If a container is started with an invalid configuration, they will forbid further ve root marking. The fix is needed due to the new debug cgroup which appeared in VZ8. - vzctl doesn't know about debug cgroup and does not create a subfolder for it. - The validation code detects it and forces cgroup_mark_ve_roots to return with -EINVAL. - debug cgroup is only present in kernel debug configurations, so it only plays a role in development builds. - also debug cgroup does not have any value for virtualization. - That is why we can just skip its validation and ignore it totally in the VE_ROOT marking procedure. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75447685f258..eeced498b121 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1928,6 +1928,10 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, #ifdef CONFIG_VE static inline bool is_virtualized_cgroup(struct cgroup *cgrp) { +#if IS_ENABLED(CONFIG_CGROUP_DEBUG) + if (cgrp->subsys[debug_cgrp_id]) + return false; +#endif if (cgrp->root->subsys_mask) return true; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH vz8] ve: fix double-free if cgroup_mark_ve_roots fails
Below is a dmesg output whne cgroup_mark_ve_roots exits with error. [ +0.973416] [ +0.000140] BUG: KASAN: double-free or invalid-free in kfree+0xd6/0x2d0 [ +0.88] CPU: 0 PID: 285 Comm: kworker/0:3 ve: / Not tainted [ +0.02] Hardware name: Virtuozzo KVM, BIOS [ +0.17] Workqueue: cgroup_destroy css_killed_work_fn [ +0.07] Call Trace: [ +0.19] dump_stack+0x9a/0xf0 [ +0.06] print_address_description.cold.3+0x9/0x23b [ +0.04] ? kfree+0xd6/0x2d0 [ +0.05] kasan_report_invalid_free+0x65/0xa0 [ +0.04] ? kfree+0xd6/0x2d0 [ +0.03] __kasan_slab_free+0x157/0x170 [ +0.06] slab_free_freelist_hook+0x5e/0x140 [ +0.12] ? ve_offline+0x34/0x70 [ +0.05] kfree+0xd6/0x2d0 [ +0.07] ve_offline+0x34/0x70 [ +0.04] css_killed_work_fn+0xa5/0x490 [ +0.15] process_one_work+0x8f0/0x17a0 [ +0.13] ? pwq_dec_nr_in_flight+0x320/0x320 [ +0.03] ? lock_acquire+0x14f/0x3b0 [ +0.15] worker_thread+0x87/0xb50 [ +0.10] ? __kthread_parkme+0xb6/0x180 [ +0.06] ? process_one_work+0x17a0/0x17a0 [ +0.04] kthread+0x30e/0x3d0 [ +0.03] ? kthread_create_fn+0x70/0x70 [ +0.12] ret_from_fork+0x3a/0x50 [ +0.36] Allocated by task 2817: [ +0.38] kasan_kmalloc+0xbf/0xe0 [ +0.03] __kmalloc+0x157/0x320 [ +0.22] ext4_htree_store_dirent+0x88/0x570 [ext4] [ +0.18] htree_dirblock_to_tree+0x235/0x540 [ext4] [ +0.16] ext4_htree_fill_tree+0x1e0/0x880 [ext4] [ +0.14] ext4_readdir+0xf5e/0x2910 [ext4] [ +0.10] iterate_dir+0x3b0/0x610 [ +0.03] ksys_getdents64+0x11f/0x1f0 [ +0.03] __x64_sys_getdents64+0x6f/0xb0 [ +0.04] do_syscall_64+0xa5/0x4d0 [ +0.06] entry_SYSCALL_64_after_hwframe+0x6a/0xdf [ +0.03] 0x [ +0.22] Freed by task 2817: [ +0.34] __kasan_slab_free+0x125/0x170 [ +0.03] slab_free_freelist_hook+0x5e/0x140 [ +0.03] kfree+0xd6/0x2d0 [ +0.15] free_rb_tree_fname+0x67/0xb0 [ext4] [ +0.14] ext4_release_dir+0x3c/0x60 [ext4] [ +0.03] __fput+0x272/0x7b0 [ +0.03] task_work_run+0x115/0x180 [ +0.03] exit_to_usermode_loop+0x152/0x170 [ +0.03] do_syscall_64+0x41e/0x4d0 [ +0.03] entry_SYSCALL_64_after_hwframe+0x6a/0xdf [ +0.03] 0x [ +0.22] The 
buggy address belongs to the object at 888107da8b80 which belongs to the cache kmalloc-64 of size 64 [ +0.000120] The buggy address is located 0 bytes inside of 64-byte region [888107da8b80, 888107da8bc0) [ +0.000107] The buggy address belongs to the page: [ +0.54] page:ea00041f6a00 refcount:1 mapcount:0 mapping:888107c0f000 index:0x888107da8b80 [ +0.04] flags: 0x17c100(slab) [ +0.04] raw: 0017c100 ea000436a0c8 ea000428c748 [ +0.03] raw: 888107da8b80 0020001f 0001 [ +0.02] page dumped because: kasan: bad access detected [ +0.21] Memory state around the buggy address: [ +0.000117] 888107da8a80: 00 00 00 00 00 00 00 fc fc fc fc fc fc fc [ +0.71] 888107da8b00: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc [ +0.000103] >888107da8b80: fb fb fb fb fb fb fb fb fc fc fc fc fc fc [ +0.000102]^ [ +0.35] 888107da8c00: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc [ +0.69] 888107da8c80: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc [ +0.69] == When this happens, ve_start_container proceeds to its error path and does kfree(ve->ve_name). The same kfree is later done in the ve_offline function. The fix sets ve->ve_name = NULL after the kfree. Signed-off-by: Valeriy Vdovin --- kernel/ve/ve.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 15f511f..1fd7d0d 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -676,6 +676,7 @@ static int ve_start_container(struct ve_struct *ve) err_list: ve_drop_context(ve); kfree(ve->ve_name); + ve->ve_name = NULL; return err; } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH vz8] cgroup/ve: moved ve workqueue stop to ve_stop_ns
Fixes: a01c704c0c3f21ec3e3e7540e91682f44f4065cd The above commit fixed the bug which can be described as follows: - It is possible that functions ve_stop_ns and cgroup1_release_agent will be executed in parallel. - cgroup1_release_agent depends on ve->is_running, which order-depends on umh_worker state (working/stopped). If it sees that ve->is_running is set, it will spawn a userspace process. - this logic can easily fail as ve_stop_ns can set is_running to false and stop umh_worker in the scheduling window of cgroup1_release_agent. cgroup1_release_agent has read ve->is_running as true and went to a long sleep, during which ve_stop_ns worked and set ve->is_running to 0 and stopped umh_worker as well. When cgroup1_release_agent wakes up, it does not know is_running is already 0 and will just try to spawn a new userspace program via umh_worker. - The stopped umh_worker has no way to signal to the caller that it will not execute the task. Instead it just hangs. - The fix then would be to eliminate the data race between ve_stop_ns and cgroup1_release_agent by locking ve->op_sem for the whole period of checking ve->is_running flag and subsequent spawn of userspace process via still running umh_worker. This way ve_stop_ns will not be able to proceed past ve->is_running = 0 to stop the umh_worker until the userspace process is terminated. The fix has resolved the data race between ve_stop_ns and cgroup1_release_agent. Unfortunately it has also introduced a new deadlock: - Suppose that ve_stop_ns has already finished and returned and now ve_exit_ns starts. - First ve_exit_ns will acquire ve->op_sem. - Then ve_exit_ns will call cgroup_unmark_ve_roots. Inside of cgroup_unmark_ve_roots ve_workqueue will be flushed. - Workqueue flush is a blocking operation which will block with ve->op_sem acquired. - Also workqueue flush will block until all cgroup1_release_agent work is done. - cgroup1_release_agent will not finish because it will also try to acquire ve->op_sem. 
- We have a deadlock between ve_exit_ns and cgroup1_release_agent now. How do we fix that so that both ve_stop_ns and ve_exit_ns do not have synchronization issues? We should not have ve workqueue working during ve_exit_ns. It has no way of spawning userspace processes anyways since ve_stop_ns has stopped umh_worker. Ideally, ve workqueue should be stopped before umh_worker. That's why we move ve workqueue stopping code to ve_stop_ns after ve->is_running = 0, but before ve_stop_umh. Now because ve workqueue is guaranteed to work only when umh_worker also works, we can remove the 'if (!ve->is_running)' check from cgroup1_release_agent together with ve->op_sem locking code. One issue left is that the newly emptied cgroups can still get into the workqueue via ve_add_to_release_list. This is a big issue because following ve_stop_ns many container processes will start dying and their cgroups will become empty and they will be added to ve->release_list. cgroup1_release_agent among all was responsible to remove the cgroups from this list at ve_exit_ns and now it's impossible to do this cleanup. To fix this, we need a flag release_list_allow that will be set to false in ve_stop_ns, so that ve_add_to_release_list could say no to all cgroups that try to add after ve_stop_ns. What's left is to remove flush_workqueue code from cgroup_unmark_ve_roots, because it's too late to do the flush when workqueue is already stopped long ago. https://jira.sw.ru/browse/PSBM-127457 Signed-off-by: Valeriy Vdovin --- include/linux/ve.h| 1 + kernel/cgroup/cgroup-v1.c | 6 -- kernel/cgroup/cgroup.c| 9 - kernel/ve/ve.c| 24 ++-- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/include/linux/ve.h b/include/linux/ve.h index 3b487f8a4a50..041538e11851 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -109,6 +109,7 @@ struct ve_struct { * cgroups, that want to notify about becoming * empty, are linked to this release_list. 
*/ + boolrelease_list_allow; struct list_headrelease_list; spinlock_t release_list_lock; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index cd1a0df6c528..f6ef1f45383f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -936,10 +936,6 @@ void cgroup1_release_agent(struct work_struct *work) mutex_unlock(&cgroup_mutex); - down_write(&ve->op_sem); - if (!ve->is_running) - goto continue_with_mutex; - err = call_usermodehelper_ve(ve, argv[0], argv, envp, UMH_WAIT_EXEC); @@ -947,8 +943,6 @@ void cgroup1_release_agent(struct work_struct *work) pr_warn_ratelimited(
[Devel] [VZ8 PATCH] cgroup: revert unused code from cherry-picked release_agent patchset
This reverts some of the cherry-picked code from the release agent patchset from VZ7. Commit 38d7d0783105d3a22f9bbae8bbd9866abe646c11: fully reverted Commit 8fb15caacff3fa33ffc67dbbe2037280a3c2f3ba: the moved function cset_cgroup_from_root is returned to its original location. Signed-off-by: Valeriy Vdovin --- include/linux/cgroup.h | 28 kernel/cgroup/cgroup-internal.h | 28 ++-- kernel/cgroup/cgroup.c | 3 +-- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8f0d057abb25..42ce2ece14f8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -512,34 +512,6 @@ task_get_css(struct task_struct *task, int subsys_id) return css; } -void put_css_set_locked(struct css_set *cset); - -static inline void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* -* Ensure that the refcount doesn't hit zero while any readers -* can see it. Similar to atomic_dec_and_lock(), but for an -* rwlock -*/ - if (refcount_dec_not_one(&cset->refcount)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - refcount_inc(&cset->refcount); -} - - /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 112bd917e99d..249630a9c239 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -160,8 +160,32 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroup_root *root); +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* +* Ensure that the refcount 
doesn't hit zero while any readers +* can see it. Similar to atomic_dec_and_lock(), but for an +* rwlock +*/ + if (refcount_dec_not_one(&cset->refcount)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + refcount_inc(&cset->refcount); +} bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 09d328b76dab..779a71bdbaef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -58,7 +58,6 @@ #include #include #include -#include #include @@ -1436,7 +1435,7 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) } /* look up cgroup associated with given css_set on the specified hierarchy */ -struct cgroup *cset_cgroup_from_root(struct css_set *cset, +static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res = NULL; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 13/14] cgroup/ve: pass cgroup_root to ve_set(get)_release_agent
Due to virtualization of release_agent cgroup property, cgroup1_show_options has become more complex. struct cgroup_root is one of the arguments to that function, it was previously holding the value of release_agent. But now this property is per-ve AND per-cgroup. That's why to find the right release_agent value, the code should convert cgroup_root into one specific cgroup that is a 'virtual cgroup root' of a container, represented by the current VE. Getting ve is trivial but cgroup can be found by a helper function that will iterate css_set links under cgroup_mutex lock. There is a lock inversion problem when using cgroup_mutex in cgroup1_show_options, lockdep shows cgroup_mutex conflicts with kernfs_node->dep_map. This can be solved easily by converting per-cgroup data structure in VE into per-cgroup-root. This way we can provide ve_set(get)_release_agent_path directly with a struct cgroup_root argument. For each cgroup hierarchy there is only one root and for each VE there can only be one virtual root either, that's why it is safe to just use cgroup_root as a key to find the proper release_agent path in each VE. 
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/ve.h| 8 --- kernel/cgroup/cgroup-v1.c | 44 +++ kernel/cgroup/cgroup.c| 5 +++-- kernel/ve/ve.c| 19 +++-- 4 files changed, 27 insertions(+), 49 deletions(-) diff --git a/include/linux/ve.h b/include/linux/ve.h index 7cef4b39847e..3b487f8a4a50 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -145,12 +145,14 @@ extern int nr_ve; void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); -int ve_set_release_agent_path(struct cgroup *cgroot, +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup_root *cgroot, const char *release_agent); -const char *ve_get_release_agent_path(struct cgroup *cgrp_root); +const char *ve_get_release_agent_path(struct ve_struct *ve, + struct cgroup_root *cgroot); -void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp); +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, + struct cgroup_root *cgrp); extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 46be2f688503..993ac38b895f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -577,7 +577,8 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, } if (root_cgrp->ve_owner) - ret = ve_set_release_agent_path(root_cgrp, strstrip(buf)); + ret = ve_set_release_agent_path(root_cgrp->ve_owner, + root_cgrp->root, strstrip(buf)); else ret = -ENODEV; @@ -598,7 +599,9 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) root_cgrp = cgroup_get_local_root(cgrp); if (root_cgrp->ve_owner) { rcu_read_lock(); - release_agent = ve_get_release_agent_path(root_cgrp); + release_agent = ve_get_release_agent_path( + rcu_dereference(root_cgrp->ve_owner), + root_cgrp->root); if (release_agent) seq_puts(seq, release_agent); @@ -910,7 +913,7 @@ void cgroup1_release_agent(struct 
work_struct *work) goto continue_free; } - release_agent = ve_get_release_agent_path(root_cgrp); + release_agent = ve_get_release_agent_path(ve, root_cgrp->root); *agentbuf = 0; if (release_agent) @@ -931,7 +934,9 @@ void cgroup1_release_agent(struct work_struct *work) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; + mutex_unlock(&cgroup_mutex); + err = call_usermodehelper_ve(ve, argv[0], argv, envp, UMH_WAIT_EXEC); @@ -939,6 +944,7 @@ void cgroup1_release_agent(struct work_struct *work) pr_warn_ratelimited("cgroup1_release_agent " "%s %s failed: %d\n", agentbuf, pathbuf, err); + up_write(&ve->op_sem); mutex_lock(&cgroup_mutex); continue_free: kfree(pathbuf); @@ -989,7 +995,6 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo const char *release_agent; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; - struct cgroup *root_cgrp = &root->cgrp; int ssid; for_each_subsys(ss, ssid) @@ -1003,32 +1008,7 @@ static int cgr
[Devel] [PATCH VZ8 v3 14/14] cgroup/ve: do not run release_agent on non-running ve
cgroup1_release_agent is a function that runs within a private ve workqueue. When executed, it runs an executable in userspace by a call to call_usermodehelper_ve. There is a conflict: when a ve is being shut down and some of the last cgroups get deleted at the same time, the workqueue might still be running, but ve_stop_ns has already been called. ve_stop_ns will stop the usermode helper threads needed for call_usermodehelper_ve. Because of that, a call to call_usermodehelper_ve will never return, causing a hang. To defeat that hang, the VZ7 code of call_usermodehelper_ve included a check that the ve is still running before running the userspace executable. It also checked for the ve->init_task->flags & PF_EXITING condition. But in VZ8 the whole usermodehelper infrastructure is very different. Also, VZ8 does not have ve->init_task among its fields. That's why it seems more relevant right now to do the ve->is_running check before the call to call_usermodehelper_ve. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup-v1.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 993ac38b895f..cd1a0df6c528 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -934,9 +934,12 @@ void cgroup1_release_agent(struct work_struct *work) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - mutex_unlock(&cgroup_mutex); + down_write(&ve->op_sem); + if (!ve->is_running) + goto continue_with_mutex; + err = call_usermodehelper_ve(ve, argv[0], argv, envp, UMH_WAIT_EXEC); @@ -944,6 +947,7 @@ void cgroup1_release_agent(struct work_struct *work) pr_warn_ratelimited("cgroup1_release_agent " "%s %s failed: %d\n", agentbuf, pathbuf, err); +continue_with_mutex: up_write(&ve->op_sem); mutex_lock(&cgroup_mutex); continue_free: -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 11/14] ve/cgroup: added release_agent to each container root cgroup.
Each container will now have access to its own cgroup release_agent file. Creation: Normally all cgroup files are created during a call to cgroup_create by the cgroup_populate_dir function. It creates (or does not create) all cgroup files once, and they immediately become visible to userspace as filesystem objects. Due to the specifics of the container creation process, it is not possible to use the same code for 'release_agent' file creation. For a VE to start operating, first a list of ordinary cgroups is created for each subsystem, then the set of newly created cgroups is converted to "virtual roots", so at the time when cgroup_create is executed, there is no knowledge of whether or not the "release_agent" file should be created. This information only comes at the "conversion" step, which is the 'cgroup_mark_ve_roots' function. As the file is created dynamically in a live cgroup, a rather delicate locking sequence is present in the new code: - each new "virtual root" cgroup will have to add a "release_agent" file, thus each cgroup's directory would need to be locked during the insertion time by cgroup->dentry->d_inode->i_mutex. - d_inode->i_mutex has an ordering dependency with cgroup_mutex (see cgroup_mount/cgroup_remount). They can not be locked in the order {lock(cgroup_mutex), lock(inode->i_mutex)}. - to collect a list of cgroups that need to become virtual, we need the cgroup_mutex lock to iterate active roots. - to overcome the above conflict we first need to collect a list of all virtual cgroups under the cgroup_mutex lock, then release it, and after that insert "release_agent" into each root under the inode->i_mutex lock. - to collect a list of cgroups on stack we utilize cgroup->cft_q_node, made specially for that purpose under its own cgroup_cft_mutex. Destruction: Destruction is done in reverse from the above within cgroup_unmark_ve_roots. After file destruction we must prevent further write operations to this file in case someone has opened this file prior to VE and cgroup destruction. 
This is achieved by checking if cgroup in the argument to cgroup_file_write function has features of host or virtual root. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ cgroup: add missing dput() in cgroup_unmark_ve_roots() cgroup_unmark_ve_roots() calls dget() on cgroup's dentry but don't have the corresponding dput() call. This leads to leaking cgroups. Add missing dput() to fix this. https://jira.sw.ru/browse/PSBM-107328 Fixes: 1ac69e183447 ("ve/cgroup: added release_agent to each container root cgroup.") (Cherry-picked from 4a1635024df1bae4f4809a3bc445f0cf64d4acf4) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 + include/linux/cgroup.h | 2 +- include/linux/ve.h | 4 +- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 37 +- kernel/cgroup/cgroup.c | 127 ++-- kernel/ve/ve.c | 42 --- 7 files changed, 179 insertions(+), 37 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 57ee48874404..be7d0f599179 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -451,6 +451,9 @@ struct cgroup { */ struct list_head cset_links; + /* Used for cgroup_mark/umark ve */ + struct list_head cft_q_node; + /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. 
Protected by diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6693cd36fd82..8f0d057abb25 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,7 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup1_release_agent(struct work_struct *work); #ifdef CONFIG_VE -extern void cgroup_mark_ve_root(struct ve_struct *ve); +int cgroup_mark_ve_roots(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif diff --git a/include/linux/ve.h b/include/linux/ve.h index 65c19f2b9b98..7cef4b39847e 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -145,11 +145,13 @@ extern int nr_ve; void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, +int ve_set_release_agent_path(struct cgroup *cgroot, const char *release_agent); const char *ve_get_release_agent_path(struct cgroup *cgrp_root); +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp); + extern struct ve_struct *get_ve
[Devel] [PATCH VZ8 v3 07/14] cgroup/ve: added helper function to get ve-related cgroup paths
This will make fake-absolute paths to support virtual ve roots in cgroup hierarchies. The path will be used in subsequent patches. Signed-off-by: Valeriy.Vdovin Reviewed-by: Kirill Tkhai --- kernel/cgroup/cgroup-v1.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fb06fc9d96ca..21a7c36fbf44 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -787,6 +787,16 @@ void cgroup1_check_for_release(struct cgroup *cgrp) schedule_work(&cgrp->release_agent_work); } +/* + * Used to get a fake-absolute path to a cgroup on kernfs filesystem, but it + * actually be relative to cgroup root, provided in the argument. + */ +static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, + struct cgroup *cgrp, char *buf, size_t buflen) +{ + return kernfs_path_from_node(cgrp->kn, ve_root_cgrp->kn, buf, buflen); +} + /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 10/14] ve/cgroup: set release_agent_path for root cgroups separately
This is done so that each container could set its own release agent. Release agent information is now stored in a per-cgroup-root data structure in ve. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ ve/cgroup: change resource release order in ve_drop_context This fixes 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c In the mentioned patch in cgroup_show_options ve->ve_ns is checked to ensure that ve->root_css_set is usable. But in ve_drop_context root_css_set is being released before ve_ns, which is a bug. root_css_set will now be set to NULL after ve_ns is released. This reordering only affects the described piece of code in cgroup_show_options. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ cgroup: do not use cgroup_mutex in cgroup_show_options In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options started to lock cgroup_mutex, which introduced a new deadlock possibility, described below: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) Clearly cgroup_mutex can not be locked right after locking namespace_sem, because the opposite locking order is also present in the code, and it should be removed from cgroup_show_options. After reviewing cgroup_show_options, it was established that cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp. It was used in combination with a call to task_cgroup_from_root to ensure that root_cgrp lived long enough to access its value of release_agent path. But in this function we know that root_cgrp is part of ve->root_css_set, which holds a reference to it. 
In turn root_css_set is referenced while ve->ve_ns is not NULL, the check of which we already have in the code. This means that root_cgrp is valid until ve->ve_ns is valid. ve->ve_ns is valid until the point of rcu_synchronize in ve_drop_context, that's why rcu_read_lock should be maintained all the time when root_cgrp is being accessed. The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls to this function pass ve->root_css_set as an argument and the above logic applies. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ ve: cleanup in function ve_get_release_agent_path (Cherry-picked from f1199bd9589b7c0914343dcc72f49ddaa9b98496) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 -- include/linux/ve.h | 6 +++ kernel/cgroup/cgroup-internal.h | 4 +- kernel/cgroup/cgroup-v1.c | 86 ++--- kernel/cgroup/cgroup.c | 9 ++-- kernel/ve/ve.c | 76 + 6 files changed, 150 insertions(+), 34 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 22d84aa0778e..57ee48874404 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -569,9 +569,6 @@ struct cgroup_root { /* IDs for cgroups in this hierarchy */ struct idr cgroup_idr; - /* The path to use for release notifications. 
*/ - char release_agent_path[PATH_MAX]; - /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; diff --git a/include/linux/ve.h b/include/linux/ve.h index 44369dddeb24..65c19f2b9b98 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -144,6 +144,12 @@ extern int nr_ve; #ifdef CONFIG_VE void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); + +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, + const char *release_agent); + +const char *ve_get_release_agent_path(struct cgroup *cgrp_root); + extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 4de66630d456..be0cd157d4dc 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -160,6 +160,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root); + bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); @@ -
[Devel] [PATCH VZ8 v3 12/14] ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots
During container start there might be a situation when not all cgroup hierarchies get virtualized by container manager (like vzctl). By virtualizing a cgroup hierarchy I mean creation of sub-directory within a particular mounted cgroup. When container starts it looks in css set of its init process to list all affiliated cgroups and perform actions on each. But non-virtualized cgroups will also be present in init's css_set and they should not be touched from inside of any non-root ve. Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- kernel/cgroup/cgroup.c | 17 + 1 file changed, 17 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8aea78f07b5b..75997b503d3c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1927,6 +1927,23 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft, bool activate); #ifdef CONFIG_VE +static inline bool is_virtualized_cgroup(struct cgroup *cgrp) +{ + /* +* no parent means this is the host cgroup +*/ + if (!cgrp->kn->parent) + return false; + + if (cgrp->root->subsys_mask) + return true; + + if (!strcmp(cgrp->root->name, "systemd")) + return true; + + return false; +} + int cgroup_mark_ve_roots(struct ve_struct *ve) { int err; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 09/14] ve/cgroup: private per-cgroup-root data container
As long as each ve is internally attached to a particular css_set via it's init_task, it's good to have container with parameters, which are common to each cgroup subsystem hierarchy, rooting from it's virtual root. (Cherry-picked from 4a98f07102fd248ad4218a07b5ec5ec90da10288) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/ve.h | 7 + kernel/ve/ve.c | 75 ++ 2 files changed, 82 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 2ab39b607708..44369dddeb24 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -114,6 +114,13 @@ struct ve_struct { struct workqueue_struct *wq; struct work_struct release_agent_work; + + /* +* List of data, private for each root cgroup in +* ve's css_set. +*/ + struct list_headper_cgroot_list; + spinlock_t per_cgroot_list_lock; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 934a5ff1c9bb..a108cb63bc9f 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -30,6 +30,14 @@ #include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */ +struct per_cgroot_data { + struct list_head list; + /* +* data is related to this cgroup +*/ + struct cgroup *cgroot; +}; + extern struct kmapset_set sysfs_ve_perms_set; static struct kmem_cache *ve_cachep; @@ -67,6 +75,9 @@ struct ve_struct ve0 = { .release_list = LIST_HEAD_INIT(ve0.release_list), .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work, cgroup1_release_agent), + .per_cgroot_list= LIST_HEAD_INIT(ve0.per_cgroot_list), + .per_cgroot_list_lock = __SPIN_LOCK_UNLOCKED( + ve0.per_cgroot_list_lock), }; EXPORT_SYMBOL(ve0); @@ -199,6 +210,53 @@ int nr_threads_ve(struct ve_struct *ve) } EXPORT_SYMBOL(nr_threads_ve); +static struct per_cgroot_data *per_cgroot_data_find_locked( + struct list_head *per_cgroot_list, struct cgroup *cgroot) +{ + struct per_cgroot_data *data; + + list_for_each_entry(data, per_cgroot_list, list) { + if (data->cgroot == cgroot) + return data; + } + return NULL; +} + +static inline struct 
per_cgroot_data *per_cgroot_get_or_create( + struct ve_struct *ve, struct cgroup *cgroot) +{ + struct per_cgroot_data *data, *other_data; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + + if (data) + return data; + + data = kzalloc(sizeof(struct per_cgroot_data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + other_data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + + if (other_data) { + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + kfree(data); + return other_data; + } + + data->cgroot = cgroot; + list_add(&data->list, &ve->per_cgroot_list); + + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + return data; +} + struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id) { struct cgroup_subsys_state *css; @@ -533,6 +591,19 @@ static int ve_start_container(struct ve_struct *ve) return err; } +static void ve_per_cgroot_free(struct ve_struct *ve) +{ + struct per_cgroot_data *data, *saved; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) { + list_del_init(&data->list); + kfree(data); + } + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); +} + void ve_stop_ns(struct pid_namespace *pid_ns) { struct ve_struct *ve = current->task_ve; @@ -589,6 +660,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) ve_workqueue_stop(ve); + ve_per_cgroot_free(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -699,6 +772,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ INIT_WORK(&ve->release_agent_work, cgroup1_release_agent); spin_lock_init(&ve->release_list_lock); + spin_lock_init(&ve->per_cgroot_list_lock); ve->_randomize_va_space = ve0._randomize_va_space; @@ -721,6 +795,7 @@ static struct cgro
[Devel] [PATCH VZ8 v3 05/14] cgroup: exported put_css_set and get_css_set to cgroup.h
(Cherry-picked from 8222bbe47ed1e3824e0890a1404735324189c0cb) Signed-off-by: Valeriy.Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup.h | 28 kernel/cgroup/cgroup-internal.h | 27 --- kernel/cgroup/cgroup.c | 1 + 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 808b31605d07..391702cf43bd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -512,6 +512,34 @@ task_get_css(struct task_struct *task, int subsys_id) return css; } +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* +* Ensure that the refcount doesn't hit zero while any readers +* can see it. Similar to atomic_dec_and_lock(), but for an +* rwlock +*/ + if (refcount_dec_not_one(&cset->refcount)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + refcount_inc(&cset->refcount); +} + + /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index ce1c1553c696..829997989c41 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -159,33 +159,6 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -void put_css_set_locked(struct css_set *cset); - -static inline void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* -* Ensure that the refcount doesn't hit zero while any readers -* can see it. 
Similar to atomic_dec_and_lock(), but for an -* rwlock -*/ - if (refcount_dec_not_one(&cset->refcount)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - refcount_inc(&cset->refcount); -} - bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 946031fcb393..de105e651607 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -58,6 +58,7 @@ #include #include #include +#include #include -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 08/14] ve/cgroup: moved release_agent from system_wq to per-ve workqueues
Each VE should execute release agent notifications within it's own workqueue. This way we achieve a more fine-grained control over release_agent work flushing at VE destruction. (Cherry-picked from 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 10 ++- include/linux/cgroup.h | 2 + include/linux/ve.h | 10 +++ kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 109 kernel/cgroup/cgroup.c | 12 +++- kernel/ve/ve.c | 48 ++ 7 files changed, 159 insertions(+), 33 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a8eb94d2f97f..22d84aa0778e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -451,6 +451,13 @@ struct cgroup { */ struct list_head cset_links; + /* +* Linked list running through all cgroups that can +* potentially be reaped by the release agent. Protected by +* release_list_lock +*/ + struct list_head release_list; + /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with @@ -488,9 +495,6 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used to schedule release agent */ - struct work_struct release_agent_work; - /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 17ee29f4071b..6693cd36fd82 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -897,6 +897,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup1_release_agent(struct work_struct *work); + #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); diff --git a/include/linux/ve.h b/include/linux/ve.h index d3c1ab840444..2ab39b607708 100644 --- 
a/include/linux/ve.h +++ b/include/linux/ve.h @@ -105,7 +105,15 @@ struct ve_struct { unsigned long aio_nr; unsigned long aio_max_nr; #endif + /* +* cgroups, that want to notify about becoming +* empty, are linked to this release_list. +*/ + struct list_headrelease_list; + spinlock_t release_list_lock; + struct workqueue_struct *wq; + struct work_struct release_agent_work; }; struct ve_devmnt { @@ -127,6 +135,8 @@ extern int nr_ve; (ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN)) #ifdef CONFIG_VE +void ve_add_to_release_list(struct cgroup *cgrp); +void ve_rm_from_release_list(struct cgroup *cgrp); extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 829997989c41..4de66630d456 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -135,6 +135,7 @@ extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; extern struct file_system_type cgroup_fs_type; +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp); /* iterate across the hierarchies */ #define for_each_root(root)\ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 21a7c36fbf44..c1891317ae3a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -784,7 +784,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); + ve_add_to_release_list(cgrp); } /* @@ -822,42 +822,95 @@ static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, */ void cgroup1_release_agent(struct work_struct *work) { - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; + struct ve_struct *ve; 
+ unsigned long flags; + char *agentbuf; + + agentbuf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!agentbuf) { + pr_warn("failed to allocate agentbuf\n"); + return; + } + ve = container_of(work, struct ve_struct, release_agent_work); mutex_lock(&cgroup_mutex); + spin_lock_irqsave(&v
[Devel] [PATCH VZ8 v3 03/14] ve/cgroup: implemented per-ve workqueue.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from 0293870666c4f96bd56f612d94f560626c76e2fd) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/ve.h | 1 + kernel/ve/ve.c | 25 + 2 files changed, 26 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 103d0a9044fc..d3c1ab840444 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -105,6 +105,7 @@ struct ve_struct { unsigned long aio_nr; unsigned long aio_max_nr; #endif + struct workqueue_struct *wq; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index f7d605357d2e..25455264b225 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -388,6 +388,21 @@ static void ve_set_vdso_time(struct ve_struct *ve, u64 time) *vdso_start_time = time; } +static int ve_workqueue_start(struct ve_struct *ve) +{ + ve->wq = alloc_workqueue("ve_wq_%s", + WQ_SYSFS|WQ_FREEZABLE|WQ_UNBOUND, 8, ve->ve_name); + + if (!ve->wq) + return -ENOMEM; + return 0; +} + +static void ve_workqueue_stop(struct ve_struct *ve) +{ + destroy_workqueue(ve->wq); +} + /* under ve->op_sem write-lock */ static int ve_start_container(struct ve_struct *ve) { @@ -443,6 +458,10 @@ static int ve_start_container(struct ve_struct *ve) if (err) goto err_umh; + err = ve_workqueue_start(ve); + if (err) + goto err_workqueue; + err = ve_hook_iterate_init(VE_SS_CHAIN, ve); if (err < 0) goto err_iterate; @@ -458,6 +477,8 @@ static int ve_start_container(struct ve_struct *ve) return 0; err_iterate: + ve_workqueue_stop(ve); +err_workqueue: ve_stop_umh(ve); err_umh: ve_stop_kthreadd(ve); @@ -523,6 +544,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) cgroup_unmark_ve_roots(ve); + ve_workqueue_stop(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -1363,6 +1386,8 @@ static int __init ve_subsys_init(void) { ve_cachep = KMEM_CACHE_USERCOPY(ve_struct, SLAB_PANIC, core_pattern); list_add(&ve0.ve_list, &ve_list_head); + ve0.wq = alloc_workqueue("ve0_wq", WQ_FREEZABLE|WQ_UNBOUND, 8); + BUG_ON(!ve0.wq); return 0; } late_initcall(ve_subsys_init); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 04/14] cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from e828803a5d776125c9c329f194aff74fb4ec181a) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 5 + include/linux/cgroup.h | 4 kernel/cgroup/cgroup-v1.c | 15 +++ 3 files changed, 24 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a3b309ab1a90..772fcee71f37 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -360,6 +360,11 @@ struct cgroup_freezer_state { int nr_frozen_tasks; }; +struct cgroup_rcu_string { + struct rcu_head rcu_head; + char val[]; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0a42c3d43fa..808b31605d07 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -689,6 +689,10 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen); + +struct cgroup_rcu_string; + +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index db10a1ed282a..fb06fc9d96ca 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -231,6 +231,21 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) kfree(tofree); } +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len) +{ + struct cgroup_rcu_string *result; + size_t buflen = len + 1; + + result = kmalloc(sizeof(*result) + buflen, GFP_KERNEL); + if (!result) + return ERR_PTR(-ENOMEM); + if (strlcpy(result->val, str, buflen) >= buflen) { + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } + return result; +} + /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number 
of unique elements. -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 06/14] ve/cgroup: Added ve_owner field to cgroup
Each cgroup representing a host or a container root of a cgroup subsystem hierarchy will have this field set to a valid ve_struct that owns this root. This way each cgroup in a system will be able to know its owning VE. Non-root cgroups will have this field set to NULL. This is an optimization for cleanup code: at VE destruction we only need to iterate over all root cgroups to clear the reference to the former owning VE, rather than over the whole cgroup hierarchy. Still, any cgroup that wants to know about its owning VE can find its virtual root cgroup and read its ve_owner field. cgroup->ve_owner is declared as an RCU pointer, because it fits RCU semantics - rare writes/frequent reads. ve_owner will be read from multiple locations in code in further patches and is only rarely set at cgroup_mark_ve_root/cgroup_mount. cgroup_get_ve_owner is a read wrapper for this purpose. (Cherry-picked from eb9c0bfae39fe336173a0dec11bc24f7275de3f8) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 +++ include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 44 + 3 files changed, 48 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 772fcee71f37..a8eb94d2f97f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -526,6 +526,9 @@ struct cgroup { u64 subgroups_limit; + /* ve_owner, responsible for running release agent. 
*/ + struct ve_struct __rcu *ve_owner; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 391702cf43bd..17ee29f4071b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,6 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index de105e651607..beb26dd7cd88 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -303,6 +303,43 @@ bool cgroup_on_dfl(const struct cgroup *cgrp) return cgrp->root == &cgrp_dfl_root; } +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp) +{ + /* +* Find nearest root cgroup, which might be host cgroup root +* or ve cgroup root. +* +* -> local_root +* \^ +* | +* \ | +* ---> from here +*\ +* -> local_root +* \ ^ +* | +* \ | +* --->from here +*/ + + while (cgrp->kn->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags)) + cgrp = cgrp->kn->parent->priv; + + return cgrp; +} + +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp) +{ + struct ve_struct *ve; + /* Caller should hold RCU */ + + cgrp = cgroup_get_local_root(cgrp); + ve = rcu_dereference(cgrp->ve_owner); + if (!ve) + ve = get_ve0(); + return ve; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -1900,6 +1937,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, ve); set_bit(CGRP_VE_ROOT, &cgrp->flags); } link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); @@ -1907,6 +1945,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) 
rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + synchronize_rcu(); } void cgroup_unmark_ve_roots(struct ve_struct *ve) @@ -1924,12 +1963,15 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, NULL); clear_bit(CGRP_VE_ROOT, &cgrp->flags); } unlock: rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + /* ve_owner == NULL will be visible */ + synchronize_rcu(); } struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) @@ -2114,6 +2156,8 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct dentry *dentry; bool new_sb = false; + RCU_INIT_POINTER(root-
[Devel] [PATCH VZ8 v3 02/14] cgroup/cfs: added 'activate' option to cgroup_add_file
In kernfs files get created in 'deactivated' state, which means they are not visible. Add option to activate the file after creation immediately making it visible in the parent directory. Will be used in later patches. Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- kernel/cgroup/cgroup.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b71d4ccb2f0c..946031fcb393 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3823,7 +3823,7 @@ static void cgroup_file_notify_timer(struct timer_list *timer) } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - struct cftype *cft) + struct cftype *cft, bool activate) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3865,6 +3865,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn_link)) return PTR_ERR(kn_link); } + if (activate) + kernfs_activate(kn); return 0; } @@ -3902,7 +3904,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { - ret = cgroup_add_file(css, cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft, false); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 01/14] ve/cgroup: unmark ve-root cgroups at container stop
fixes: 915a1130c7ee4ffb6de3f69a5bd98c5ee42a723f Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai Cherry-picked from 5dceccf5dd794673ebb1b0e6840d96aa654ec33e) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 23 +++ kernel/ve/ve.c | 3 +++ 3 files changed, 27 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4f0dd51338bf..c0a42c3d43fa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -867,6 +867,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); +void cgroup_unmark_ve_roots(struct ve_struct *ve); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0335a07f64e6..b71d4ccb2f0c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1908,6 +1908,29 @@ void cgroup_mark_ve_root(struct ve_struct *ve) spin_unlock_irq(&css_set_lock); } +void cgroup_unmark_ve_roots(struct ve_struct *ve) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + + rcu_read_lock(); + cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset; + if (WARN_ON(!cset)) + goto unlock; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + cgrp = link->cgrp; + clear_bit(CGRP_VE_ROOT, &cgrp->flags); + } +unlock: + rcu_read_unlock(); + + spin_unlock_irq(&css_set_lock); +} + struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) { struct cgroup *ve_root = NULL; diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b83b2b66a875..f7d605357d2e 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -520,6 +520,9 @@ void ve_exit_ns(struct pid_namespace *pid_ns) */ if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) goto unlock; + + cgroup_unmark_ve_roots(ve); + /* * At this point all userspace tasks in container are dead. 
*/ -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v3 00/14] Port release_agent virtualization from vz7
This patchset ports the virtualization of cgroup release_agent from vz7. The major challenges of porting are the differences between the vz7 and vz8 cgroup implementations: - transition of cgroups to kernfs - a slightly changed locking scheme, which relies on css_set_lock in places that previously relied on cgroup_mutex. A small number of patches have been ported without modifications, but most of the patches required substantial modification due to the factors described above. v1: - original patchset v2: - removed port of CGRP_REMOVED, because VZ8 uses CSS_ONLINE for the same purpose - changed the ve_set(get)_release_agent_path signature to a more optimal one - added ve->is_running check before calling userspace executable v3: - use goto after check for ve->is_running in last patch Valeriy Vdovin (14): ve/cgroup: unmark ve-root cgroups at container stop cgroup/cfs: added 'activate' option to cgroup_add_file ve/cgroup: implemented per-ve workqueue. cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset. cgroup: exported put_css_set and get_css_set to cgroup.h ve/cgroup: Added ve_owner field to cgroup cgroup/ve: added helper function to get ve-related cgroup paths ve/cgroup: moved release_agent from system_wq to per-ve workqueues ve/cgroup: private per-cgroup-root data container ve/cgroup: set release_agent_path for root cgroups separately ve/cgroup: added release_agent to each container root cgroup. 
ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots cgroup/ve: pass cgroup_root to ve_set(get)_release_agent cgroup/ve: do not run release_agent on non-running ve include/linux/cgroup-defs.h | 24 +++- include/linux/cgroup.h | 38 - include/linux/ve.h | 28 kernel/cgroup/cgroup-internal.h | 31 +--- kernel/cgroup/cgroup-v1.c | 235 --- kernel/cgroup/cgroup.c | 212 ++-- kernel/ve/ve.c | 242 +++- 7 files changed, 707 insertions(+), 103 deletions(-) -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 11/14] ve/cgroup: added release_agent to each container root cgroup.
Each container will now have access to its own cgroup release_agent file. Creation: Normally all cgroup files are created during a call to cgroup_create by the cgroup_populate_dir function. It creates, or does not create, all cgroup files at once, and they immediately become visible to userspace as filesystem objects. Due to the specifics of the container creation process, it is not possible to use the same code for 'release_agent' file creation. For a VE to start operating, first a list of ordinary cgroups is created for each subsystem, then the set of newly created cgroups is converted to "virtual roots", so at the time when cgroup_create is executed, it is not yet known whether or not the "release_agent" file should be created. This information only becomes available at the "conversion" step, which is the 'cgroup_mark_ve_roots' function. As the file is created dynamically in a live cgroup, a rather delicate locking sequence is present in the new code: - each new "virtual root" cgroup will have to add the "release_agent" file, thus each cgroup's directory needs to be locked during the insertion by cgroup->dentry->d_inode->i_mutex. - d_inode->i_mutex has an ordering dependency with cgroup_mutex (see cgroup_mount/cgroup_remount). They can not be locked in the order {lock(cgroup_mutex), lock(inode->i_mutex)}. - to collect a list of cgroups that need to become virtual, we need the cgroup_mutex lock to iterate over active roots. - to overcome the above conflict we first need to collect a list of all virtual cgroups under the cgroup_mutex lock, then release it, and after that insert "release_agent" into each root under the inode->i_mutex lock. - to collect a list of cgroups on stack we utilize cgroup->cft_q_node, made specially for that purpose, under its own cgroup_cft_mutex. Destruction: Destruction is done in reverse from the above within cgroup_unmark_ve_roots. After file destruction we must prevent further write operations to this file in case someone has opened this file prior to VE and cgroup destruction. 
This is achieved by checking if cgroup in the argument to cgroup_file_write function has features of host or virtual root. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ cgroup: add missing dput() in cgroup_unmark_ve_roots() cgroup_unmark_ve_roots() calls dget() on cgroup's dentry but don't have the corresponding dput() call. This leads to leaking cgroups. Add missing dput() to fix this. https://jira.sw.ru/browse/PSBM-107328 Fixes: 1ac69e183447 ("ve/cgroup: added release_agent to each container root cgroup.") (Cherry-picked from 4a1635024df1bae4f4809a3bc445f0cf64d4acf4) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 + include/linux/cgroup.h | 2 +- include/linux/ve.h | 4 +- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 37 +- kernel/cgroup/cgroup.c | 127 ++-- kernel/ve/ve.c | 42 --- 7 files changed, 179 insertions(+), 37 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 57ee48874404..be7d0f599179 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -451,6 +451,9 @@ struct cgroup { */ struct list_head cset_links; + /* Used for cgroup_mark/umark ve */ + struct list_head cft_q_node; + /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. 
Protected by diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6693cd36fd82..8f0d057abb25 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,7 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup1_release_agent(struct work_struct *work); #ifdef CONFIG_VE -extern void cgroup_mark_ve_root(struct ve_struct *ve); +int cgroup_mark_ve_roots(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif diff --git a/include/linux/ve.h b/include/linux/ve.h index 65c19f2b9b98..7cef4b39847e 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -145,11 +145,13 @@ extern int nr_ve; void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, +int ve_set_release_agent_path(struct cgroup *cgroot, const char *release_agent); const char *ve_get_release_agent_path(struct cgroup *cgrp_root); +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp); + extern struct ve_struct *get_ve
[Devel] [PATCH VZ8 v2 04/14] cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from e828803a5d776125c9c329f194aff74fb4ec181a) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 5 + include/linux/cgroup.h | 4 kernel/cgroup/cgroup-v1.c | 15 +++ 3 files changed, 24 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a3b309ab1a90..772fcee71f37 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -360,6 +360,11 @@ struct cgroup_freezer_state { int nr_frozen_tasks; }; +struct cgroup_rcu_string { + struct rcu_head rcu_head; + char val[]; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0a42c3d43fa..808b31605d07 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -689,6 +689,10 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen); + +struct cgroup_rcu_string; + +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index db10a1ed282a..fb06fc9d96ca 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -231,6 +231,21 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) kfree(tofree); } +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len) +{ + struct cgroup_rcu_string *result; + size_t buflen = len + 1; + + result = kmalloc(sizeof(*result) + buflen, GFP_KERNEL); + if (!result) + return ERR_PTR(-ENOMEM); + if (strlcpy(result->val, str, buflen) >= buflen) { + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } + return result; +} + /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number 
of unique elements. -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 07/14] cgroup/ve: added helper function to get ve-related cgroup paths
This will make fake-absolute paths to support virtual ve roots in cgroup hierarchies. The path will be used in subsequent patches. Signed-off-by: Valeriy.Vdovin Reviewed-by: Kirill Tkhai --- kernel/cgroup/cgroup-v1.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fb06fc9d96ca..21a7c36fbf44 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -787,6 +787,16 @@ void cgroup1_check_for_release(struct cgroup *cgrp) schedule_work(&cgrp->release_agent_work); } +/* + * Used to get a fake-absolute path to a cgroup on kernfs filesystem, but it + * actually be relative to cgroup root, provided in the argument. + */ +static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, + struct cgroup *cgrp, char *buf, size_t buflen) +{ + return kernfs_path_from_node(cgrp->kn, ve_root_cgrp->kn, buf, buflen); +} + /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 10/14] ve/cgroup: set release_agent_path for root cgroups separately
This is done so that each container could set its own release agent. Release agent information is now stored in per-cgroup-root data structure in ve. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ ve/cgroup: change resource release order in ve_drop_context This fixes 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c In the mentioned patch in cgroup_show_options ve->ve_ns is checked to ensure that ve->root_css_set is usable. But in ve_drop_context root_css_set is being released before ve_ns, which is a bug. root_css_set will now be set to NULL after ve_ns is released. This reordering only affects the described piece of code in cgroup_show_options. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ cgroup: do not use cgroup_mutex in cgroup_show_options In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options started to lock cgroup_mutex, which introduced new deadlock possibility, described below: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) Clearly cgroup_mutex can not be locked right after locking namespace_sem, because the opposite locking order is also present in the code, so the cgroup_mutex locking should be removed from cgroup_show_options. After reviewing cgroup_show_options, it was established that cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp. It was used in combination with a call to task_cgroup_from_root to ensure that root_cgrp lived long enough to access its value of release_agent path. But in this function we know that root_cgrp is part of ve->root_css_set, which holds reference to it. 
In turn root_css_set is referenced while ve->ve_ns is not NULL, the check of which we already have in the code. This means that root_cgrp is valid until ve->ve_ns is valid. ve->ve_ns is valid until the point of rcu_synchronize in ve_drop_context, that's why rcu_read_lock should be maintained all the time when root_cgrp is being accessed. The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls to this function pass ve->root_css_set as an argument and the above logic applies. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ ve: cleanup in function ve_get_release_agent_path (Cherry-picked from f1199bd9589b7c0914343dcc72f49ddaa9b98496) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 -- include/linux/ve.h | 6 +++ kernel/cgroup/cgroup-internal.h | 4 +- kernel/cgroup/cgroup-v1.c | 86 ++--- kernel/cgroup/cgroup.c | 9 ++-- kernel/ve/ve.c | 76 + 6 files changed, 150 insertions(+), 34 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 22d84aa0778e..57ee48874404 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -569,9 +569,6 @@ struct cgroup_root { /* IDs for cgroups in this hierarchy */ struct idr cgroup_idr; - /* The path to use for release notifications. 
*/ - char release_agent_path[PATH_MAX]; - /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; diff --git a/include/linux/ve.h b/include/linux/ve.h index 44369dddeb24..65c19f2b9b98 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -144,6 +144,12 @@ extern int nr_ve; #ifdef CONFIG_VE void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); + +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, + const char *release_agent); + +const char *ve_get_release_agent_path(struct cgroup *cgrp_root); + extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 4de66630d456..be0cd157d4dc 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -160,6 +160,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root); + bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); @@ -
[Devel] [PATCH VZ8 v2 08/14] ve/cgroup: moved release_agent from system_wq to per-ve workqueues
Each VE should execute release agent notifications within it's own workqueue. This way we achieve a more fine-grained control over release_agent work flushing at VE destruction. (Cherry-picked from 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 10 ++- include/linux/cgroup.h | 2 + include/linux/ve.h | 10 +++ kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 109 kernel/cgroup/cgroup.c | 12 +++- kernel/ve/ve.c | 48 ++ 7 files changed, 159 insertions(+), 33 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a8eb94d2f97f..22d84aa0778e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -451,6 +451,13 @@ struct cgroup { */ struct list_head cset_links; + /* +* Linked list running through all cgroups that can +* potentially be reaped by the release agent. Protected by +* release_list_lock +*/ + struct list_head release_list; + /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with @@ -488,9 +495,6 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used to schedule release agent */ - struct work_struct release_agent_work; - /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 17ee29f4071b..6693cd36fd82 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -897,6 +897,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup1_release_agent(struct work_struct *work); + #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); diff --git a/include/linux/ve.h b/include/linux/ve.h index d3c1ab840444..2ab39b607708 100644 --- 
a/include/linux/ve.h +++ b/include/linux/ve.h @@ -105,7 +105,15 @@ struct ve_struct { unsigned long aio_nr; unsigned long aio_max_nr; #endif + /* +* cgroups, that want to notify about becoming +* empty, are linked to this release_list. +*/ + struct list_headrelease_list; + spinlock_t release_list_lock; + struct workqueue_struct *wq; + struct work_struct release_agent_work; }; struct ve_devmnt { @@ -127,6 +135,8 @@ extern int nr_ve; (ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN)) #ifdef CONFIG_VE +void ve_add_to_release_list(struct cgroup *cgrp); +void ve_rm_from_release_list(struct cgroup *cgrp); extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 829997989c41..4de66630d456 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -135,6 +135,7 @@ extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; extern struct file_system_type cgroup_fs_type; +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp); /* iterate across the hierarchies */ #define for_each_root(root)\ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 21a7c36fbf44..c1891317ae3a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -784,7 +784,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); + ve_add_to_release_list(cgrp); } /* @@ -822,42 +822,95 @@ static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, */ void cgroup1_release_agent(struct work_struct *work) { - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; + struct ve_struct *ve; 
+ unsigned long flags; + char *agentbuf; + + agentbuf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!agentbuf) { + pr_warn("failed to allocate agentbuf\n"); + return; + } + ve = container_of(work, struct ve_struct, release_agent_work); mutex_lock(&cgroup_mutex); + spin_lock_irqsave(&v
[Devel] [PATCH VZ8 v2 14/14] cgroup/ve: do not run release_agent on non-running ve
cgroup1_release_agent is a function that runs within a private ve workqueue. When executed, it runs an executable in userspace by a call to call_usermodehelper_ve. There is a conflict: when a ve is being shut down and some of its last cgroups get deleted at the same time, the workqueue might still be running, although ve_stop_ns has already been called. ve_stop_ns will stop usermode helper threads, needed for call_usermodehelper_ve. Because of that a call to call_usermodehelper_ve will never return, causing a hang. To prevent that hang, the VZ7 code of call_usermodehelper_ve included the check that ve is still running before running the userspace executable. It also checked for the ve->init_task->flags & PF_EXITING condition. But in VZ8 the whole usermodehelper infrastructure is substantially different. Also VZ8 does not have ve->init_task in its fields. That is why it seems more relevant right now to do the ve->is_running check before the call to call_usermodehelper_ve. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup-v1.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 993ac38b895f..2521d2727b42 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -937,6 +937,13 @@ void cgroup1_release_agent(struct work_struct *work) mutex_unlock(&cgroup_mutex); + down_write(&ve->op_sem); + if (!ve->is_running) { + up_write(&ve->op_sem); + mutex_lock(&cgroup_mutex); + goto continue_free; + } + err = call_usermodehelper_ve(ve, argv[0], argv, envp, UMH_WAIT_EXEC); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 09/14] ve/cgroup: private per-cgroup-root data container
As long as each ve is internally attached to a particular css_set via it's init_task, it's good to have container with parameters, which are common to each cgroup subsystem hierarchy, rooting from it's virtual root. (Cherry-picked from 4a98f07102fd248ad4218a07b5ec5ec90da10288) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/ve.h | 7 + kernel/ve/ve.c | 75 ++ 2 files changed, 82 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 2ab39b607708..44369dddeb24 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -114,6 +114,13 @@ struct ve_struct { struct workqueue_struct *wq; struct work_struct release_agent_work; + + /* +* List of data, private for each root cgroup in +* ve's css_set. +*/ + struct list_headper_cgroot_list; + spinlock_t per_cgroot_list_lock; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 934a5ff1c9bb..a108cb63bc9f 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -30,6 +30,14 @@ #include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */ +struct per_cgroot_data { + struct list_head list; + /* +* data is related to this cgroup +*/ + struct cgroup *cgroot; +}; + extern struct kmapset_set sysfs_ve_perms_set; static struct kmem_cache *ve_cachep; @@ -67,6 +75,9 @@ struct ve_struct ve0 = { .release_list = LIST_HEAD_INIT(ve0.release_list), .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work, cgroup1_release_agent), + .per_cgroot_list= LIST_HEAD_INIT(ve0.per_cgroot_list), + .per_cgroot_list_lock = __SPIN_LOCK_UNLOCKED( + ve0.per_cgroot_list_lock), }; EXPORT_SYMBOL(ve0); @@ -199,6 +210,53 @@ int nr_threads_ve(struct ve_struct *ve) } EXPORT_SYMBOL(nr_threads_ve); +static struct per_cgroot_data *per_cgroot_data_find_locked( + struct list_head *per_cgroot_list, struct cgroup *cgroot) +{ + struct per_cgroot_data *data; + + list_for_each_entry(data, per_cgroot_list, list) { + if (data->cgroot == cgroot) + return data; + } + return NULL; +} + +static inline struct 
per_cgroot_data *per_cgroot_get_or_create( + struct ve_struct *ve, struct cgroup *cgroot) +{ + struct per_cgroot_data *data, *other_data; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + + if (data) + return data; + + data = kzalloc(sizeof(struct per_cgroot_data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + other_data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + + if (other_data) { + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + kfree(data); + return other_data; + } + + data->cgroot = cgroot; + list_add(&data->list, &ve->per_cgroot_list); + + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + return data; +} + struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id) { struct cgroup_subsys_state *css; @@ -533,6 +591,19 @@ static int ve_start_container(struct ve_struct *ve) return err; } +static void ve_per_cgroot_free(struct ve_struct *ve) +{ + struct per_cgroot_data *data, *saved; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) { + list_del_init(&data->list); + kfree(data); + } + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); +} + void ve_stop_ns(struct pid_namespace *pid_ns) { struct ve_struct *ve = current->task_ve; @@ -589,6 +660,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) ve_workqueue_stop(ve); + ve_per_cgroot_free(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -699,6 +772,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ INIT_WORK(&ve->release_agent_work, cgroup1_release_agent); spin_lock_init(&ve->release_list_lock); + spin_lock_init(&ve->per_cgroot_list_lock); ve->_randomize_va_space = ve0._randomize_va_space; @@ -721,6 +795,7 @@ static struct cgro
[Devel] [PATCH VZ8 v2 00/14] Port release_agent virtualization from vz7
This patchset ports the virtualization of the cgroup release_agent from vz7. The major challenges of porting are the differences between the vz7 and vz8 cgroup implementations: - transition of cgroups to kernfs - a slightly changed locking scheme, which relies on css_set_lock in places that previously relied on cgroup_mutex. A small number of patches have been ported without modification, but most of the patches required substantial modification due to the factors described above. v1: - original patchset v2: - removed port of CGRP_REMOVED, because VZ8 uses CSS_ONLINE for the same purpose - changed ve_set(get)_release_agent_path signature to a more optimal one - added ve->is_running check before calling userspace executable Valeriy Vdovin (14): ve/cgroup: unmark ve-root cgroups at container stop cgroup/cfs: added 'activate' option to cgroup_add_file ve/cgroup: implemented per-ve workqueue. cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset. cgroup: exported put_css_set and get_css_set to cgroup.h ve/cgroup: Added ve_owner field to cgroup cgroup/ve: added helper function to get ve-related cgroup paths ve/cgroup: moved release_agent from system_wq to per-ve workqueues ve/cgroup: private per-cgroup-root data container ve/cgroup: set release_agent_path for root cgroups separately ve/cgroup: added release_agent to each container root cgroup. ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots cgroup/ve: pass cgroup_root to ve_set(get)_release_agent cgroup/ve: do not run release_agent on non-running ve include/linux/cgroup-defs.h | 24 +++- include/linux/cgroup.h | 38 - include/linux/ve.h | 28 kernel/cgroup/cgroup-internal.h | 31 +--- kernel/cgroup/cgroup-v1.c | 238 --- kernel/cgroup/cgroup.c | 212 ++-- kernel/ve/ve.c | 242 +++- 7 files changed, 710 insertions(+), 103 deletions(-) -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 12/14] ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots
During container start there might be a situation where not all cgroup hierarchies get virtualized by the container manager (like vzctl). By virtualizing a cgroup hierarchy I mean the creation of a sub-directory within a particular mounted cgroup. When a container starts, it looks in the css_set of its init process to list all affiliated cgroups and performs actions on each. But non-virtualized cgroups will also be present in init's css_set, and they should not be touched from inside any non-root ve. Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- kernel/cgroup/cgroup.c | 17 + 1 file changed, 17 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8aea78f07b5b..75997b503d3c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1927,6 +1927,23 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft, bool activate); #ifdef CONFIG_VE +static inline bool is_virtualized_cgroup(struct cgroup *cgrp) +{ + /* +* no parent means this is the host cgroup +*/ + if (!cgrp->kn->parent) + return false; + + if (cgrp->root->subsys_mask) + return true; + + if (!strcmp(cgrp->root->name, "systemd")) + return true; + + return false; +} + int cgroup_mark_ve_roots(struct ve_struct *ve) { int err; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 06/14] ve/cgroup: Added ve_owner field to cgroup
Each cgroup representing a host or a container root of a cgroup subsystem hierarchy will have this field set to a valid ve_struct that owns this root. This way each cgroup in a system will be able to know its owning VE. Non-root cgroups will have this field set to NULL; this is an optimization for cleanup code: at VE destruction we only need to iterate over all root cgroups to clear the reference to the former owning VE, rather than over the whole cgroup hierarchy. Still, any cgroup that wants to know about its owning VE can find its virtual root cgroup and read its ve_owner field. cgroup->ve_owner is declared as an RCU pointer, because it fits RCU semantics - rare writes/frequent reads. ve_owner will be read from multiple locations in code in further patches and is only rarely set at cgroup_mark_ve_root/cgroup_mount. cgroup_get_ve_owner is a read wrapper for this purpose. (Cherry-picked from eb9c0bfae39fe336173a0dec11bc24f7275de3f8) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 +++ include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 44 + 3 files changed, 48 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 772fcee71f37..a8eb94d2f97f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -526,6 +526,9 @@ struct cgroup { u64 subgroups_limit; + /* ve_owner, responsible for running release agent. 
*/ + struct ve_struct __rcu *ve_owner; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 391702cf43bd..17ee29f4071b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,6 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index de105e651607..beb26dd7cd88 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -303,6 +303,43 @@ bool cgroup_on_dfl(const struct cgroup *cgrp) return cgrp->root == &cgrp_dfl_root; } +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp) +{ + /* +* Find nearest root cgroup, which might be host cgroup root +* or ve cgroup root. +* +* -> local_root +* \^ +* | +* \ | +* ---> from here +*\ +* -> local_root +* \ ^ +* | +* \ | +* --->from here +*/ + + while (cgrp->kn->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags)) + cgrp = cgrp->kn->parent->priv; + + return cgrp; +} + +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp) +{ + struct ve_struct *ve; + /* Caller should hold RCU */ + + cgrp = cgroup_get_local_root(cgrp); + ve = rcu_dereference(cgrp->ve_owner); + if (!ve) + ve = get_ve0(); + return ve; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -1900,6 +1937,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, ve); set_bit(CGRP_VE_ROOT, &cgrp->flags); } link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); @@ -1907,6 +1945,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) 
rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + synchronize_rcu(); } void cgroup_unmark_ve_roots(struct ve_struct *ve) @@ -1924,12 +1963,15 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, NULL); clear_bit(CGRP_VE_ROOT, &cgrp->flags); } unlock: rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + /* ve_owner == NULL will be visible */ + synchronize_rcu(); } struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) @@ -2114,6 +2156,8 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct dentry *dentry; bool new_sb = false; + RCU_INIT_POINTER(root-
[Devel] [PATCH VZ8 v2 05/14] cgroup: exported put_css_set and get_css_set to cgroup.h
(Cherry-picked from 8222bbe47ed1e3824e0890a1404735324189c0cb) Signed-off-by: Valeriy.Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup.h | 28 kernel/cgroup/cgroup-internal.h | 27 --- kernel/cgroup/cgroup.c | 1 + 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 808b31605d07..391702cf43bd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -512,6 +512,34 @@ task_get_css(struct task_struct *task, int subsys_id) return css; } +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* +* Ensure that the refcount doesn't hit zero while any readers +* can see it. Similar to atomic_dec_and_lock(), but for an +* rwlock +*/ + if (refcount_dec_not_one(&cset->refcount)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + refcount_inc(&cset->refcount); +} + + /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index ce1c1553c696..829997989c41 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -159,33 +159,6 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -void put_css_set_locked(struct css_set *cset); - -static inline void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* -* Ensure that the refcount doesn't hit zero while any readers -* can see it. 
Similar to atomic_dec_and_lock(), but for an -* rwlock -*/ - if (refcount_dec_not_one(&cset->refcount)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - refcount_inc(&cset->refcount); -} - bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 946031fcb393..de105e651607 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -58,6 +58,7 @@ #include #include #include +#include #include -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 03/14] ve/cgroup: implemented per-ve workqueue.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from 0293870666c4f96bd56f612d94f560626c76e2fd) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/ve.h | 1 + kernel/ve/ve.c | 25 + 2 files changed, 26 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 103d0a9044fc..d3c1ab840444 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -105,6 +105,7 @@ struct ve_struct { unsigned long aio_nr; unsigned long aio_max_nr; #endif + struct workqueue_struct *wq; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index f7d605357d2e..25455264b225 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -388,6 +388,21 @@ static void ve_set_vdso_time(struct ve_struct *ve, u64 time) *vdso_start_time = time; } +static int ve_workqueue_start(struct ve_struct *ve) +{ + ve->wq = alloc_workqueue("ve_wq_%s", + WQ_SYSFS|WQ_FREEZABLE|WQ_UNBOUND, 8, ve->ve_name); + + if (!ve->wq) + return -ENOMEM; + return 0; +} + +static void ve_workqueue_stop(struct ve_struct *ve) +{ + destroy_workqueue(ve->wq); +} + /* under ve->op_sem write-lock */ static int ve_start_container(struct ve_struct *ve) { @@ -443,6 +458,10 @@ static int ve_start_container(struct ve_struct *ve) if (err) goto err_umh; + err = ve_workqueue_start(ve); + if (err) + goto err_workqueue; + err = ve_hook_iterate_init(VE_SS_CHAIN, ve); if (err < 0) goto err_iterate; @@ -458,6 +477,8 @@ static int ve_start_container(struct ve_struct *ve) return 0; err_iterate: + ve_workqueue_stop(ve); +err_workqueue: ve_stop_umh(ve); err_umh: ve_stop_kthreadd(ve); @@ -523,6 +544,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) cgroup_unmark_ve_roots(ve); + ve_workqueue_stop(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -1363,6 +1386,8 @@ static int __init ve_subsys_init(void) { ve_cachep = KMEM_CACHE_USERCOPY(ve_struct, SLAB_PANIC, core_pattern); list_add(&ve0.ve_list, &ve_list_head); + ve0.wq = alloc_workqueue("ve0_wq", WQ_FREEZABLE|WQ_UNBOUND, 8); + BUG_ON(!ve0.wq); return 0; } late_initcall(ve_subsys_init); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 13/14] cgroup/ve: pass cgroup_root to ve_set(get)_release_agent
Due to the virtualization of the release_agent cgroup property, cgroup1_show_options has become more complex. struct cgroup_root is one of the arguments to that function; it previously held the value of release_agent. But now this property is per-ve AND per-cgroup. That's why, to find the right release_agent value, the code should convert cgroup_root into one specific cgroup that is a 'virtual cgroup root' of a container, represented by the current VE. Getting the ve is trivial, but the cgroup can be found by a helper function that will iterate css_set links under the cgroup_mutex lock. There is a lock inversion problem when using cgroup_mutex in cgroup1_show_options: lockdep shows that cgroup_mutex conflicts with kernfs_node->dep_map. This can be solved easily by converting the per-cgroup data structure in VE into a per-cgroup-root one. This way we can provide ve_set(get)_release_agent_path directly with a struct cgroup_root argument. For each cgroup hierarchy there is only one root, and for each VE there can also be only one virtual root; that's why it is safe to just use cgroup_root as a key to find the proper release_agent path in each VE. 
Signed-off-by: Valeriy Vdovin --- include/linux/ve.h| 8 --- kernel/cgroup/cgroup-v1.c | 44 +++ kernel/cgroup/cgroup.c| 5 +++-- kernel/ve/ve.c| 19 +++-- 4 files changed, 27 insertions(+), 49 deletions(-) diff --git a/include/linux/ve.h b/include/linux/ve.h index 7cef4b39847e..3b487f8a4a50 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -145,12 +145,14 @@ extern int nr_ve; void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); -int ve_set_release_agent_path(struct cgroup *cgroot, +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup_root *cgroot, const char *release_agent); -const char *ve_get_release_agent_path(struct cgroup *cgrp_root); +const char *ve_get_release_agent_path(struct ve_struct *ve, + struct cgroup_root *cgroot); -void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp); +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, + struct cgroup_root *cgrp); extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 46be2f688503..993ac38b895f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -577,7 +577,8 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, } if (root_cgrp->ve_owner) - ret = ve_set_release_agent_path(root_cgrp, strstrip(buf)); + ret = ve_set_release_agent_path(root_cgrp->ve_owner, + root_cgrp->root, strstrip(buf)); else ret = -ENODEV; @@ -598,7 +599,9 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) root_cgrp = cgroup_get_local_root(cgrp); if (root_cgrp->ve_owner) { rcu_read_lock(); - release_agent = ve_get_release_agent_path(root_cgrp); + release_agent = ve_get_release_agent_path( + rcu_dereference(root_cgrp->ve_owner), + root_cgrp->root); if (release_agent) seq_puts(seq, release_agent); @@ -910,7 +913,7 @@ void cgroup1_release_agent(struct work_struct *work) goto 
continue_free; } - release_agent = ve_get_release_agent_path(root_cgrp); + release_agent = ve_get_release_agent_path(ve, root_cgrp->root); *agentbuf = 0; if (release_agent) @@ -931,7 +934,9 @@ void cgroup1_release_agent(struct work_struct *work) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; + mutex_unlock(&cgroup_mutex); + err = call_usermodehelper_ve(ve, argv[0], argv, envp, UMH_WAIT_EXEC); @@ -939,6 +944,7 @@ void cgroup1_release_agent(struct work_struct *work) pr_warn_ratelimited("cgroup1_release_agent " "%s %s failed: %d\n", agentbuf, pathbuf, err); + up_write(&ve->op_sem); mutex_lock(&cgroup_mutex); continue_free: kfree(pathbuf); @@ -989,7 +995,6 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo const char *release_agent; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; - struct cgroup *root_cgrp = &root->cgrp; int ssid; for_each_subsys(ss, ssid) @@ -1003,32 +1008,7 @@ static int cgroup1_show_options(
[Devel] [PATCH VZ8 v2 02/14] cgroup/cfs: added 'activate' option to cgroup_add_file
In kernfs files get created in 'deactivated' state, which means they are not visible. Add option to activate the file after creation immediately making it visible in the parent directory. Will be used in later patches. Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- kernel/cgroup/cgroup.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b71d4ccb2f0c..946031fcb393 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3823,7 +3823,7 @@ static void cgroup_file_notify_timer(struct timer_list *timer) } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - struct cftype *cft) + struct cftype *cft, bool activate) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3865,6 +3865,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn_link)) return PTR_ERR(kn_link); } + if (activate) + kernfs_activate(kn); return 0; } @@ -3902,7 +3904,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { - ret = cgroup_add_file(css, cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft, false); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v2 01/14] ve/cgroup: unmark ve-root cgroups at container stop
fixes: 915a1130c7ee4ffb6de3f69a5bd98c5ee42a723f Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai Cherry-picked from 5dceccf5dd794673ebb1b0e6840d96aa654ec33e) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 23 +++ kernel/ve/ve.c | 3 +++ 3 files changed, 27 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4f0dd51338bf..c0a42c3d43fa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -867,6 +867,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); +void cgroup_unmark_ve_roots(struct ve_struct *ve); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0335a07f64e6..b71d4ccb2f0c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1908,6 +1908,29 @@ void cgroup_mark_ve_root(struct ve_struct *ve) spin_unlock_irq(&css_set_lock); } +void cgroup_unmark_ve_roots(struct ve_struct *ve) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + + rcu_read_lock(); + cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset; + if (WARN_ON(!cset)) + goto unlock; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + cgrp = link->cgrp; + clear_bit(CGRP_VE_ROOT, &cgrp->flags); + } +unlock: + rcu_read_unlock(); + + spin_unlock_irq(&css_set_lock); +} + struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) { struct cgroup *ve_root = NULL; diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b83b2b66a875..f7d605357d2e 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -520,6 +520,9 @@ void ve_exit_ns(struct pid_namespace *pid_ns) */ if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) goto unlock; + + cgroup_unmark_ve_roots(ve); + /* * At this point all userspace tasks in container are dead. 
*/ -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [VZ7 PATCH 2/2] fs/direct-io.c: keep dio_warn_stale_pagecache() when CONFIG_BLOCK=n
This helper prints a warning if a direct I/O write failed to invalidate the cache, and sets EIO on the inode to warn userspace about possible data corruption. See also commit 5a9d929d6e13 ("iomap: report collisions between directio and buffered writes to userspace"). Direct I/O is supported by non-disk filesystems, for example NFS. Thus generic code needs this even in a kernel without CONFIG_BLOCK. Link: http://lkml.kernel.org/r/157270038074.4812.798085554455740.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry-picked from a92853b6746fe5ffef20a7c30addf6320561e669) https://jira.sw.ru/browse/PSBM-124609 Signed-off-by: Valeriy Vdovin --- fs/direct-io.c | 21 - include/linux/fs.h | 6 +- mm/filemap.c | 21 + 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 886989d..be44dcf 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -256,27 +256,6 @@ static void dio_iodone2_helper(struct dio *dio, loff_t offset, } } -/* - * Warn about a page cache invalidation failure during a direct io write. - */ -void dio_warn_stale_pagecache(struct file *filp) -{ - static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); - char pathname[128]; - struct inode *inode = file_inode(filp); - char *path; - - errseq_set(&inode->i_mapping->wb_err, -EIO); - if (__ratelimit(&_rs)) { - path = d_path(&filp->f_path, pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); - pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, - current->comm); - } -} - /** * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation diff --git a/include/linux/fs.h b/include/linux/fs.h index bc5417f..521ff1a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3316,7 +3316,6 @@ enum { }; void dio_end_io(struct bio *bio, int error); -void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, loff_t offset, @@ -3361,6 +3360,11 @@ static inline void inode_dio_end(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } +/* + * Warn about a page cache invalidation failure diring a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp); + extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); diff --git a/mm/filemap.c b/mm/filemap.c index 585c57e..2d35844 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2794,6 +2794,27 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(pagecache_write_end); +/* + * Warn about a page cache invalidation failure during a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = d_path(&filp->f_path, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, loff_t pos, loff_t *ppos, size_t count) -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [VZ7 PATCH 1/2] iomap: report collisions between directio and buffered writes to userspace
If two programs simultaneously try to write to the same part of a file via direct IO and buffered IO, there's a chance that the post-diowrite pagecache invalidation will fail on the dirty page. When this happens, the dio write succeeded, which means that the page cache is no longer coherent with the disk! Programs are not supposed to mix IO types and this is a clear case of data corruption, so store an EIO which will be reflected to userspace during the next fsync. Replace the WARN_ON with a ratelimited pr_crit so that the developers have /some/ kind of breadcrumb to track down the offending program(s) and file(s) involved. Signed-off-by: Darrick J. Wong Reviewed-by: Liu Bo (cherry-picked from 5a9d929d6e13278df62bd9e3d3ceae8c87ad1eea) file_path changed to d_path https://jira.sw.ru/browse/PSBM-124609 Signed-off-by: Valeriy Vdovin --- fs/direct-io.c | 24 +++- include/linux/fs.h | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index f5fd6ff..886989d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -256,6 +256,27 @@ static void dio_iodone2_helper(struct dio *dio, loff_t offset, } } +/* + * Warn about a page cache invalidation failure during a direct io write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = d_path(&filp->f_path, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + /** * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation @@ -312,7 +333,8 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, (offset + ret - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); + if (err) + dio_warn_stale_pagecache(dio->iocb->ki_filp); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index aee8adf..bc5417f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3316,6 +3316,7 @@ enum { }; void dio_end_io(struct bio *bio, int error); +void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, loff_t offset, -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [VZ7 PATCH] cgroup/ve: at container start only check virtualizable cgroups.
fixes 105332edc47ce43b9321983249417512f70906ce The above commit prevented the situation where a task tried to start a container without first creating the right cgroup context for it. The logic behind that check was: - there is a set of cgroups that will be virtualized during container start. - for that, these cgroups will be modified. - the cgroups that will be chosen for modification are in the starting task's css set. - it is invalid and forbidden to modify cgroups that are located in the root of each cgroup hierarchy. - therefore we have to check the whole css set to see if it has cgroups with no parent (indication of root) and forbid the whole procedure if at least one cgroup matches. The bug in this behaviour was: - there are cases when there are non-virtualizable cgroup mounts. - these are named cgroups which do not have any cgroup subsystems bound to them. - there is one exception, which is the named cgroup "systemd". - therefore container starters do not have to make nested cgroups for these types of non-virtualizable cgroup hierarchies. - therefore there can be named cgroups with parent == NULL in the css set of a starting task; they will not pass the check, and container start will fail. We fix the bug by only checking those cgroups in the css set that are virtualizable. We already have the check helper that is used a bit later in cgroup_mark_ve_roots, so let's use it. 
https://jira.sw.ru/browse/PSBM-125040 Signed-off-by: Valeriy Vdovin --- kernel/cgroup.c | 30 ++ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 85d281e..b6408e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -658,6 +658,19 @@ static struct cgroup *css_cgroup_from_root(struct css_set *css_set, return res; } +#ifdef CONFIG_VE +static inline bool is_virtualized_cgroup(struct cgroup *cgrp) +{ + lockdep_assert_held(&cgroup_mutex); + if (cgrp->root->subsys_mask) + return true; + + if (!strcmp(cgrp->root->name, "systemd")) + return true; + + return false; +} + /* * Iterate all cgroups in a given css_set and check if it is a top cgroup * of it's hierarchy. @@ -674,6 +687,9 @@ static inline bool css_has_host_cgroups(struct css_set *css_set) if (link->cgrp->root == &rootnode) continue; + if (!is_virtualized_cgroup(link->cgrp)) + continue; + if (!link->cgrp->parent) { read_unlock(&css_set_lock); return true; @@ -682,6 +698,8 @@ static inline bool css_has_host_cgroups(struct css_set *css_set) read_unlock(&css_set_lock); return false; } +#endif + /* * Return the cgroup for "task" from the given hierarchy. Must be @@ -4628,18 +4646,6 @@ static struct cftype *get_cftype_by_name(const char *name) } #ifdef CONFIG_VE -static inline bool is_virtualized_cgroup(struct cgroup *cgrp) -{ - lockdep_assert_held(&cgroup_mutex); - if (cgrp->root->subsys_mask) - return true; - - if (!strcmp(cgrp->root->name, "systemd")) - return true; - - return false; -} - int cgroup_mark_ve_roots(struct ve_struct *ve) { struct cgroup *cgrp, *tmp; -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] cgroup/ve: at container start only check virtualizable cgroups.
fixes 105332edc47ce43b9321983249417512f70906ce The above commit prevented the situation where a task tried to start a container without first creating the right cgroup context for it. The logic behind that check was: - there is a set of cgroups that will be virtualized during container start. - for that, these cgroups will be modified. - the cgroups that will be chosen for modification are in the starting task's css set. - it is invalid and forbidden to modify cgroups that are located in the root of each cgroup hierarchy. - therefore we have to check the whole css set to see if it has cgroups with no parent (indication of root) and forbid the whole procedure if at least one cgroup matches. The bug in this behaviour was: - there are cases when there are non-virtualizable cgroup mounts. - these are named cgroups which do not have any cgroup subsystems bound to them. - there is one exception, which is the named cgroup "systemd". - therefore container starters do not have to make nested cgroups for these types of non-virtualizable cgroup hierarchies. - therefore there can be named cgroups with parent == NULL in the css set of a starting task; they will not pass the check, and container start will fail. We fix the bug by only checking those cgroups in the css set that are virtualizable. We already have the check helper that is used a bit later in cgroup_mark_ve_roots, so let's use it. 
Signed-off-by: Valeriy Vdovin --- kernel/cgroup.c | 30 ++ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 85d281e..b6408e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -658,6 +658,19 @@ static struct cgroup *css_cgroup_from_root(struct css_set *css_set, return res; } +#ifdef CONFIG_VE +static inline bool is_virtualized_cgroup(struct cgroup *cgrp) +{ + lockdep_assert_held(&cgroup_mutex); + if (cgrp->root->subsys_mask) + return true; + + if (!strcmp(cgrp->root->name, "systemd")) + return true; + + return false; +} + /* * Iterate all cgroups in a given css_set and check if it is a top cgroup * of it's hierarchy. @@ -674,6 +687,9 @@ static inline bool css_has_host_cgroups(struct css_set *css_set) if (link->cgrp->root == &rootnode) continue; + if (!is_virtualized_cgroup(link->cgrp)) + continue; + if (!link->cgrp->parent) { read_unlock(&css_set_lock); return true; @@ -682,6 +698,8 @@ static inline bool css_has_host_cgroups(struct css_set *css_set) read_unlock(&css_set_lock); return false; } +#endif + /* * Return the cgroup for "task" from the given hierarchy. Must be @@ -4628,18 +4646,6 @@ static struct cftype *get_cftype_by_name(const char *name) } #ifdef CONFIG_VE -static inline bool is_virtualized_cgroup(struct cgroup *cgrp) -{ - lockdep_assert_held(&cgroup_mutex); - if (cgrp->root->subsys_mask) - return true; - - if (!strcmp(cgrp->root->name, "systemd")) - return true; - - return false; -} - int cgroup_mark_ve_roots(struct ve_struct *ve) { struct cgroup *cgrp, *tmp; -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 14/14] cgroup: relaxed lockdep assertion for cset_cgroup_from_root
Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup-v1.c | 3 ++- kernel/cgroup/cgroup.c | 15 +-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 112bd917e99d..e1ce16e77bb4 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -161,7 +161,8 @@ static inline bool notify_on_release(const struct cgroup *cgrp) } struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroup_root *root); + struct cgroup_root *root, + bool assert_locks_strict); bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 46be2f688503..57d36cf69aea 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1022,7 +1022,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo spin_lock_irq(&css_set_lock); cset = ve_ns->cgroup_ns->root_cset; BUG_ON(!cset); - root_cgrp = cset_cgroup_from_root(cset, root); + root_cgrp = cset_cgroup_from_root(cset, root, + false); spin_unlock_irq(&css_set_lock); } } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 34e049361611..aac04a729535 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1437,11 +1437,13 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) /* look up cgroup associated with given css_set on the specified hierarchy */ struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroup_root *root) + struct cgroup_root *root, + bool assert_locks_strict) { struct cgroup *res = NULL; - lockdep_assert_held(&cgroup_mutex); + if (assert_locks_strict) + lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); if (cset == &init_css_set) { @@ -1476,7 +1478,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * No need to lock the task - since we hold css_set_lock the 
* task can't change groups. */ - return cset_cgroup_from_root(task_css_set(task), root); + return cset_cgroup_from_root(task_css_set(task), root, true); } /* @@ -2284,7 +2286,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); - cgrp = cset_cgroup_from_root(ns->root_cset, root); + cgrp = cset_cgroup_from_root(ns->root_cset, root, true); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); @@ -2380,7 +2382,8 @@ static struct file_system_type cgroup2_fs_type = { int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { - struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root, + true); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } @@ -2749,7 +2752,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root, true); if (!list_empty(&src_cset->mg_preload_node)) return; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 13/14] ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots
During container start there might be a situation when not all cgroup hierarchies get virtualized by container manager (like vzctl). By virtualizing a cgroup hierarchy I mean creation of sub-directory within a particular mounted cgroup. When container starts it looks in css set of its init process to list all affiliated cgroups and perform actions on each. But non-virtualized cgroups will also be present in init's css_set and they should not be touched from inside of any non root ve. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 17 + 1 file changed, 17 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 285e84d1150f..34e049361611 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1927,6 +1927,23 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft, bool activate); #ifdef CONFIG_VE +static inline bool is_virtualized_cgroup(struct cgroup *cgrp) +{ + /* +* no parent means this is the host cgroup +*/ + if (!cgrp->kn->parent) + return false; + + if (cgrp->root->subsys_mask) + return true; + + if (!strcmp(cgrp->root->name, "systemd")) + return true; + + return false; +} + int cgroup_mark_ve_roots(struct ve_struct *ve) { int err; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 12/14] ve/cgroup: added release_agent to each container root cgroup.
Each container will now have access to its own cgroup release_agent file. Creation: Normally all cgroup files are created during a call to cgroup_create by cgroup_populate_dir function. It either creates or does not create all cgroup files once and they immediately become visible to userspace as filesystem objects. Due to specifics of container creation process, it is not possible to use the same code for 'release_agent' file creation. For VE to start operating, first a list of ordinary cgroups is created for each subsystem, then the set of newly created cgroups is converted to "virtual roots", so at the time when cgroup_create is executed, there is no knowledge of whether or not "release_agent" file should be created. This information only comes at "conversion" step which is 'cgroup_mark_ve_roots' function. As the file is created dynamically in a live cgroup, a rather delicate locking sequence is present in the new code: - each new "virtual root" cgroup will have to add "release_agent" file, thus each cgroup's directory would need to be locked during the insertion time by cgroup->dentry->d_inode->i_mutex. - d_inode->i_mutex has an ordering dependency with cgroup_mutex (see cgroup_mount/cgroup_remount). They can not be locked in order {lock(cgroup_mutex), lock(inode->i_mutex)}. - to collect a list of cgroups, that need to become virtual we need cgroup_mutex lock to iterate active roots. - to overcome the above conflict we first need to collect a list of all virtual cgroups under cgroup_mutex lock, then release it and after that to insert "release_agent" to each root under inode->i_mutex lock. - to collect a list of cgroups on stack we utilize cgroup->cft_q_node, made specially for that purpose under its own cgroup_cft_mutex. Destruction: Destruction is done in reverse from the above within cgroup_unmark_ve_roots. After file destruction we must prevent further write operations to this file in case when someone has opened this file prior to VE and cgroup destruction. 
This is achieved by checking if cgroup in the argument to cgroup_file_write function has features of host or virtual root. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ cgroup: add missing dput() in cgroup_unmark_ve_roots() cgroup_unmark_ve_roots() calls dget() on cgroup's dentry but don't have the corresponding dput() call. This leads to leaking cgroups. Add missing dput() to fix this. https://jira.sw.ru/browse/PSBM-107328 Fixes: 1ac69e183447 ("ve/cgroup: added release_agent to each container root cgroup.") (Cherry-picked from 4a1635024df1bae4f4809a3bc445f0cf64d4acf4) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 3 + include/linux/cgroup.h | 2 +- include/linux/ve.h | 4 +- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 37 - kernel/cgroup/cgroup.c | 128 ++-- kernel/ve/ve.c | 42 --- 7 files changed, 179 insertions(+), 38 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4404862c1eaf..c7ce5e84f610 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -453,6 +453,9 @@ struct cgroup { */ struct list_head cset_links; + /* Used for cgroup_mark/umark ve */ + struct list_head cft_q_node; + /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. 
Protected by diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0920b2ffb15b..b846617b0a53 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,7 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup1_release_agent(struct work_struct *work); #ifdef CONFIG_VE -extern void cgroup_mark_ve_root(struct ve_struct *ve); +int cgroup_mark_ve_roots(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif diff --git a/include/linux/ve.h b/include/linux/ve.h index 65c19f2b9b98..7cef4b39847e 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -145,11 +145,13 @@ extern int nr_ve; void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, +int ve_set_release_agent_path(struct cgroup *cgroot, const char *release_agent); const char *ve_get_release_agent_path(struct cgroup *cgrp_root); +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp); + extern struct ve_struct *get_ve(struct ve_struct *ve); exter
[Devel] [PATCH VZ8 v1 11/14] ve/cgroup: set release_agent_path for root cgroups separately
This is done so that each container could set its own release agent. Release agent information is now stored in per-cgroup-root data structure in ve. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ ve/cgroup: change resource release order in ve_drop_context This fixes 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c In the mentioned patch in cgroup_show_options ve->ve_ns is checked to ensure that ve->root_css_set is usable. But in ve_drop_context root_css_set is being released before ve_ns, which is a bug. root_css_set will now be set to NULL after ve_ns is released. This reordering only affects the described piece of code in cgroup_show_options. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ cgroup: do not use cgroup_mutex in cgroup_show_options In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options started to lock cgroup_mutex, which introduced new deadlock possibility, described below: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) Clearly cgroup_mutex can not be locked right after locking namespace_sem, because opposite locking order is also present in the code and should be removed from cgroup_show_options. After reviewing cgroup_show_options, it was established that cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp. It was used in combination with a call to task_cgroup_from_root to ensure that root_cgrp lived long enough to access its value of release_agent path. But in this function we know that root_cgrp is part of ve->root_css_set, which holds reference to it. 
In turn root_css_set is referenced while ve->ve_ns is not NULL, the check of which we already have in the code. This means that root_cgrp is valid until ve->ve_ns is valid. ve->ve_ns is valid until the point of rcu_synchronize in ve_drop_context, that's why rcu_read_lock should be maintained all the time when root_cgrp is being accessed. The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls to this function pass ve->root_css_set as an argument and the above logic applies. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ ve: cleanup in function ve_get_release_agent_path (Cherry-picked from f1199bd9589b7c0914343dcc72f49ddaa9b98496) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 -- include/linux/ve.h | 6 +++ kernel/cgroup/cgroup-internal.h | 4 +- kernel/cgroup/cgroup-v1.c | 86 ++--- kernel/cgroup/cgroup.c | 9 ++-- kernel/ve/ve.c | 76 + 6 files changed, 150 insertions(+), 34 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index d6fe95320819..4404862c1eaf 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -571,9 +571,6 @@ struct cgroup_root { /* IDs for cgroups in this hierarchy */ struct idr cgroup_idr; - /* The path to use for release notifications. 
*/ - char release_agent_path[PATH_MAX]; - /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; diff --git a/include/linux/ve.h b/include/linux/ve.h index 44369dddeb24..65c19f2b9b98 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -144,6 +144,12 @@ extern int nr_ve; #ifdef CONFIG_VE void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); + +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, + const char *release_agent); + +const char *ve_get_release_agent_path(struct cgroup *cgrp_root); + extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 4de66630d456..be0cd157d4dc 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -160,6 +160,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root); + bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); @@ -
[Devel] [PATCH VZ8 v1 10/14] ve/cgroup: private per-cgroup-root data container
As long as each ve is internally attached to a particular css_set via it's init_task, it's good to have container with parameters, which are common to each cgroup subsystem hierarchy, rooting from it's virtual root. (Cherry-picked from 4a98f07102fd248ad4218a07b5ec5ec90da10288) Signed-off-by: Valeriy Vdovin --- include/linux/ve.h | 7 + kernel/ve/ve.c | 75 ++ 2 files changed, 82 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 2ab39b607708..44369dddeb24 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -114,6 +114,13 @@ struct ve_struct { struct workqueue_struct *wq; struct work_struct release_agent_work; + + /* +* List of data, private for each root cgroup in +* ve's css_set. +*/ + struct list_headper_cgroot_list; + spinlock_t per_cgroot_list_lock; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 008e372756f9..738f8b6465d4 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -30,6 +30,14 @@ #include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */ +struct per_cgroot_data { + struct list_head list; + /* +* data is related to this cgroup +*/ + struct cgroup *cgroot; +}; + extern struct kmapset_set sysfs_ve_perms_set; static struct kmem_cache *ve_cachep; @@ -67,6 +75,9 @@ struct ve_struct ve0 = { .release_list = LIST_HEAD_INIT(ve0.release_list), .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work, cgroup1_release_agent), + .per_cgroot_list= LIST_HEAD_INIT(ve0.per_cgroot_list), + .per_cgroot_list_lock = __SPIN_LOCK_UNLOCKED( + ve0.per_cgroot_list_lock), }; EXPORT_SYMBOL(ve0); @@ -199,6 +210,53 @@ int nr_threads_ve(struct ve_struct *ve) } EXPORT_SYMBOL(nr_threads_ve); +static struct per_cgroot_data *per_cgroot_data_find_locked( + struct list_head *per_cgroot_list, struct cgroup *cgroot) +{ + struct per_cgroot_data *data; + + list_for_each_entry(data, per_cgroot_list, list) { + if (data->cgroot == cgroot) + return data; + } + return NULL; +} + +static inline struct per_cgroot_data 
*per_cgroot_get_or_create( + struct ve_struct *ve, struct cgroup *cgroot) +{ + struct per_cgroot_data *data, *other_data; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + + if (data) + return data; + + data = kzalloc(sizeof(struct per_cgroot_data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + other_data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + + if (other_data) { + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + kfree(data); + return other_data; + } + + data->cgroot = cgroot; + list_add(&data->list, &ve->per_cgroot_list); + + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + return data; +} + struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id) { struct cgroup_subsys_state *css; @@ -533,6 +591,19 @@ static int ve_start_container(struct ve_struct *ve) return err; } +static void ve_per_cgroot_free(struct ve_struct *ve) +{ + struct per_cgroot_data *data, *saved; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) { + list_del_init(&data->list); + kfree(data); + } + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); +} + void ve_stop_ns(struct pid_namespace *pid_ns) { struct ve_struct *ve = current->task_ve; @@ -589,6 +660,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) ve_workqueue_stop(ve); + ve_per_cgroot_free(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -699,6 +772,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ INIT_WORK(&ve->release_agent_work, cgroup1_release_agent); spin_lock_init(&ve->release_list_lock); + spin_lock_init(&ve->per_cgroot_list_lock); ve->_randomize_va_space = ve0._randomize_va_space; @@ -721,6 +795,7 @@ static struct cgroup_subsys_state *ve_create(st
[Devel] [PATCH VZ8 v1 09/14] ve/cgroup: moved release_agent from system_wq to per-ve workqueues
Each VE should execute release agent notifications within it's own workqueue. This way we achieve a more fine-grained control over release_agent work flushing at VE destruction. (Cherry-picked from 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 10 ++- include/linux/cgroup.h | 2 + include/linux/ve.h | 10 +++ kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 109 kernel/cgroup/cgroup.c | 12 +++- kernel/ve/ve.c | 48 ++ 7 files changed, 159 insertions(+), 33 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e497387872f4..d6fe95320819 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -453,6 +453,13 @@ struct cgroup { */ struct list_head cset_links; + /* +* Linked list running through all cgroups that can +* potentially be reaped by the release agent. Protected by +* release_list_lock +*/ + struct list_head release_list; + /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with @@ -490,9 +497,6 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used to schedule release agent */ - struct work_struct release_agent_work; - /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index df9a9a09ce2a..0920b2ffb15b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -897,6 +897,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup1_release_agent(struct work_struct *work); + #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); diff --git a/include/linux/ve.h b/include/linux/ve.h index d3c1ab840444..2ab39b607708 100644 --- a/include/linux/ve.h +++ 
b/include/linux/ve.h @@ -105,7 +105,15 @@ struct ve_struct { unsigned long aio_nr; unsigned long aio_max_nr; #endif + /* +* cgroups, that want to notify about becoming +* empty, are linked to this release_list. +*/ + struct list_headrelease_list; + spinlock_t release_list_lock; + struct workqueue_struct *wq; + struct work_struct release_agent_work; }; struct ve_devmnt { @@ -127,6 +135,8 @@ extern int nr_ve; (ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN)) #ifdef CONFIG_VE +void ve_add_to_release_list(struct cgroup *cgrp); +void ve_rm_from_release_list(struct cgroup *cgrp); extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 829997989c41..4de66630d456 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -135,6 +135,7 @@ extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; extern struct file_system_type cgroup_fs_type; +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp); /* iterate across the hierarchies */ #define for_each_root(root)\ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 21a7c36fbf44..c1891317ae3a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -784,7 +784,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); + ve_add_to_release_list(cgrp); } /* @@ -822,42 +822,95 @@ static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, */ void cgroup1_release_agent(struct work_struct *work) { - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; + struct ve_struct *ve; + unsigned long flags; + 
char *agentbuf; + + agentbuf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!agentbuf) { + pr_warn("failed to allocate agentbuf\n"); + return; + } + ve = container_of(work, struct ve_struct, release_agent_work); mutex_lock(&cgroup_mutex); + spin_lock_irqsave(&ve->release_list_lock,
[Devel] [PATCH VZ8 v1 08/14] cgroup/ve: added helper function to get ve-related cgroup paths
This will make fake-absolute paths to support virtual ve roots in cgroup hierarchies. The path will be used in subsequent patches. Signed-off-by: Valeriy.Vdovin --- kernel/cgroup/cgroup-v1.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fb06fc9d96ca..21a7c36fbf44 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -787,6 +787,16 @@ void cgroup1_check_for_release(struct cgroup *cgrp) schedule_work(&cgrp->release_agent_work); } +/* + * Used to get a fake-absolute path to a cgroup on kernfs filesystem, but it + * actually be relative to cgroup root, provided in the argument. + */ +static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, + struct cgroup *cgrp, char *buf, size_t buflen) +{ + return kernfs_path_from_node(cgrp->kn, ve_root_cgrp->kn, buf, buflen); +} + /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 07/14] ve/cgroup: Added ve_owner field to cgroup
Each cgroup representing a host or a container root of cgroup subsystem hierarchy will have this field set to a valid ve_struct, that owns this root. This way each cgroup in a system will be able to know its owning VE. Non root cgroups will have this field set to NULL, this is an optimization for cleanup code: at VE destruction we only need to iterate over all root cgroups to clean reference to former owning VE, rather than over all cgroup hierarchy. Still any cgroup that wants to know about its owning VE can find its virtual root cgroup and read its ve_owner field. cgroup->ve_owner is declared as RCU pointer, because it fits RCU semantics - rare writes/often reads. ve_owner will be read from multiple locations in code in further patches and is only rarely set at cgroup_mark_ve_root/cgroup_mount. cgroup_get_ve_owner is a read wrapper for this purpose. (Cherry-picked from eb9c0bfae39fe336173a0dec11bc24f7275de3f8) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 3 +++ include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 44 + 3 files changed, 48 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4fd566414274..e497387872f4 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -528,6 +528,9 @@ struct cgroup { u64 subgroups_limit; + /* ve_owner, responsible for running release agent. 
*/ + struct ve_struct __rcu *ve_owner; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a1e6822bca8f..df9a9a09ce2a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,6 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 53a349539256..ff0a803c3aad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -303,6 +303,43 @@ bool cgroup_on_dfl(const struct cgroup *cgrp) return cgrp->root == &cgrp_dfl_root; } +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp) +{ + /* +* Find nearest root cgroup, which might be host cgroup root +* or ve cgroup root. +* +* -> local_root +* \^ +* | +* \ | +* ---> from here +*\ +* -> local_root +* \ ^ +* | +* \ | +* --->from here +*/ + + while (cgrp->kn->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags)) + cgrp = cgrp->kn->parent->priv; + + return cgrp; +} + +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp) +{ + struct ve_struct *ve; + /* Caller should hold RCU */ + + cgrp = cgroup_get_local_root(cgrp); + ve = rcu_dereference(cgrp->ve_owner); + if (!ve) + ve = get_ve0(); + return ve; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -1900,6 +1937,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, ve); set_bit(CGRP_VE_ROOT, &cgrp->flags); } link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); @@ -1907,6 +1945,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) 
rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + synchronize_rcu(); } void cgroup_unmark_ve_roots(struct ve_struct *ve) @@ -1924,6 +1963,7 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, NULL); clear_bit(CGRP_VE_ROOT, &cgrp->flags); } link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); @@ -1931,6 +1971,8 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + /* ve_owner == NULL will be visible */ + synchronize_rcu(); } struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) @@ -2115,6 +2157,8 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
[Devel] [PATCH VZ8 v1 06/14] cgroup: exported put_css_set and get_css_set to cgroup.h
(Cherry-picked from 8222bbe47ed1e3824e0890a1404735324189c0cb) Signed-off-by: Valeriy.Vdovin --- include/linux/cgroup.h | 28 kernel/cgroup/cgroup-internal.h | 27 --- kernel/cgroup/cgroup.c | 1 + 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 76e38ac6..a1e6822bca8f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -512,6 +512,34 @@ task_get_css(struct task_struct *task, int subsys_id) return css; } +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* +* Ensure that the refcount doesn't hit zero while any readers +* can see it. Similar to atomic_dec_and_lock(), but for an +* rwlock +*/ + if (refcount_dec_not_one(&cset->refcount)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + refcount_inc(&cset->refcount); +} + + /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index ce1c1553c696..829997989c41 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -159,33 +159,6 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -void put_css_set_locked(struct css_set *cset); - -static inline void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* -* Ensure that the refcount doesn't hit zero while any readers -* can see it. 
Similar to atomic_dec_and_lock(), but for an -* rwlock -*/ - if (refcount_dec_not_one(&cset->refcount)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - refcount_inc(&cset->refcount); -} - bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 447c8f003496..53a349539256 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -58,6 +58,7 @@ #include #include #include +#include #include -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 05/14] cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from e828803a5d776125c9c329f194aff74fb4ec181a) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 5 + include/linux/cgroup.h | 4 kernel/cgroup/cgroup-v1.c | 15 +++ 3 files changed, 24 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5ee5f10e3de7..4fd566414274 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -362,6 +362,11 @@ struct cgroup_freezer_state { int nr_frozen_tasks; }; +struct cgroup_rcu_string { + struct rcu_head rcu_head; + char val[]; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index dfd9460986ee..76e38ac6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -689,6 +689,10 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen); + +struct cgroup_rcu_string; + +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index db10a1ed282a..fb06fc9d96ca 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -231,6 +231,21 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) kfree(tofree); } +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len) +{ + struct cgroup_rcu_string *result; + size_t buflen = len + 1; + + result = kmalloc(sizeof(*result) + buflen, GFP_KERNEL); + if (!result) + return ERR_PTR(-ENOMEM); + if (strlcpy(result->val, str, buflen) >= buflen) { + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } + return result; +} + /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. 
-- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 04/14] ve/cgroup: implemented per-ve workqueue.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from 0293870666c4f96bd56f612d94f560626c76e2fd) Signed-off-by: Valeriy Vdovin --- include/linux/ve.h | 1 + kernel/ve/ve.c | 25 + 2 files changed, 26 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 103d0a9044fc..d3c1ab840444 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -105,6 +105,7 @@ struct ve_struct { unsigned long aio_nr; unsigned long aio_max_nr; #endif + struct workqueue_struct *wq; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index f7d605357d2e..25455264b225 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -388,6 +388,21 @@ static void ve_set_vdso_time(struct ve_struct *ve, u64 time) *vdso_start_time = time; } +static int ve_workqueue_start(struct ve_struct *ve) +{ + ve->wq = alloc_workqueue("ve_wq_%s", + WQ_SYSFS|WQ_FREEZABLE|WQ_UNBOUND, 8, ve->ve_name); + + if (!ve->wq) + return -ENOMEM; + return 0; +} + +static void ve_workqueue_stop(struct ve_struct *ve) +{ + destroy_workqueue(ve->wq); +} + /* under ve->op_sem write-lock */ static int ve_start_container(struct ve_struct *ve) { @@ -443,6 +458,10 @@ static int ve_start_container(struct ve_struct *ve) if (err) goto err_umh; + err = ve_workqueue_start(ve); + if (err) + goto err_workqueue; + err = ve_hook_iterate_init(VE_SS_CHAIN, ve); if (err < 0) goto err_iterate; @@ -458,6 +477,8 @@ static int ve_start_container(struct ve_struct *ve) return 0; err_iterate: + ve_workqueue_stop(ve); +err_workqueue: ve_stop_umh(ve); err_umh: ve_stop_kthreadd(ve); @@ -523,6 +544,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) cgroup_unmark_ve_roots(ve); + ve_workqueue_stop(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -1363,6 +1386,8 @@ static int __init ve_subsys_init(void) { ve_cachep = KMEM_CACHE_USERCOPY(ve_struct, SLAB_PANIC, core_pattern); list_add(&ve0.ve_list, &ve_list_head); + ve0.wq = alloc_workqueue("ve0_wq", WQ_FREEZABLE|WQ_UNBOUND, 8); + BUG_ON(!ve0.wq); return 0; } late_initcall(ve_subsys_init); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 03/14] cgroup: port CGROUP_REMOVED flag from vz7
The flag will be used in subsequent patches Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 2 ++ include/linux/cgroup.h | 5 + kernel/cgroup/cgroup.c | 1 + 3 files changed, 8 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a3b309ab1a90..5ee5f10e3de7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -57,6 +57,8 @@ enum { /* bits in struct cgroup flags field */ enum { + /* Control Cgroup is dead */ + CGRP_REMOVED, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0a42c3d43fa..dfd9460986ee 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -922,6 +922,11 @@ static inline bool cgroup_task_frozen(struct task_struct *task) return task->frozen; } +static inline int cgroup_is_removed(const struct cgroup *cgrp) +{ + return test_bit(CGRP_REMOVED, &cgrp->flags); +} + #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 797a3971ab46..447c8f003496 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5562,6 +5562,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); + set_bit(CGRP_REMOVED, &cgrp->flags); cgroup1_check_for_release(parent); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 02/14] cgroup/cfs: added 'activate' option to cgroup_add_file
In kernfs files get created in 'deactivated' state, which means they are not visible. Add option to activate the file after creation immediately making it visible in the parent directory. Will be used in later patches. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4488df184235..797a3971ab46 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3824,7 +3824,7 @@ static void cgroup_file_notify_timer(struct timer_list *timer) } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - struct cftype *cft) + struct cftype *cft, bool activate) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3866,6 +3866,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn_link)) return PTR_ERR(kn_link); } + if (activate) + kernfs_activate(kn); return 0; } @@ -3903,7 +3905,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { - ret = cgroup_add_file(css, cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft, false); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 01/14] ve/cgroup: unmark ve-root cgroups at container stop
fixes: 915a1130c7ee4ffb6de3f69a5bd98c5ee42a723f Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai Cherry-picked from 5dceccf5dd794673ebb1b0e6840d96aa654ec33e) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 24 kernel/ve/ve.c | 3 +++ 3 files changed, 28 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4f0dd51338bf..c0a42c3d43fa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -867,6 +867,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); +void cgroup_unmark_ve_roots(struct ve_struct *ve); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0335a07f64e6..4488df184235 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1908,6 +1908,30 @@ void cgroup_mark_ve_root(struct ve_struct *ve) spin_unlock_irq(&css_set_lock); } +void cgroup_unmark_ve_roots(struct ve_struct *ve) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + + rcu_read_lock(); + cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset; + if (WARN_ON(!cset)) + goto unlock; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + cgrp = link->cgrp; + clear_bit(CGRP_VE_ROOT, &cgrp->flags); + } + link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); +unlock: + rcu_read_unlock(); + + spin_unlock_irq(&css_set_lock); +} + struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) { struct cgroup *ve_root = NULL; diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b83b2b66a875..f7d605357d2e 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -520,6 +520,9 @@ void ve_exit_ns(struct pid_namespace *pid_ns) */ if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) goto unlock; + + cgroup_unmark_ve_roots(ve); + /* * At this point all userspace tasks in container are dead. 
*/ -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v1 00/14] Port release_agent virtualization from vz7
This patchset ports the virtualization of the cgroup release_agent from vz7. The major challenges of porting are the differences between the vz7 and vz8 cgroup implementations: - transition of cgroups to kernfs - slightly changed locking scheme, which relies on css_set_lock in places that previously relied on cgroup_mutex. There is a small number of patches that have been ported without modifications, but most of the patches required substantial modification due to the factors described above. Valeriy Vdovin (14): ve/cgroup: unmark ve-root cgroups at container stop cgroup/cfs: added 'activate' option to cgroup_add_file cgroup: port CGROUP_REMOVED flag from vz7 ve/cgroup: implemented per-ve workqueue. cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset. cgroup: exported put_css_set and get_css_set to cgroup.h ve/cgroup: Added ve_owner field to cgroup cgroup/ve: added helper function to get ve-related cgroup paths ve/cgroup: moved release_agent from system_wq to per-ve workqueues ve/cgroup: private per-cgroup-root data container ve/cgroup: set release_agent_path for root cgroups separately ve/cgroup: added release_agent to each container root cgroup. ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots cgroup: relaxed lockdep assertion for cset_cgroup_from_root include/linux/cgroup-defs.h | 26 +++- include/linux/cgroup.h | 43 +- include/linux/ve.h | 26 kernel/cgroup/cgroup-internal.h | 32 +--- kernel/cgroup/cgroup-v1.c | 252 +--- kernel/cgroup/cgroup.c | 227 +--- kernel/ve/ve.c | 247 ++- 7 files changed, 744 insertions(+), 109 deletions(-) -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] scripts/checkpatch.pl: fixed regression after COPYING was renamed
fixes: 90e3b8f80b22002418d3056438a82837769c4691 checkpatch.pl first checks that the script is run from the top of the kernel tree. It does so by calling the top_of_kernel_tree function, which has a hardcoded set of files whose presence is enough to identify the top of the tree. The troublesome patch renames COPYING by adding a version suffix (COPYING-4.18.0), and so the check fails. Signed-off-by: Valeriy Vdovin --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4d80526..cb9cddd 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1069,7 +1069,7 @@ sub top_of_kernel_tree { my ($root) = @_; my @tree_check = ( - "COPYING", "CREDITS", "Kbuild", "MAINTAINERS", "Makefile", + "COPYING-4.18.0", "CREDITS", "Kbuild", "MAINTAINERS", "Makefile", "README", "Documentation", "arch", "include", "drivers", "fs", "init", "ipc", "kernel", "lib", "scripts", ); -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 14/14] cgroup: relaxed lockdep assertion for cset_cgroup_from_root
Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 14 -- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 112bd917e99d..e1ce16e77bb4 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -161,7 +161,8 @@ static inline bool notify_on_release(const struct cgroup *cgrp) } struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroup_root *root); + struct cgroup_root *root, + bool assert_locks_strict); bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 70a282932f44..eda3a1f12230 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1021,7 +1021,7 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo spin_lock_irq(&css_set_lock); cset = ve_ns->cgroup_ns->root_cset; BUG_ON(!cset); - root_cgrp = cset_cgroup_from_root(cset, root); + root_cgrp = cset_cgroup_from_root(cset, root, + false); spin_unlock_irq(&css_set_lock); } } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 34e049361611..219b3cfce41a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1437,11 +1437,13 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) /* look up cgroup associated with given css_set on the specified hierarchy */ struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroup_root *root) + struct cgroup_root *root, + bool assert_locks_strict) { struct cgroup *res = NULL; - lockdep_assert_held(&cgroup_mutex); + if (assert_locks_strict) + lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); if (cset == &init_css_set) { @@ -1476,7 +1478,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * No need to lock the task - since we hold css_set_lock the * 
task can't change groups. */ - return cset_cgroup_from_root(task_css_set(task), root); + return cset_cgroup_from_root(task_css_set(task), root, true); } /* @@ -2284,7 +2286,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); - cgrp = cset_cgroup_from_root(ns->root_cset, root); + cgrp = cset_cgroup_from_root(ns->root_cset, root, true); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); @@ -2380,7 +2382,7 @@ static struct file_system_type cgroup2_fs_type = { int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { - struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root, + true); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } @@ -2749,7 +2751,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root, true); if (!list_empty(&src_cset->mg_preload_node)) return; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 13/14] ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots
During container start there might be a situation when not all cgroup hierarchies get virtualized by the container manager (like vzctl). By virtualizing a cgroup hierarchy I mean the creation of a sub-directory within a particular mounted cgroup. When a container starts, it looks in the css set of its init process to list all affiliated cgroups and perform actions on each. But non-virtualized cgroups will also be present in init's css_set and they should not be touched from inside of any non-root ve. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 17 + 1 file changed, 17 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 285e84d1150f..34e049361611 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1927,6 +1927,23 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft, bool activate); #ifdef CONFIG_VE +static inline bool is_virtualized_cgroup(struct cgroup *cgrp) +{ + /* +* no parent means this is the host cgroup +*/ + if (!cgrp->kn->parent) + return false; + + if (cgrp->root->subsys_mask) + return true; + + if (!strcmp(cgrp->root->name, "systemd")) + return true; + + return false; +} + int cgroup_mark_ve_roots(struct ve_struct *ve) { int err; -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 12/14] ve/cgroup: added release_agent to each container root cgroup.
Each container will now have access to its own cgroup release_agent file. Creation: Normally all cgroup files are created during a call to cgroup_create by cgroup_populate_dir function. It either creates or does not create all cgroup files once and they immediately become visible to userspace as filesystem objects. Due to specifics of container creation process, it is not possible to use the same code for 'release_agent' file creation. For VE to start operating, first a list of ordinary cgroups is being created for each subsystem, then the set of newly created cgroups is converted to "virtual roots", so at the time when cgroup_create is executed, there is no knowledge of whether or not "release_agent" file should be created. This information only comes at "conversion" step which is 'cgroup_mark_ve_roots' function. As the file is created dynamically in a live cgroup, a rather delicate locking sequence is present in the new code: - each new "virtual root" cgroup will have to add "release_agent" file, thus each cgroup's directory would need to be locked during the insertion time by cgroup->dentry->d_inode->i_mutex. - d_inode->i_mutex has an ordering dependency with cgroup_mutex (see cgroup_mount/cgroup_remount). They can not be locked in order {lock(cgroup_mutex), lock(inode->i_mutex)}. - to collect a list of cgroups, that need to become virtual we need cgroup_mutex lock to iterate active roots. - to overcome the above conflict we first need to collect a list of all virtual cgroups under cgroup_mutex lock, then release it and after that to insert "release_agent" to each root under inode->i_mutex lock. - to collect a list of cgroups on stack we utilize cgroup->cft_q_node, made specially for that purpose under its own cgroup_cft_mutex. Destruction: Destruction is done in reverse from the above within cgroup_unmark_ve_roots. After file destruction we must prevent further write operations to this file in case when someone has opened this file prior to VE and cgroup destruction.
This is achieved by checking if cgroup in the argument to cgroup_file_write function has features of host or virtual root. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ cgroup: add missing dput() in cgroup_unmark_ve_roots() cgroup_unmark_ve_roots() calls dget() on cgroup's dentry but don't have the corresponding dput() call. This leads to leaking cgroups. Add missing dput() to fix this. https://jira.sw.ru/browse/PSBM-107328 Fixes: 1ac69e183447 ("ve/cgroup: added release_agent to each container root cgroup.") (Cherry-picked from 4a1635024df1bae4f4809a3bc445f0cf64d4acf4) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 3 + include/linux/cgroup.h | 2 +- include/linux/ve.h | 4 +- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 36 - kernel/cgroup/cgroup.c | 128 ++-- kernel/ve/ve.c | 42 --- 7 files changed, 178 insertions(+), 38 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f1bd203321cb..652615e9a963 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -453,6 +453,9 @@ struct cgroup { */ struct list_head cset_links; + /* Used for cgroup_mark/umark ve */ + struct list_head cft_q_node; + /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. 
Protected by diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0920b2ffb15b..b846617b0a53 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,7 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup1_release_agent(struct work_struct *work); #ifdef CONFIG_VE -extern void cgroup_mark_ve_root(struct ve_struct *ve); +int cgroup_mark_ve_roots(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif diff --git a/include/linux/ve.h b/include/linux/ve.h index f5e5bbac3e2c..33413958d255 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -140,11 +140,13 @@ extern int nr_ve; void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, +int ve_set_release_agent_path(struct cgroup *cgroot, const char *release_agent); const char *ve_get_release_agent_path(struct cgroup *cgrp_root); +void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp); + extern struct ve_struct *get_ve(struct ve_struct *ve); exter
[Devel] [PATCH VZ8 v0 11/14] ve/cgroup: set release_agent_path for root cgroups separately
This is done so that each container could set its own release agent. Release agent information is now stored in per-cgroup-root data structure in ve. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin +++ ve/cgroup: change resource release order in ve_drop_context This fixes 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c In the mentioned patch in cgroup_show_options ve->ve_ns is checked to ensure that ve->root_css_set is usable. But in ve_drop_context root_css_set is being released before ve_ns, which is a bug. root_css_set will now be set to NULL after ve_ns is released. This reordering only affects the described piece of code in cgroup_show_options. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ cgroup: do not use cgroup_mutex in cgroup_show_options In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options started to lock cgroup_mutex, which introduced a new deadlock possibility, described below: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) Clearly cgroup_mutex can not be locked right after locking namespace_sem, because the opposite locking order is also present in the code; the cgroup_mutex locking should therefore be removed from cgroup_show_options. After reviewing cgroup_show_options, it was established that cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp. It was used in combination with a call to task_cgroup_from_root to ensure that root_cgrp lived long enough to access its value of release_agent path. But in this function we know that root_cgrp is part of ve->root_css_set, which holds a reference to it.
In turn root_css_set is referenced while ve->ve_ns is not NULL, the check of which we already have in the code. This means that root_cgrp is valid until ve->ve_ns is valid. ve->ve_ns is valid until the point of rcu_synchronize in ve_drop_context, that's why rcu_read_lock should be maintained all the time when root_cgrp is being accessed. The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls to this function pass ve->root_css_set as an argument and the above logic applies. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai +++ ve: cleanup in function ve_get_release_agent_path (Cherry-picked from f1199bd9589b7c0914343dcc72f49ddaa9b98496) Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup-defs.h | 3 -- include/linux/ve.h | 6 +++ kernel/cgroup/cgroup-internal.h | 4 +- kernel/cgroup/cgroup-v1.c | 85 ++--- kernel/cgroup/cgroup.c | 9 ++-- kernel/ve/ve.c | 76 + 6 files changed, 149 insertions(+), 34 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index d883922fb045..f1bd203321cb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -571,9 +571,6 @@ struct cgroup_root { /* IDs for cgroups in this hierarchy */ struct idr cgroup_idr; - /* The path to use for release notifications. 
*/ - char release_agent_path[PATH_MAX]; - /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; diff --git a/include/linux/ve.h b/include/linux/ve.h index 42cced6b67c5..f5e5bbac3e2c 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -139,6 +139,12 @@ extern int nr_ve; #ifdef CONFIG_VE void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); + +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, + const char *release_agent); + +const char *ve_get_release_agent_path(struct cgroup *cgrp_root); + extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 4de66630d456..be0cd157d4dc 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -160,6 +160,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root); + bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); @@ -
[Devel] [PATCH VZ8 v0 10/14] ve/cgroup: private per-cgroup-root data container
As long as each ve is internally attached to a particular css_set via it's init_task, it's good to have container with parameters, which are common to each cgroup subsystem hierarchy, rooting from it's virtual root. (Cherry-picked from 4a98f07102fd248ad4218a07b5ec5ec90da10288) Signed-off-by: Valeriy Vdovin --- include/linux/ve.h | 7 + kernel/ve/ve.c | 75 ++ 2 files changed, 82 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 5629f4363394..42cced6b67c5 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -109,6 +109,13 @@ struct ve_struct { struct workqueue_struct *wq; struct work_struct release_agent_work; + + /* +* List of data, private for each root cgroup in +* ve's css_set. +*/ + struct list_headper_cgroot_list; + spinlock_t per_cgroot_list_lock; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 36922b322d3f..c746205f7442 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -29,6 +29,14 @@ #include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */ +struct per_cgroot_data { + struct list_head list; + /* +* data is related to this cgroup +*/ + struct cgroup *cgroot; +}; + extern struct kmapset_set sysfs_ve_perms_set; static struct kmem_cache *ve_cachep; @@ -66,6 +74,9 @@ struct ve_struct ve0 = { .release_list = LIST_HEAD_INIT(ve0.release_list), .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work, cgroup1_release_agent), + .per_cgroot_list= LIST_HEAD_INIT(ve0.per_cgroot_list), + .per_cgroot_list_lock = __SPIN_LOCK_UNLOCKED( + ve0.per_cgroot_list_lock), }; EXPORT_SYMBOL(ve0); @@ -198,6 +209,53 @@ int nr_threads_ve(struct ve_struct *ve) } EXPORT_SYMBOL(nr_threads_ve); +static struct per_cgroot_data *per_cgroot_data_find_locked( + struct list_head *per_cgroot_list, struct cgroup *cgroot) +{ + struct per_cgroot_data *data; + + list_for_each_entry(data, per_cgroot_list, list) { + if (data->cgroot == cgroot) + return data; + } + return NULL; +} + +static inline struct per_cgroot_data 
*per_cgroot_get_or_create( + struct ve_struct *ve, struct cgroup *cgroot) +{ + struct per_cgroot_data *data, *other_data; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + + if (data) + return data; + + data = kzalloc(sizeof(struct per_cgroot_data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + other_data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + + if (other_data) { + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + kfree(data); + return other_data; + } + + data->cgroot = cgroot; + list_add(&data->list, &ve->per_cgroot_list); + + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); + return data; +} + struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id) { struct cgroup_subsys_state *css; @@ -532,6 +590,19 @@ static int ve_start_container(struct ve_struct *ve) return err; } +static void ve_per_cgroot_free(struct ve_struct *ve) +{ + struct per_cgroot_data *data, *saved; + unsigned long flags; + + spin_lock_irqsave(&ve->per_cgroot_list_lock, flags); + list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) { + list_del_init(&data->list); + kfree(data); + } + spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags); +} + void ve_stop_ns(struct pid_namespace *pid_ns) { struct ve_struct *ve = current->task_ve; @@ -588,6 +659,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) ve_workqueue_stop(ve); + ve_per_cgroot_free(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -698,6 +771,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ INIT_WORK(&ve->release_agent_work, cgroup1_release_agent); spin_lock_init(&ve->release_list_lock); + spin_lock_init(&ve->per_cgroot_list_lock); ve->_randomize_va_space = ve0._randomize_va_space; @@ -720,6 +794,7 @@ static struct cgroup_subsys_state *ve_create(st
[Devel] [PATCH VZ8 v0 09/14] ve/cgroup: moved release_agent from system_wq to per-ve workqueues
Each VE should execute release agent notifications within it's own workqueue. This way we achieve a more fine-grained control over release_agent work flushing at VE destruction. (Cherry-picked from 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 10 ++- include/linux/cgroup.h | 2 + include/linux/ve.h | 10 +++ kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 109 kernel/cgroup/cgroup.c | 12 +++- kernel/ve/ve.c | 48 ++ 7 files changed, 159 insertions(+), 33 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e497387872f4..d883922fb045 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -453,6 +453,13 @@ struct cgroup { */ struct list_head cset_links; + /* +* Linked list running through all cgroups that can +* potentially be reaped by the release agent. Protected by +* release_list_lock +*/ + struct list_head release_list; + /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with @@ -490,9 +497,6 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used to schedule release agent */ - struct work_struct release_agent_work; - /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index df9a9a09ce2a..0920b2ffb15b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -897,6 +897,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup1_release_agent(struct work_struct *work); + #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); diff --git a/include/linux/ve.h b/include/linux/ve.h index 2c45f47b9f92..5629f4363394 100644 --- a/include/linux/ve.h +++ 
b/include/linux/ve.h @@ -100,7 +100,15 @@ struct ve_struct { struct list_headdevmnt_list; struct mutexdevmnt_mutex; + /* +* cgroups, that want to notify about becoming +* empty, are linked to this release_list. +*/ + struct list_headrelease_list; + spinlock_t release_list_lock; + struct workqueue_struct *wq; + struct work_struct release_agent_work; }; struct ve_devmnt { @@ -122,6 +130,8 @@ extern int nr_ve; (ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN)) #ifdef CONFIG_VE +void ve_add_to_release_list(struct cgroup *cgrp); +void ve_rm_from_release_list(struct cgroup *cgrp); extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 829997989c41..4de66630d456 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -135,6 +135,7 @@ extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; extern struct file_system_type cgroup_fs_type; +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp); /* iterate across the hierarchies */ #define for_each_root(root)\ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c2d59fd926be..9a23d8163c2b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -784,7 +784,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); + ve_add_to_release_list(cgrp); } /* @@ -822,42 +822,95 @@ static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, struct cg */ void cgroup1_release_agent(struct work_struct *work) { - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; + struct ve_struct *ve; + unsigned 
long flags; + char *agentbuf; + + agentbuf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!agentbuf) { + pr_warn("failed to allocate agentbuf\n"); + return; + } + ve = container_of(work, struct ve_struct, release_agent_work); mutex_lock(&cgroup_mutex); + spin_lock_irqsave(&v
[Devel] [PATCH VZ8 v0 08/14] cgroup/ve: added cgroup_path_ve_relative function to get ve-related cgroup paths
This will make fake-absolute paths to support virtual ve roots in cgroup hierarchies. The path will be used in subsequent patches. Signed-off-by: Valeriy.Vdovin --- kernel/cgroup/cgroup-v1.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fb06fc9d96ca..c2d59fd926be 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -787,6 +787,16 @@ void cgroup1_check_for_release(struct cgroup *cgrp) schedule_work(&cgrp->release_agent_work); } +/* + * Used to get a fake-absolute path to a cgroup on kernfs filesystem, but it + * actually be relative to cgroup root, provided in the argument. + */ +static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, + struct cgroup *cgrp, char *buf, size_t buflen) +{ + return kernfs_path_from_node(cgrp->kn, ve_root_cgrp->kn, buf, buflen); +} + /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 07/14] ve/cgroup: Added ve_owner field to cgroup
Each cgroup representing a host or a container root of cgroup subsystem hierarchy will have this field set to a valid ve_struct, that owns this root. This way each cgroup in a system will be able to know its owning VE. Non root cgroups will have this field set to NULL, this is an optimization for cleanup code: at VE destruction we only need to iterate over all root cgroups to clean reference to former owning VE, rather than over all cgroup hierarchy. Still any cgroup that wants to know about its owning VE can find its virtual root cgroup and read its ve_owner field. cgroup->ve_owner is declared as RCU pointer, because it fits RCU semantics - rare writes, frequent reads. ve_owner will be read from multiple locations in code in further patches and is only rarely set at cgroup_mark_ve_root/cgroup_mount. cgroup_get_ve_owner is a read wrapper for this purpose. (Cherry-picked from eb9c0bfae39fe336173a0dec11bc24f7275de3f8) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 3 +++ include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 44 + 3 files changed, 48 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4fd566414274..e497387872f4 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -528,6 +528,9 @@ struct cgroup { u64 subgroups_limit; + /* ve_owner, responsible for running release agent. 
*/ + struct ve_struct __rcu *ve_owner; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a1e6822bca8f..df9a9a09ce2a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -900,6 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 53a349539256..ff0a803c3aad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -303,6 +303,43 @@ bool cgroup_on_dfl(const struct cgroup *cgrp) return cgrp->root == &cgrp_dfl_root; } +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp) +{ + /* +* Find nearest root cgroup, which might be host cgroup root +* or ve cgroup root. +* +* -> local_root +* \^ +* | +* \ | +* ---> from here +*\ +* -> local_root +* \ ^ +* | +* \ | +* --->from here +*/ + + while (cgrp->kn->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags)) + cgrp = cgrp->kn->parent->priv; + + return cgrp; +} + +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp) +{ + struct ve_struct *ve; + /* Caller should hold RCU */ + + cgrp = cgroup_get_local_root(cgrp); + ve = rcu_dereference(cgrp->ve_owner); + if (!ve) + ve = get_ve0(); + return ve; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -1900,6 +1937,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, ve); set_bit(CGRP_VE_ROOT, &cgrp->flags); } link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); @@ -1907,6 +1945,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) 
rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + synchronize_rcu(); } void cgroup_unmark_ve_roots(struct ve_struct *ve) @@ -1924,6 +1963,7 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { cgrp = link->cgrp; + rcu_assign_pointer(cgrp->ve_owner, NULL); clear_bit(CGRP_VE_ROOT, &cgrp->flags); } link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); @@ -1931,6 +1971,8 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) rcu_read_unlock(); spin_unlock_irq(&css_set_lock); + /* ve_owner == NULL will be visible */ + synchronize_rcu(); } struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) @@ -2115,6 +2157,8 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
[Devel] [PATCH VZ8 v0 06/14] cgroup: exported put_css_set and get_css_set to cgroup.h
(Cherry-picked from 8222bbe47ed1e3824e0890a1404735324189c0cb) Signed-off-by: Valeriy.Vdovin --- include/linux/cgroup.h | 28 kernel/cgroup/cgroup-internal.h | 27 --- kernel/cgroup/cgroup.c | 1 + 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 76e38ac6..a1e6822bca8f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -512,6 +512,34 @@ task_get_css(struct task_struct *task, int subsys_id) return css; } +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* +* Ensure that the refcount doesn't hit zero while any readers +* can see it. Similar to atomic_dec_and_lock(), but for an +* rwlock +*/ + if (refcount_dec_not_one(&cset->refcount)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + refcount_inc(&cset->refcount); +} + + /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index ce1c1553c696..829997989c41 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -159,33 +159,6 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -void put_css_set_locked(struct css_set *cset); - -static inline void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* -* Ensure that the refcount doesn't hit zero while any readers -* can see it. 
Similar to atomic_dec_and_lock(), but for an -* rwlock -*/ - if (refcount_dec_not_one(&cset->refcount)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - refcount_inc(&cset->refcount); -} - bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); bool cgroup_is_thread_root(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 447c8f003496..53a349539256 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -58,6 +58,7 @@ #include #include #include +#include #include -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 05/14] cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from e828803a5d776125c9c329f194aff74fb4ec181a) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 5 + include/linux/cgroup.h | 4 kernel/cgroup/cgroup-v1.c | 15 +++ 3 files changed, 24 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5ee5f10e3de7..4fd566414274 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -362,6 +362,11 @@ struct cgroup_freezer_state { int nr_frozen_tasks; }; +struct cgroup_rcu_string { + struct rcu_head rcu_head; + char val[]; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index dfd9460986ee..76e38ac6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -689,6 +689,10 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen); + +struct cgroup_rcu_string; + +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index db10a1ed282a..fb06fc9d96ca 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -231,6 +231,21 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) kfree(tofree); } +struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len) +{ + struct cgroup_rcu_string *result; + size_t buflen = len + 1; + + result = kmalloc(sizeof(*result) + buflen, GFP_KERNEL); + if (!result) + return ERR_PTR(-ENOMEM); + if (strlcpy(result->val, str, buflen) >= buflen) { + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } + return result; +} + /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. 
-- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 04/14] ve/cgroup: implemented per-ve workqueue.
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai (Cherry-picked from 0293870666c4f96bd56f612d94f560626c76e2fd) Signed-off-by: Valeriy Vdovin --- include/linux/ve.h | 2 ++ kernel/ve/ve.c | 25 + 2 files changed, 27 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index ab8da4dceec1..2c45f47b9f92 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -99,6 +99,8 @@ struct ve_struct { struct list_headdevmnt_list; struct mutexdevmnt_mutex; + + struct workqueue_struct *wq; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b922653acf49..7eaf787f3a3d 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -387,6 +387,21 @@ static void ve_set_vdso_time(struct ve_struct *ve, u64 time) *vdso_start_time = time; } +static int ve_workqueue_start(struct ve_struct *ve) +{ + ve->wq = alloc_workqueue("ve_wq_%s", + WQ_SYSFS|WQ_FREEZABLE|WQ_UNBOUND, 8, ve->ve_name); + + if (!ve->wq) + return -ENOMEM; + return 0; +} + +static void ve_workqueue_stop(struct ve_struct *ve) +{ + destroy_workqueue(ve->wq); +} + /* under ve->op_sem write-lock */ static int ve_start_container(struct ve_struct *ve) { @@ -442,6 +457,10 @@ static int ve_start_container(struct ve_struct *ve) if (err) goto err_umh; + err = ve_workqueue_start(ve); + if (err) + goto err_workqueue; + err = ve_hook_iterate_init(VE_SS_CHAIN, ve); if (err < 0) goto err_iterate; @@ -457,6 +476,8 @@ static int ve_start_container(struct ve_struct *ve) return 0; err_iterate: + ve_workqueue_stop(ve); +err_workqueue: ve_stop_umh(ve); err_umh: ve_stop_kthreadd(ve); @@ -522,6 +543,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) cgroup_unmark_ve_roots(ve); + ve_workqueue_stop(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -1354,6 +1377,8 @@ static int __init ve_subsys_init(void) { ve_cachep = KMEM_CACHE_USERCOPY(ve_struct, SLAB_PANIC, core_pattern); list_add(&ve0.ve_list, &ve_list_head); + ve0.wq = alloc_workqueue("ve0_wq", WQ_FREEZABLE|WQ_UNBOUND, 8); + BUG_ON(!ve0.wq); return 0; } late_initcall(ve_subsys_init); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 03/14] cgroup: port CGROUP_REMOVED flag from vz7
The flag will be used in subsequent patches Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 2 ++ include/linux/cgroup.h | 5 + kernel/cgroup/cgroup.c | 1 + 3 files changed, 8 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a3b309ab1a90..5ee5f10e3de7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -57,6 +57,8 @@ enum { /* bits in struct cgroup flags field */ enum { + /* Control Cgroup is dead */ + CGRP_REMOVED, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0a42c3d43fa..dfd9460986ee 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -922,6 +922,11 @@ static inline bool cgroup_task_frozen(struct task_struct *task) return task->frozen; } +static inline int cgroup_is_removed(const struct cgroup *cgrp) +{ + return test_bit(CGRP_REMOVED, &cgrp->flags); +} + #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 797a3971ab46..447c8f003496 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5562,6 +5562,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); + set_bit(CGRP_REMOVED, &cgrp->flags); cgroup1_check_for_release(parent); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 02/14] cgroup/cfs: added 'activate' option to cgroup_add_file
In kernfs files get created in 'deactivated' state, which means they are not visible. Add option to activate the file after creation immediately making it visible in the parent directory. Will be used in later patches. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4488df184235..797a3971ab46 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3824,7 +3824,7 @@ static void cgroup_file_notify_timer(struct timer_list *timer) } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - struct cftype *cft) + struct cftype *cft, bool activate) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3866,6 +3866,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn_link)) return PTR_ERR(kn_link); } + if (activate) + kernfs_activate(kn); return 0; } @@ -3903,7 +3905,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { - ret = cgroup_add_file(css, cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft, false); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 00/14] Port release_agent virtualization from vz7
This patchset ports cgroup release_agent virtualization from vz7. The major challenges of porting are the differences between the vz7 and vz8 cgroup implementations: - transition of cgroups to kernfs - a slightly changed locking scheme, which relies on css_set_lock in places that previously relied on cgroup_mutex. A small number of patches have been ported without modification, but most of the patches required substantial modification due to the factors described above. Valeriy Vdovin (14): ve/cgroup: unmark ve-root cgroups at container stop cgroup/cfs: added 'activate' option to cgroup_add_file cgroup: port CGROUP_REMOVED flag from vz7 ve/cgroup: implemented per-ve workqueue. cgroup: added rcu node string wrapper for in-cgroup usage. This will be used in further patches in same patchset. cgroup: exported put_css_set and get_css_set to cgroup.h ve/cgroup: Added ve_owner field to cgroup cgroup/ve: added cgroup_path_ve_relative function to get ve-related cgroup paths ve/cgroup: moved release_agent from system_wq to per-ve workqueues ve/cgroup: private per-cgroup-root data container ve/cgroup: set release_agent_path for root cgroups separately ve/cgroup: added release_agent to each container root cgroup. ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots cgroup: relaxed lockdep assertion for cset_cgroup_from_root include/linux/cgroup-defs.h | 26 +++- include/linux/cgroup.h | 43 +- include/linux/ve.h | 27 kernel/cgroup/cgroup-internal.h | 32 +--- kernel/cgroup/cgroup-v1.c | 249 +--- kernel/cgroup/cgroup.c | 226 ++--- kernel/ve/ve.c | 247 ++- 7 files changed, 741 insertions(+), 109 deletions(-) -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH VZ8 v0 01/14] ve/cgroup: unmark ve-root cgroups at container stop
fixes: 915a1130c7ee4ffb6de3f69a5bd98c5ee42a723f Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai Cherry-picked from 5dceccf5dd794673ebb1b0e6840d96aa654ec33e) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 24 kernel/ve/ve.c | 3 +++ 3 files changed, 28 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4f0dd51338bf..c0a42c3d43fa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -867,6 +867,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); +void cgroup_unmark_ve_roots(struct ve_struct *ve); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0335a07f64e6..4488df184235 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1908,6 +1908,30 @@ void cgroup_mark_ve_root(struct ve_struct *ve) spin_unlock_irq(&css_set_lock); } +void cgroup_unmark_ve_roots(struct ve_struct *ve) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + + rcu_read_lock(); + cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset; + if (WARN_ON(!cset)) + goto unlock; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + cgrp = link->cgrp; + clear_bit(CGRP_VE_ROOT, &cgrp->flags); + } + link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); +unlock: + rcu_read_unlock(); + + spin_unlock_irq(&css_set_lock); +} + struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) { struct cgroup *ve_root = NULL; diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 3f53641455ad..b922653acf49 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -519,6 +519,9 @@ void ve_exit_ns(struct pid_namespace *pid_ns) */ if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) goto unlock; + + cgroup_unmark_ve_roots(ve); + /* * At this point all userspace tasks in container are dead. 
*/ -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [VZ8 PATCH 2/3] cgroup/cfs: added 'activate' option to cgroup_add_file
In kernfs files get created in 'deactivated' state, which means they are not visible. Add option to activate the file after creation immediately making it visible in the parent directory. Will be used in later patches. Signed-off-by: Valeriy Vdovin --- kernel/cgroup/cgroup.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4488df184235..797a3971ab46 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3824,7 +3824,7 @@ static void cgroup_file_notify_timer(struct timer_list *timer) } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - struct cftype *cft) + struct cftype *cft, bool activate) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3866,6 +3866,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn_link)) return PTR_ERR(kn_link); } + if (activate) + kernfs_activate(kn); return 0; } @@ -3903,7 +3905,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { - ret = cgroup_add_file(css, cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft, false); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [VZ8 PATCH 3/3] cgroup: port CGROUP_REMOVED flag from vz7
The flag will be used in subsequent porting patches for release_agent functionality Signed-off-by: Valeriy Vdovin --- include/linux/cgroup-defs.h | 2 ++ include/linux/cgroup.h | 5 + kernel/cgroup/cgroup.c | 1 + 3 files changed, 8 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a3b309ab1a90..5ee5f10e3de7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -57,6 +57,8 @@ enum { /* bits in struct cgroup flags field */ enum { + /* Control Cgroup is dead */ + CGRP_REMOVED, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0a42c3d43fa..dfd9460986ee 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -922,6 +922,11 @@ static inline bool cgroup_task_frozen(struct task_struct *task) return task->frozen; } +static inline int cgroup_is_removed(const struct cgroup *cgrp) +{ + return test_bit(CGRP_REMOVED, &cgrp->flags); +} + #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 797a3971ab46..447c8f003496 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5562,6 +5562,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); + set_bit(CGRP_REMOVED, &cgrp->flags); cgroup1_check_for_release(parent); -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [VZ8 PATCH 1/3] ve/cgroup: unmark ve-root cgroups at container stop
fixes: 915a1130c7ee4ffb6de3f69a5bd98c5ee42a723f Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai Cherry-picked from 5dceccf5dd794673ebb1b0e6840d96aa654ec33e) Signed-off-by: Valeriy Vdovin --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 24 kernel/ve/ve.c | 3 +++ 3 files changed, 28 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4f0dd51338bf..c0a42c3d43fa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -867,6 +867,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); +void cgroup_unmark_ve_roots(struct ve_struct *ve); #endif #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0335a07f64e6..4488df184235 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1908,6 +1908,30 @@ void cgroup_mark_ve_root(struct ve_struct *ve) spin_unlock_irq(&css_set_lock); } +void cgroup_unmark_ve_roots(struct ve_struct *ve) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + + rcu_read_lock(); + cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset; + if (WARN_ON(!cset)) + goto unlock; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + cgrp = link->cgrp; + clear_bit(CGRP_VE_ROOT, &cgrp->flags); + } + link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); +unlock: + rcu_read_unlock(); + + spin_unlock_irq(&css_set_lock); +} + struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) { struct cgroup *ve_root = NULL; diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 3f53641455ad..b922653acf49 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -519,6 +519,9 @@ void ve_exit_ns(struct pid_namespace *pid_ns) */ if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) goto unlock; + + cgroup_unmark_ve_roots(ve); + /* * At this point all userspace tasks in container are dead. 
*/ -- 2.27.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7] ve/cgroup: At container start check ve's css_set for host-level cgroups.
cgroup_mark_ve_roots is not protected against cases when a container is started in an invalid cgroup set configuration. The official supported way of doing that from cgroups point of view is as follows: 1. Create a child cgroup in "ve" cgroup hierarchy. 2. Along with "ve" create child cgroups in all other major cgroup subsystems, mounted on the system (cpuset, blkio, etc). 3. Create a child cgroup in a special cgroup hierarchy named "systemd". 4. Add a task, that will start a container to each of the newly created cgroups from above. 5. Now this task should write "START" to "ve.state" property of the relevant ve cgroup. From userspace it's possible to ignore the supported way and proceed to starting a container skipping steps 2-4. In kernel code, this results in ve receiving a root css_set which includes host-level cgroups, which in turn leads to a variety of problems like trying to add a "release_agent" file to a host-level cgroup which already has one, as well as trying to remove it from host-level cgroup at container stop. Prior to performing actions on cgroups, we should first run a quick check that none of the host-level cgroups are present in the ve's css_set. In the check while iterating ve's css_set we skip rootnode cgroup because it's a special case cgroup that is present in each css_set and will always give a false positive. https://jira.sw.ru/browse/PSBM-123506 Signed-off-by: Valeriy Vdovin --- kernel/cgroup.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c7ed3c2..78fddc6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -659,6 +659,31 @@ static struct cgroup *css_cgroup_from_root(struct css_set *css_set, } /* + * Iterate all cgroups in a given css_set and check if it is a top cgroup + * of it's hierarchy. + * rootnode should be ignored as it is always present in each css set as + * a placeholder for any unmounted subsystem and will give false positive. 
+ */ +static inline bool css_has_host_cgroups(struct css_set *css_set) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + + list_for_each_entry(link, &css_set->cg_links, cg_link_list) { + if (link->cgrp->root == &rootnode) + continue; + + if (!link->cgrp->parent) { + read_unlock(&css_set_lock); + return true; + } + } + read_unlock(&css_set_lock); + return false; +} + +/* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex held. */ @@ -4637,6 +4662,19 @@ int cgroup_mark_ve_roots(struct ve_struct *ve) mutex_lock(&cgroup_cft_mutex); mutex_lock(&cgroup_mutex); + + /* +* Return early if we know that this procedure will fail due to +* existing root cgroups which are not allowed to be root's in ve's +* context. This is for the case when some task wants to start VE +* without adding itself to all virtualized subgroups (+systemd) first. +*/ + if (css_has_host_cgroups(ve->root_css_set)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_cft_mutex); + return -EINVAL; + } + for_each_active_root(root) { cgrp = css_cgroup_from_root(ve->root_css_set, root); -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh8 v3 2/2] ve/proc: Added separate start time field to task_struct to show in container
On Mon, Dec 21, 2020 at 06:09:56PM +0300, Konstantin Khorenko wrote: > From: Valeriy Vdovin > > Introduced 'real_start_time_ct' field in task_struct. > > The value is READ: > 1. When the process lives inside of a ve group and any process > inside of the same ve group wants to know it's start time by reading > it's /proc/[pid]/stat file. > 2. At container suspend operation to store this value to a dump image. > > The value is WRITTEN: > 1. At creation time (copy_process function) > 1.1. If a process is being created outside of ve group / on host, then > this value is initialized to 0 > 1.2. If a process is being created by process already living in ve > group, this value is calculated as host_uptime - ve_uptime. > > 2. During attach to ve. (ve_attach function). The process can be created on > a host and later attached to ve. It's container's start_time value has been > already initialized to 0 at creation time. After the process enters the > domain of a ve, the value should be initialized. > Note that the process can be attached to a non-running container, in which > case it's start_time value should not be calculated and left initialized to > 0. > > 3. At container restore via prctl (prctl_set_task_ct_fields function). > In this case the value is only settable outside of a container. > During restore the processes would be created from the dump image. > At restore step each process will execute prctl to set it's start_time > value, read from the dump. This would only be permitted during > pseudosuper ve mode. The value is set as is (read from the dump), without > any calculations. 
> > https://jira.sw.ru/browse/PSBM-64123 > > Signed-off-by: Valeriy Vdovin > > (cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0) > Signed-off-by: Konstantin Khorenko Reviewed-by: Valeriy Vdovin > > v2: rebased to branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz branch > v3: added missing ve.h include > --- > fs/proc/array.c| 12 +++- > include/linux/sched.h | 7 ++- > include/linux/ve.h | 16 > include/uapi/linux/prctl.h | 6 ++ > kernel/fork.c | 12 > kernel/sys.c | 23 +++ > kernel/ve/ve.c | 2 ++ > 7 files changed, 68 insertions(+), 10 deletions(-) > > diff --git a/fs/proc/array.c b/fs/proc/array.c > index ba712f18e5ff..735876a51a18 100644 > --- a/fs/proc/array.c > +++ b/fs/proc/array.c > @@ -555,16 +555,10 @@ static int do_task_stat(struct seq_file *m, struct > pid_namespace *ns, > start_time = task->real_start_time; > > #ifdef CONFIG_VE > - if (!is_super) { > - u64 offset = get_exec_env()->real_start_time; > - start_time -= (unsigned long long)offset; > - } > - /* tasks inside a CT can have negative start time e.g. if the CT was > - * migrated from another hw node, in which case we will report 0 in > - * order not to confuse userspace */ > - if ((s64)start_time < 0) > - start_time = 0; > + if (!is_super) > + start_time = task->real_start_time_ct; > #endif > + > /* convert nsec -> ticks */ > start_time = nsec_to_clock_t(start_time); > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 19ca9cc0f3b9..9846553f7039 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -839,7 +839,6 @@ struct task_struct { > #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN > struct vtimevtime; > #endif > - > #ifdef CONFIG_NO_HZ_FULL > atomic_ttick_dep_mask; > #endif > @@ -853,6 +852,12 @@ struct task_struct { > /* Boot based time in nsecs: */ > u64 real_start_time; > > + /* > + * This is a Container-side copy of 'real_start_time' field > + * shown from inside of a Container and modified by host. 
> + */ > + u64 real_start_time_ct; > + > /* MM fault and swap info: this can arguably be seen as either > mm-specific or thread-specific: */ > unsigned long min_flt; > unsigned long maj_flt; > diff --git a/include/linux/ve.h b/include/linux/ve.h > index 3aa0ea0b1bab..ab8da4dceec1 100644 > --- a/include/linux/ve.h > +++ b/include/linux/ve.h > @@ -148,6 +148,22 @@ static u64 ve_get_uptime(struct ve_struct *ve) > return ktime_get_boot_ns() - ve->real_start_time; > } > > +static inline void ve_set_task_start_time(struct ve_struct *ve, > +
Re: [Devel] [PATCH rh8 v3 1/2] ve/time: Move ve_get_uptime() to header
On Mon, Dec 21, 2020 at 06:09:55PM +0300, Konstantin Khorenko wrote: > Will be used in ve.h in another function. > > Fixes: 9644a237d401 ("ve/vestat: Introduce /proc/vz/vestat") > > Signed-off-by: Konstantin Khorenko Reviewed-by: Valeriy Vdovin > --- > include/linux/ve.h | 5 + > kernel/ve/vecalls.c | 5 - > 2 files changed, 5 insertions(+), 5 deletions(-) > > diff --git a/include/linux/ve.h b/include/linux/ve.h > index 7cb416f342e7..3aa0ea0b1bab 100644 > --- a/include/linux/ve.h > +++ b/include/linux/ve.h > @@ -143,6 +143,11 @@ static inline struct ve_struct *css_to_ve(struct > cgroup_subsys_state *css) > > extern struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int > subsys_id); > > +static u64 ve_get_uptime(struct ve_struct *ve) > +{ > + return ktime_get_boot_ns() - ve->real_start_time; > +} > + > extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec64 > *tp); > extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec64 > *tp); > > diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c > index 786a743faa1a..f1cc04ee82da 100644 > --- a/kernel/ve/vecalls.c > +++ b/kernel/ve/vecalls.c > @@ -32,11 +32,6 @@ > #include > #include > > -static u64 ve_get_uptime(struct ve_struct *ve) > -{ > - return ktime_get_boot_ns() - ve->real_start_time; > -} > - > static int fill_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf) > { > struct ve_struct *ve; > -- > 2.28.0 > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 1/3 v2 RHEL7] ve/cgroup: change resource release order in ve_drop_context
This fixes 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c In the mentioned patch in cgroup_show_options ve->ve_ns is checked to ensure that ve->root_css_set is usable. But in ve_drop_context root_css_set is being released before ve_ns, which is a bug. root_css_set will now be set to NULL after ve_ns is released. This reordering only affects the described piece of code in cgroup_show_options. https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin --- kernel/ve/ve.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index db26cbd4..f61b1fe 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -595,9 +595,6 @@ static void ve_drop_context(struct ve_struct *ve) struct nsproxy *ve_ns = ve->ve_ns; struct net *net = ve->ve_netns; - put_css_set_taskexit(ve->root_css_set); - ve->root_css_set = NULL; - ve->ve_netns = NULL; put_net(net); @@ -606,6 +603,9 @@ static void ve_drop_context(struct ve_struct *ve) synchronize_rcu(); put_nsproxy(ve_ns); + put_css_set_taskexit(ve->root_css_set); + ve->root_css_set = NULL; + ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve); put_cred(ve->init_cred); -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 0/3 v2 RHEL7] cgroup: do not use cgroup_mutex in cgroup_show_options
The patchset fixes the lock order inversion problem https://jira.sw.ru/browse/PSBM-121438: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) It's split logically so that the first patch fixes the pre-existing incorrect resource release order in ve_drop_context, which was not behaving as expected in cgroup_show_options. In the next patch cgroup_show_options itself gets slightly rewritten to not use cgroup_mutex, which fixes the bug. One more patch is added to clean up the coding style of the function ve_get_release_agent_path, which is relevant to the current problem. v1: original patch v2: Added logic that depends on validity of ve->root_css_set instead of cgroup_mutex. Added coding-style fixes. Valeriy Vdovin (3): ve/cgroup: change resource release order in ve_drop_context cgroup: do not use cgroup_mutex in cgroup_show_options ve: cleanup in function ve_get_release_agent_path kernel/cgroup.c | 9 ++--- kernel/ve/ve.c | 7 --- 2 files changed, 6 insertions(+), 10 deletions(-) -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 2/3 v2 RHEL7] cgroup: do not use cgroup_mutex in cgroup_show_options
In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options started to lock cgroup_mutex, which introduced a new deadlock possibility, described below: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) Clearly cgroup_mutex can not be locked right after locking namespace_sem, because the opposite locking order is also present in the code, so the cgroup_mutex lock should be removed from cgroup_show_options. After reviewing cgroup_show_options, it was established that cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp. It was used in combination with a call to task_cgroup_from_root to ensure that root_cgrp lived long enough to access its value of release_agent path. But in this function we know that root_cgrp is part of ve->root_css_set, which holds a reference to it. In turn root_css_set is referenced while ve->ve_ns is not NULL, the check of which we already have in the code. This means that root_cgrp is valid as long as ve->ve_ns is valid. ve->ve_ns is valid until the point of synchronize_rcu in ve_drop_context, which is why rcu_read_lock should be held the whole time root_cgrp is being accessed. The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls to this function pass ve->root_css_set as an argument and the above logic applies.
https://jira.sw.ru/browse/PSBM-121438 Signed-off-by: Valeriy Vdovin --- kernel/cgroup.c | 9 ++--- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 27d7a5e..c7ed3c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -644,7 +644,6 @@ static struct cgroup *css_cgroup_from_root(struct css_set *css_set, struct cgroup *res = NULL; struct cg_cgroup_link *link; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); read_lock(&css_set_lock); list_for_each_entry(link, &css_set->cg_links, cg_link_list) { @@ -1100,7 +1099,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) struct cgroup_subsys *ss; struct cgroup *root_cgrp = &root->top_cgroup; - mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); @@ -1112,6 +1110,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); + rcu_read_lock(); #ifdef CONFIG_VE { struct ve_struct *ve = get_exec_env(); @@ -1124,15 +1123,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) * ve->init_task is synchronized via ve->ve_ns rcu, see * ve_grab_context/drop_context. */ - rcu_read_lock(); if (ve->ve_ns) - root_cgrp = task_cgroup_from_root(ve->init_task, + root_cgrp = css_cgroup_from_root(ve->root_css_set, root); - rcu_read_unlock(); } } #endif - rcu_read_lock(); release_agent = ve_get_release_agent_path(root_cgrp); if (release_agent && release_agent[0]) seq_show_option(seq, "release_agent", release_agent); @@ -1142,7 +1138,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) if (strlen(root->name)) seq_show_option(seq, "name", root->name); mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); return 0; } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 3/3 v2 RHEL7] ve: cleanup in function ve_get_release_agent_path
Signed-off-by: Valeriy Vdovin --- kernel/ve/ve.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index f61b1fe..482d658 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -226,6 +226,7 @@ const char *ve_get_release_agent_path(struct cgroup *cgroot) struct per_cgroot_data *data; struct cgroup_rcu_string *str; struct ve_struct *ve; + ve = rcu_dereference(cgroot->ve_owner); if (!ve) return NULL; -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH7] cgroup: do not use cgroup_mutex in cgroup_show_options
On 28.10.2020 13:18, Kirill Tkhai wrote: On 27.10.2020 17:59, Valeriy Vdovin wrote: In patch 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options started to lock cgroup_mutex, which introduced new deadlock possibility, described below: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) Clearly cgroup_mutex can not be locked right after locking namespace_sem, because opposite locking order is also present in the code. To get rid of cgroup_mutex it's synchonization role should transition to cgroup_root_mutex, that was created specifically to defeat the described locking order problem. In this location cgroup_mutex provided 2 guarantees: 1. Safe call for 'task_cgroup_from_root', that asserts cgroup_mutex ownership. 2. Safe read access to cgroup->ve_owner These guarantees are now transferred to cgroup_root_mutex in the following way: 1. task_cgroup_from_root assertion is modified to take into account cgroup_root_mutex as one of two possible locks. I don't see an explanation of cgroup_root_mutex is acceptable to call task_cgroup_from_root(). This check is about "you can use result of its function under that lock". cgroup_mutex guarantees the result is stable under cgroup_mutex, you should prove the same for cgroup_root_mutex. task_cgroup_from_root() is similar to css_cgroup_from_root(), but the second function does not accept cgroup_root_mutex. What is the difference? 2. cgroup->ve_owner field modifications are done with cgroup_root_mutex also locked. Which way? There is no cgroup_root_mutex in cgroup_unmark_ve_roots(). You are right. Thank you for pointing that out. 
More work should be done to ensure that cgroup_root_mutex could provide the right guarantees. While answering your questions I've found a better solution I think. I will revoke this whole patch and send a new one. https://jira.sw.ru/browse/PSBM-121438 Signed-Off-By: Valeriy Vdovin --- kernel/cgroup.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 27d7a5e..db6a5ba 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -669,7 +669,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct css_set *css; struct cgroup *res = NULL; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); + BUG_ON(!mutex_is_locked(&cgroup_mutex) && !mutex_is_locked(&cgroup_root_mutex)); read_lock(&css_set_lock); /* * No need to lock the task - since we hold cgroup_mutex the @@ -1100,7 +1100,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) struct cgroup_subsys *ss; struct cgroup *root_cgrp = &root->top_cgroup; - mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); @@ -1142,7 +1141,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) if (strlen(root->name)) seq_show_option(seq, "name", root->name); mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); return 0; } @@ -2529,6 +2527,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + mutex_lock(&cgroup_root_mutex); + /* * Call to cgroup_get_local_root is a way to make sure * that cgrp in the argument is valid "virtual root" @@ -2551,6 +2551,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, ret = -ENODEV; out: + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return ret; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7] cgroup: do not use cgroup_mutex in cgroup_show_options
In patch 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options started to lock cgroup_mutex, which introduced a new deadlock possibility, described below: Thread A: m_start() --> down_read(&namespace_sem); cgroup_show_options() --> mutex_lock(&cgroup_mutex); Thread B: attach_task_by_pid() cgroup_lock_live_group --> mutex_lock(&cgroup_mutex); threadgroup_lock() --> down_write(&tsk->signal->group_rwsem); Thread C: copy_process threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem); copy_namespaces create_new_namespaces copy_mnt_ns namespace_lock() --> down_write(&namespace_sem) Clearly cgroup_mutex can not be locked right after locking namespace_sem, because the opposite locking order is also present in the code. To get rid of cgroup_mutex its synchronization role should transition to cgroup_root_mutex, which was created specifically to defeat the described locking order problem. In this location cgroup_mutex provided 2 guarantees: 1. Safe call for 'task_cgroup_from_root', that asserts cgroup_mutex ownership. 2. Safe read access to cgroup->ve_owner These guarantees are now transferred to cgroup_root_mutex in the following way: 1. task_cgroup_from_root assertion is modified to take into account cgroup_root_mutex as one of two possible locks. 2. cgroup->ve_owner field modifications are done with cgroup_root_mutex also locked.
https://jira.sw.ru/browse/PSBM-121438 Signed-Off-By: Valeriy Vdovin --- kernel/cgroup.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 27d7a5e..db6a5ba 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -669,7 +669,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct css_set *css; struct cgroup *res = NULL; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); + BUG_ON(!mutex_is_locked(&cgroup_mutex) && !mutex_is_locked(&cgroup_root_mutex)); read_lock(&css_set_lock); /* * No need to lock the task - since we hold cgroup_mutex the @@ -1100,7 +1100,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) struct cgroup_subsys *ss; struct cgroup *root_cgrp = &root->top_cgroup; - mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); @@ -1142,7 +1141,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) if (strlen(root->name)) seq_show_option(seq, "name", root->name); mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); return 0; } @@ -2529,6 +2527,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + mutex_lock(&cgroup_root_mutex); + /* * Call to cgroup_get_local_root is a way to make sure * that cgrp in the argument is valid "virtual root" @@ -2551,6 +2551,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, ret = -ENODEV; out: + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return ret; } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7] ploop: zero-out block device statistics at ploop_stop
A ploop block device is represented by a block device file in /dev, but its lifecycle is separated from the file itself by PLOOP_IOC_START and PLOOP_IOC_STOP ioctls. This way a ploop file in /dev can be an empty placeholder after PLOOP_IOC_STOP ioctl and reinitialized later by a PLOOP_IOC_START. Because of that some of the important data structures stay allocated after stop and maintain old values until and after restart. This situation is also true for block device statistics that remain unchanged after the end of the ploop device lifecycle. A fresh-started ploop device is considered a new entity with stats equal to zero. To achieve that, we zero out stats at ploop_stop. https://jira.sw.ru/browse/PSBM-95605 Signed-off-by: Valeriy.Vdovin --- drivers/block/ploop/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index ac4d142..c54ff90 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -4373,6 +4373,9 @@ static int ploop_stop(struct ploop_device * plo, struct block_device *bdev) clear_bit(PLOOP_S_RUNNING, &plo->state); + part_stat_set_all(&plo->disk->part0, 0); + memset(&plo->st, 0, sizeof(plo->st)); + del_timer_sync(&plo->mitigation_timer); del_timer_sync(&plo->freeze_timer); -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7] KVM: LAPIC: Fix pv ipis use-before-initialization
From: Wanpeng Li Reported by syzkaller: BUG: unable to handle kernel NULL pointer dereference at 0014 PGD 80040410c067 P4D 80040410c067 PUD 40410d067 PMD 0 Oops: [#1] PREEMPT SMP PTI CPU: 3 PID: 2567 Comm: poc Tainted: G OE 4.19.0-rc5 #16 RIP: 0010:kvm_pv_send_ipi+0x94/0x350 [kvm] Call Trace: kvm_emulate_hypercall+0x3cc/0x700 [kvm] handle_vmcall+0xe/0x10 [kvm_intel] vmx_handle_exit+0xc1/0x11b0 [kvm_intel] vcpu_enter_guest+0x9fb/0x1910 [kvm] kvm_arch_vcpu_ioctl_run+0x35c/0x610 [kvm] kvm_vcpu_ioctl+0x3e9/0x6d0 [kvm] do_vfs_ioctl+0xa5/0x690 ksys_ioctl+0x6d/0x80 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x83/0x6e0 entry_SYSCALL_64_after_hwframe+0x49/0xbe The reason is that the apic map has not yet been initialized, the testcase triggers pv_send_ipi interface by vmcall which results in kvm->arch.apic_map is dereferenced. This patch fixes it by checking whether or not apic map is NULL and bailing out immediately if that is the case. Fixes: 4180bf1b65 (KVM: X86: Implement "send IPI" hypercall) Reported-by: Wei Wu Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Wei Wu Signed-off-by: Wanpeng Li Cc: sta...@vger.kernel.org Signed-off-by: Paolo Bonzini (cherry-picked from commit 38ab012f109caf10f471db1adf284e620dd8d701) https://jira.sw.ru/browse/PSBM-107931 Signed-off-by: Valeriy.Vdovin --- arch/x86/kvm/lapic.c | 5 + 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 740be89..f433199 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -566,6 +566,11 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (unlikely(!map)) { + count = -EOPNOTSUPP; + goto out; + } + if (min > map->max_apic_id) goto out; /* Bits above cluster_size are masked in the caller. */ -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] KVM: LAPIC: Fix pv ipis out-of-bounds access
From: Wanpeng Li Dan Carpenter reported that the untrusted data returns from kvm_register_read() results in the following static checker warning: arch/x86/kvm/lapic.c:576 kvm_pv_send_ipi() error: buffer underflow 'map->phys_map' 's32min-s32max' KVM guest can easily trigger this by executing the following assembly sequence in Ring0: mov $10, %rax mov $0x, %rbx mov $0x, %rdx mov $0, %rsi vmcall As this will cause KVM to execute the following code-path: vmx_handle_exit() -> handle_vmcall() -> kvm_emulate_hypercall() -> kvm_pv_send_ipi() which will reach out-of-bounds access. This patch fixes it by adding a check to kvm_pv_send_ipi() against map->max_apic_id, ignoring destinations that are not present and delivering the rest. We also check whether or not map->phys_map[min + i] is NULL since the max_apic_id is set to the max apic id, some phys_map maybe NULL when apic id is sparse, especially kvm unconditionally set max_apic_id to 255 to reserve enough space for any xAPIC ID. Reported-by: Dan Carpenter Reviewed-by: Liran Alon Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Liran Alon Cc: Dan Carpenter Signed-off-by: Wanpeng Li [Add second "if (min > map->max_apic_id)" to complete the fix. 
-Radim] Signed-off-by: Radim Krčmář (cherry picked from commit bdf7ffc89922a52a4f08a12f7421ea24bb7626a0) https://jira.sw.ru/browse/PSBM-107931 Signed-off-by: Valeriy Vdovin --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/lapic.c| 27 --- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 50817dc3..e9ee080 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1434,7 +1434,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, u64 kvm_get_arch_capabilities(void); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); void kvm_define_shared_msr(unsigned index, u32 msr); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1487fe2..740be89 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -543,7 +543,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, } int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { int i; @@ -566,18 +566,31 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (min > map->max_apic_id) + goto out; /* Bits above cluster_size are masked in the caller. 
*/ - for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + for_each_set_bit(i, &ipi_bitmap_low, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } min += cluster_size; - for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + + if (min > map->max_apic_id) + goto out; + + for_each_set_bit(i, &ipi_bitmap_high, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } +out: rcu_read_unlock(); return count; } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7] cgroup: fixed NULL-pointer dereference in cgroup_release_agent
The fix checks that ve->init_task is not referenced during warning message decision if ve == ve0, because ve0 init_task is always NULL. https://jira.sw.ru/browse/PSBM-107673 Signed-off-by: Valeriy Vdovin --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 691505c..27d7a5e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5934,7 +5934,7 @@ void cgroup_release_agent(struct work_struct *work) envp, UMH_WAIT_EXEC, NULL, NULL, NULL); ve_task = ve->init_task; - if (err < 0 && (!(ve_task->flags & PF_EXITING))) + if (err < 0 && (ve == &ve0 || !(ve_task->flags & PF_EXITING))) pr_warn_ratelimited("cgroup release_agent " "%s %s failed: %d\n", agentbuf, pathbuf, err); -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7] cgroup: Fixed null pointer dereference at cgroup_mount #PSBM-107596
At cgroup_mount new_root a call to cgroup_root_from_opts may return early and not allocate a new cgroup root object, instead returning NULL. In that case we should not initialize ve_owner field for cgroup root. https://jira.sw.ru/browse/PSBM-107596 Signed-off-by: Valeriy Vdovin --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5f311180..691505c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1749,6 +1749,9 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); + + RCU_INIT_POINTER(root->top_cgroup.ve_owner, &ve0); + return root; } @@ -1859,7 +1862,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto drop_modules; } - RCU_INIT_POINTER(new_root->top_cgroup.ve_owner, &ve0); opts.new_root = new_root; /* Locate an existing or new sb for this hierarchy */ -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7] mm: Fixing rwsem_is_contended conditional code in shrink_slab_memcg
Fixes commit 38afbd5ecdd6841b5e486e3c9dae05d961f084b5 that partially reverts code in shrink_slab_memcg by adding missing line. https://jira.sw.ru/browse/PSBM-99181 Signed-off-by: Valeriy Vdovin --- mm/vmscan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fa86e7..13ae9bd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -598,6 +598,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, map = memcg_nid_shrinker_map(memcg, nid); nr_max = min(shrinker_nr_max, map->nr_max); } else if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; break; } } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL v2] mm: Reduce access frequency to shrinker_rwsem during shrink_slab
Bug https://jira.sw.ru/browse/PSBM-99181 has introduced a problem: when the kernel has opened NFS delegations and NFS server is not accessible at the time when NFS shrinker is called, the whole shrinker list execution gets stuck until NFS server is back. Being a problem in itself it also introduces a bigger problem - during that hang, the shrinker_rwsem also gets locked, consequently no new mounts can be done at that time because a new superblock tries to register its own shrinker and also gets stuck at acquiring shrinker_rwsem. Commit 9e9e35d050955648449498827deb2d43be0564e1 is a workaround for that problem. It is known that during single shrinker execution we do not actually need to hold shrinker_rwsem so we release and reacquire the rwsem for each shrinker in the list. Because of this workaround shrink_slab function now experiences a major slowdown, because shrinker_rwsem gets accessed for each shrinker in the list twice. On an idle fresh-booted system shrinker_list could be iterated up to 1600 times a second, although originally the problem was local to only one NFS shrinker. This patch fixes commit 9e9e35d050955648449498827deb2d43be0564e1 in a way that before calling for up_read for shrinker_rwsem, we check that this is really an NFS shrinker by checking NFS magic in superblock, if it is accessible from shrinker. https://jira.sw.ru/browse/PSBM-99181 Co-authored-by: Andrey Ryabinin Signed-off-by: Valeriy Vdovin Changes: v2: Added missing 'rwsem_is_contended' check --- fs/super.c | 2 +- mm/vmscan.c | 65 ++--- 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/fs/super.c b/fs/super.c index f131d14..1cf377a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL(dcache_is_low); * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring.
*/ -static unsigned long super_cache_scan(struct shrinker *shrink, +unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; diff --git a/mm/vmscan.c b/mm/vmscan.c index d7082d2..4fa86e7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -453,6 +453,20 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } +unsigned long super_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); + +static inline bool is_nfs_shrinker(struct shrinker *shrinker) +{ + struct super_block *sb = container_of(shrinker, + struct super_block, s_shrink); + + if (shrinker->scan_objects == &super_cache_scan) + return sb->s_magic == NFS_SUPER_MAGIC; + + return false; +} + struct shrinker *get_shrinker(struct shrinker *shrinker) { /* @@ -511,6 +525,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, .memcg = memcg, }; struct shrinker *shrinker; + bool is_nfs; shrinker = idr_find(&shrinker_idr, i); if (unlikely(!shrinker)) { @@ -518,6 +533,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; } + is_nfs = is_nfs_shrinker(shrinker); + /* * Take a refcnt on a shrinker so that it can't be freed or * removed from shrinker_idr (and shrinker_list). These way we @@ -527,10 +544,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * take too much time to finish (e.g. on nfs). And holding * global shrinker_rwsem can block registring and unregistring * of shrinkers. +* +* The up_read logic should only be executed for nfs shrinker +* path, because it has proven to hang. For others it should be +* skipped to reduce performance penalties. 
*/ - if(!get_shrinker(shrinker)) - continue; - up_read(&shrinker_rwsem); + if (is_nfs) { + if (!get_shrinker(shrinker)) + continue; + up_read(&shrinker_rwsem); + } ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { @@ -565,14 +588,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * memcg_expand_one_shrinker_map if new shrinkers * were registred in the meanwhile. */ - if (!down_read_trylock(&shrinker_rwsem)) { - freed = freed ? : 1; + if (is_nfs) { +
[Devel] [PATCH RHEL7] mm: Reduce access frequency to shrinker_rwsem during shrink_slab
Bug https://jira.sw.ru/browse/PSBM-99181 has introduced a problem: when the kernel has opened NFS delegations and NFS server is not accessible at the time when NFS shrinker is called, the whole shrinker list execution gets stuck until NFS server is back. Being a problem in itself, it also introduces a bigger problem - during that hang, the shrinker_rwsem also gets locked, consequently no new mounts can be done at that time because new superblock tries to register its own shrinker and also gets stuck at acquiring shrinker_rwsem. Commit 9e9e35d050955648449498827deb2d43be0564e1 is a workaround for that problem. It is known that during single shrinker execution we do not actually need to hold shrinker_rwsem so we release and reacquire the rwsem for each shrinker in the list. Because of this workaround shrink_slab function now experiences a major slowdown, because shrinker_rwsem gets accessed for each shrinker in the list twice. On an idle fresh-booted system shrinker_list could be iterated up to 1600 times a second, although originally the problem was local to only one NFS shrinker. This patch fixes commit 9e9e35d050955648449498827deb2d43be0564e1 in a way that before calling for up_read for shrinker_rwsem, we check that this is really an NFS shrinker by checking NFS magic in superblock, if it is accessible from shrinker. https://jira.sw.ru/browse/PSBM-99181 Co-authored-by: Andrey Ryabinin Signed-off-by: Valeriy Vdovin --- fs/super.c | 2 +- mm/vmscan.c | 63 + 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/fs/super.c b/fs/super.c index f131d14..1cf377a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL(dcache_is_low); * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. 
*/ -static unsigned long super_cache_scan(struct shrinker *shrink, +unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; diff --git a/mm/vmscan.c b/mm/vmscan.c index d7082d2..4fb5d78 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -453,6 +453,20 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } +unsigned long super_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); + +static inline bool is_nfs_shrinker(struct shrinker *shrinker) +{ + struct super_block *sb = container_of(shrinker, + struct super_block, s_shrink); + + if (shrinker->scan_objects == &super_cache_scan) + return sb->s_magic == NFS_SUPER_MAGIC; + + return false; +} + struct shrinker *get_shrinker(struct shrinker *shrinker) { /* @@ -511,6 +525,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, .memcg = memcg, }; struct shrinker *shrinker; + bool is_nfs; shrinker = idr_find(&shrinker_idr, i); if (unlikely(!shrinker)) { @@ -518,6 +533,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; } + is_nfs = is_nfs_shrinker(shrinker); + /* * Take a refcnt on a shrinker so that it can't be freed or * removed from shrinker_idr (and shrinker_list). These way we @@ -527,10 +544,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * take too much time to finish (e.g. on nfs). And holding * global shrinker_rwsem can block registring and unregistring * of shrinkers. +* +* The up_read logic should only be executed for nfs shrinker +* path, because it has proven to hang. For others it should be +* skipped to reduce performance penalties. 
*/ - if(!get_shrinker(shrinker)) - continue; - up_read(&shrinker_rwsem); + if (is_nfs) { + if (!get_shrinker(shrinker)) + continue; + up_read(&shrinker_rwsem); + } ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { @@ -565,14 +588,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * memcg_expand_one_shrinker_map if new shrinkers * were registred in the meanwhile. */ - if (!down_read_trylock(&shrinker_rwsem)) { - freed = freed ? : 1; + if (is_nfs) { + if (!down_read_trylock(&shrinker_rwsem)) { +
[Devel] [PATCH RHEL7] KVM: X86: Implement "send IPI" hypercall
From: Wanpeng Li Using hypercall to send IPIs by one vmexit instead of one by one for xAPIC/x2APIC physical mode and one vmexit per-cluster for x2APIC cluster mode. Intel guest can enter x2apic cluster mode when interrupt remapping is enabled in qemu, however, latest AMD EPYC still just supports xapic mode which can get great improvement by Exit-less IPIs. This patchset lets a guest send multicast IPIs, with at most 128 destinations per hypercall in 64-bit mode and 64 vCPUs per hypercall in 32-bit mode. Hardware: Xeon Skylake 2.5GHz, 2 sockets, 40 cores, 80 threads, the VM is 80 vCPUs, IPI microbenchmark(https://lkml.org/lkml/2017/12/19/141): x2apic cluster mode, vanilla Dry-run: 0,2392199 ns Self-IPI: 6907514, 15027589 ns Normal IPI: 223910476, 251301666 ns Broadcast IPI: 0, 9282161150 ns Broadcast lock: 0, 8812934104 ns x2apic cluster mode, pv-ipi Dry-run: 0,2449341 ns Self-IPI: 6720360, 15028732 ns Normal IPI: 228643307, 255708477 ns Broadcast IPI: 0, 7572293590 ns => 22% performance boost Broadcast lock: 0, 8316124651 ns x2apic physical mode, vanilla Dry-run: 0,3135933 ns Self-IPI: 8572670, 17901757 ns Normal IPI: 226444334, 255421709 ns Broadcast IPI: 0,19845070887 ns Broadcast lock: 0,19827383656 ns x2apic physical mode, pv-ipi Dry-run: 0,2446381 ns Self-IPI: 6788217, 15021056 ns Normal IPI: 219454441, 249583458 ns Broadcast IPI: 0, 7806540019 ns => 154% performance boost Broadcast lock: 0, 9143618799 ns Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini (cherry-picked from commit 4180bf1b655a791a0a6ef93a2c762722c782) https://jira.sw.ru/browse/PSBM-104805 Signed-off-by: Valeriy.Vdovin Signed-off-by: Valeriy Vdovin --- Documentation/virtual/kvm/cpuid.txt | 4 Documentation/virtual/kvm/hypercalls.txt | 20 arch/x86/include/asm/kvm_host.h | 4 arch/x86/include/uapi/asm/kvm_para.h | 3 ++- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/lapic.c | 40 arch/x86/kvm/x86.c | 3 +++ include/uapi/linux/kvm_para.h| 2 ++ 8 
files changed, 77 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 3c65feb..41217d6 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -54,6 +54,10 @@ KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit || || before enabling paravirtualized || || spinlock support. -- +KVM_FEATURE_PV_SEND_IPI||11 || guest checks this feature bit + || || before using paravirtualized + || || send IPIs. +-- KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index 6b6dc98..8fa78a1 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -113,3 +113,23 @@ compute the CLOCK_REALTIME for its clock, at the same instant. Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. + +6. KVM_HC_SEND_IPI + +Architecture: x86 +Status: active +Purpose: Send IPIs to multiple vCPUs. + +a0: lower part of the bitmap of destination APIC IDs +a1: higher part of the bitmap of destination APIC IDs +a2: the lowest APIC ID in bitmap +a3: APIC ICR + +The hypercall lets a guest send multicast IPIs, with at most 128 +128 destinations per hypercall in 64-bit mode and 64 vCPUs per +hypercall in 32-bit mode. The destinations are represented by a +bitmap contained in the first two arguments (a0 and a1). Bit 0 of +a0 corresponds to the APIC ID in the third argument (a2), bit 1 +corresponds to the APIC ID a2+1, and so on. + +Returns the number of CPUs to which the IPI
[Devel] [PATCH RHEL v22 14/14] ve/cgroup: At cgroup_mark(unmark)_ve_roots skip non-virtualized roots
During container start there might be a situation when not all cgroup hierarchies get virtualized by container manager (like vzctl). By virtualizing a cgroup hierarchy I mean creation of sub-directory within a particular mounted cgroup. When container starts it looks in css set of its init process to list all affiliated cgroups and perform actions on each. But non-virtualized cgroups will also be present in init's css_set and they should not be touched from inside of any non root ve. Signed-off-by: Valeriy Vdovin --- kernel/cgroup.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3305032..aefe40b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4449,6 +4449,18 @@ static struct cftype *get_cftype_by_name(const char *name) } #ifdef CONFIG_VE +static inline bool is_virtualized_cgroup(struct cgroup *cgrp) +{ + lockdep_assert_held(&cgroup_mutex); + if (cgrp->root->subsys_mask) + return true; + + if (!strcmp(cgrp->root->name, "systemd")) + return true; + + return false; +} + int cgroup_mark_ve_roots(struct ve_struct *ve) { struct cgroup *cgrp, *tmp; @@ -4464,6 +4476,17 @@ int cgroup_mark_ve_roots(struct ve_struct *ve) mutex_lock(&cgroup_mutex); for_each_active_root(root) { cgrp = css_cgroup_from_root(ve->root_css_set, root); + + /* +* At container start, vzctl creates special cgroups to serve +* as virtualized cgroup roots. They are bind-mounted on top +* of original cgroup mount point in container namespace. But +* not all cgroup mounts undergo this procedure. We should +* skip cgroup mounts that are not virtualized. 
+*/ + if (!is_virtualized_cgroup(cgrp)) + continue; + rcu_assign_pointer(cgrp->ve_owner, ve); set_bit(CGRP_VE_ROOT, &cgrp->flags); @@ -4513,6 +4536,14 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) mutex_lock(&cgroup_mutex); for_each_active_root(root) { cgrp = css_cgroup_from_root(ve->root_css_set, root); + + /* +* For this line see comments in +* cgroup_mark_ve_roots +*/ + if (!is_virtualized_cgroup(cgrp)) + continue; + dget(cgrp->dentry); list_add_tail(&cgrp->cft_q_node, &pending); } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL v22 04/14] cgroup: exported __put_css_set and wrappers to cgroup.h
Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/cgroup.h | 20 kernel/cgroup.c| 20 +--- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b25d0e18..ac60aaed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -478,6 +478,26 @@ struct css_set { }; /* + * refcounted get/put for css_set objects + */ +extern void __put_css_set(struct css_set *cg, int taskexit); + +static inline void get_css_set(struct css_set *cg) +{ + atomic_inc(&cg->refcount); +} + +static inline void put_css_set(struct css_set *cg) +{ + __put_css_set(cg, 0); +} + +static inline void put_css_set_taskexit(struct css_set *cg) +{ + __put_css_set(cg, 1); +} + +/* * cgroup_map_cb is an abstract callback API for reporting map-valued * control files */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a91dd5f..ce576c5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -335,7 +335,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -static void __put_css_set(struct css_set *cg, int taskexit) +void __put_css_set(struct css_set *cg, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; @@ -384,24 +384,6 @@ static void __put_css_set(struct css_set *cg, int taskexit) } /* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cg) -{ - atomic_inc(&cg->refcount); -} - -static inline void put_css_set(struct css_set *cg) -{ - __put_css_set(cg, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cg) -{ - __put_css_set(cg, 1); -} - -/* * compare_css_sets - helper function for find_existing_css_set(). * @cg: candidate css_set being tested * @old_cg: existing css_set for a task -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL v22 11/14] ve/cgroup: set release_agent_path for root cgroups separately for each ve.
This is done so that each container could set its own release agent. Release agent information is now stored in per-cgroup-root data structure in ve. https://jira.sw.ru/browse/PSBM-83887 Signed-off-by: Valeriy Vdovin --- include/linux/cgroup.h | 3 -- include/linux/ve.h | 6 +++ kernel/cgroup.c| 100 - kernel/ve/ve.c | 72 +++ 4 files changed, 161 insertions(+), 20 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5f1460d..fc138c0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -429,9 +429,6 @@ struct cgroupfs_root { /* IDs for cgroups in this hierarchy */ struct ida cgroup_ida; - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; diff --git a/include/linux/ve.h b/include/linux/ve.h index 65413d5..b6662637 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -214,6 +214,12 @@ void do_update_load_avg_ve(void); void ve_add_to_release_list(struct cgroup *cgrp); void ve_rm_from_release_list(struct cgroup *cgrp); + +int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot, + const char *release_agent); + +const char *ve_get_release_agent_path(struct cgroup *cgrp_root); + extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa93cf2..aff369b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1092,9 +1092,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { + const char *release_agent; struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; + struct cgroup *root_cgrp = &root->top_cgroup; + mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); @@ -1106,14 +1109,37 @@ static int cgroup_show_options(struct 
seq_file *seq, struct dentry *dentry) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); +#ifdef CONFIG_VE + { + struct ve_struct *ve = get_exec_env(); + + if (!ve_is_super(ve)) { + /* +* ve->init_task is NULL in case when cgroup is accessed +* before ve_start_container has been called. +* +* ve->init_task is synchronized via ve->ve_ns rcu, see +* ve_grab_context/drop_context. +*/ + rcu_read_lock(); + if (ve->ve_ns) + root_cgrp = task_cgroup_from_root(ve->init_task, + root); + rcu_read_unlock(); + } + } +#endif + rcu_read_lock(); + release_agent = ve_get_release_agent_path(root_cgrp); + if (release_agent && release_agent[0]) + seq_show_option(seq, "release_agent", release_agent); + rcu_read_unlock(); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_show_option(seq, "name", root->name); mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); return 0; } @@ -1386,8 +1412,13 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) /* re-populate subsystem files */ cgroup_populate_dir(cgrp, false, added_mask); - if (opts.release_agent) - strcpy(root->release_agent_path, opts.release_agent); + if (opts.release_agent) { + struct cgroup *root_cgrp; + root_cgrp = cgroup_get_local_root(cgrp); + if (root_cgrp->ve_owner) + ret = ve_set_release_agent_path(root_cgrp, + opts.release_agent); + } out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -1549,8 +1580,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; ida_init(&root->cgroup_ida); -
[Devel] [PATCH RHEL v22 10/14] ve/cgroup: private per-cgroup-root data container
As long as each ve is internally attached to a particular css_set via its init_task, it's good to have container with parameters, which are common to each cgroup subsystem hierarchy, rooting from its virtual root. Signed-off-by: Valeriy Vdovin Reviewed-by: Kirill Tkhai --- include/linux/ve.h | 7 ++ kernel/ve/ve.c | 73 ++ 2 files changed, 80 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 4dbd216..65413d5 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -137,6 +137,13 @@ struct ve_struct { struct work_struct release_agent_work; /* +* List of data, private for each root cgroup in +* ve's css_set. +*/ + struct list_headper_cgroot_list; + struct raw_spinlock per_cgroot_list_lock; + + /* * All tasks, that belong to this ve, live * in cgroups, that are children to cgroups * that form this css_set. diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 9e6bb8b..f564dca 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -45,6 +45,14 @@ #include #include +struct per_cgroot_data { + struct list_head list; + /* +* data is related to this cgroup +*/ + struct cgroup *cgroot; +}; + extern struct kmapset_set sysfs_ve_perms_set; static struct kmem_cache *ve_cachep; @@ -92,6 +100,9 @@ struct ve_struct ve0 = { .release_list = LIST_HEAD_INIT(ve0.release_list), .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work, cgroup_release_agent), + .per_cgroot_list= LIST_HEAD_INIT(ve0.per_cgroot_list), + .per_cgroot_list_lock = __RAW_SPIN_LOCK_UNLOCKED( + ve0.per_cgroot_list_lock), }; EXPORT_SYMBOL(ve0); @@ -118,6 +129,52 @@ void put_ve(struct ve_struct *ve) } EXPORT_SYMBOL(put_ve); +static struct per_cgroot_data *per_cgroot_data_find_locked( + struct list_head *per_cgroot_list, struct cgroup *cgroot) +{ + struct per_cgroot_data *data; + + list_for_each_entry(data, per_cgroot_list, list) { + if (data->cgroot == cgroot) + return data; + } + return NULL; +} + +static inline struct per_cgroot_data *per_cgroot_get_or_create( + struct ve_struct *ve, 
struct cgroup *cgroot) +{ + struct per_cgroot_data *data, *other_data; + + raw_spin_lock(&ve->per_cgroot_list_lock); + data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + raw_spin_unlock(&ve->per_cgroot_list_lock); + + if (data) + return data; + + data = kzalloc(sizeof(struct per_cgroot_data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + raw_spin_lock(&ve->per_cgroot_list_lock); + other_data = per_cgroot_data_find_locked(&ve->per_cgroot_list, + cgroot); + + if (other_data) { + raw_spin_unlock(&ve->per_cgroot_list_lock); + kfree(data); + return other_data; + } + + data->cgroot = cgroot; + list_add(&data->list, &ve->per_cgroot_list); + + raw_spin_unlock(&ve->per_cgroot_list_lock); + return data; +} + struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id) { struct cgroup_subsys_state *css, *tmp; @@ -617,6 +674,18 @@ err_list: return err; } +static void ve_per_cgroot_free(struct ve_struct *ve) +{ + struct per_cgroot_data *data, *saved; + + raw_spin_lock(&ve->per_cgroot_list_lock); + list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) { + list_del_init(&data->list); + kfree(data); + } + raw_spin_unlock(&ve->per_cgroot_list_lock); +} + void ve_stop_ns(struct pid_namespace *pid_ns) { struct ve_struct *ve = current->task_ve; @@ -667,6 +736,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns) ve_workqueue_stop(ve); + ve_per_cgroot_free(ve); + /* * At this point all userspace tasks in container are dead. 
*/ @@ -740,6 +811,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg) INIT_WORK(&ve->release_agent_work, cgroup_release_agent); raw_spin_lock_init(&ve->release_list_lock); + raw_spin_lock_init(&ve->per_cgroot_list_lock); ve->_randomize_va_space = ve0._randomize_va_space; @@ -776,6 +848,7 @@ do_init: INIT_LIST_HEAD(&ve->ve_list); INIT_LIST_HEAD(&ve->devmnt_list); INIT_LIST_HEAD(&ve->release_list); + INIT_LIST_HEAD(&ve->per_cgroot_list); mutex_init(&ve->devmnt_mutex); #ifdef CONFIG_AIO -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel