On 28.07.2020 20:53, Valeriy Vdovin wrote:
> Each container will now have access to its own cgroup release_agent
> file.
> Creation:
>   Normally all cgroup files are created during a call to cgroup_create
>   by the cgroup_populate_dir function. It creates (or skips) each
>   cgroup file exactly once, and the created files immediately become
>   visible to userspace as filesystem objects.
>   Due to the specifics of the container creation process, it is not
>   possible to use the same code for 'release_agent' file creation. For
>   a VE to start operating, first a set of ordinary cgroups is created
>   for each subsystem, and only then is this set of newly created
>   cgroups converted to "virtual roots", so at the time cgroup_create
>   is executed there is no knowledge of whether or not the
>   "release_agent" file should be created. This information only
>   becomes available at the "conversion" step, which is the
>   cgroup_mark_ve_roots function. As the file is created dynamically in
>   a live cgroup, a rather delicate locking sequence is present in the
>   new code:
>     - each new "virtual root" cgroup has to get a "release_agent"
>       file, so each cgroup's directory needs to be locked during the
>       insertion by cgroup->dentry->d_inode->i_mutex.
>     - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>       (see cgroup_mount/cgroup_remount): they cannot be taken in the
>       order {lock(cgroup_mutex), lock(inode->i_mutex)}.
>     - to collect the list of cgroups that need to become virtual
>       roots, the cgroup_mutex lock is needed to iterate over the
>       active roots.
>     - to resolve this conflict we first collect the list of all
>       virtual root cgroups under the cgroup_mutex lock, then release
>       it, and only after that insert "release_agent" into each root
>       under the inode->i_mutex lock (see the sketch after this list).
>     - to collect the list of cgroups on the stack we use
>       cgroup->cft_q_node, introduced specifically for this purpose and
>       protected by its own cgroup_cft_mutex.
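
For reference, the sequence above condenses to roughly the following
sketch (names and locking as in cgroup_mark_ve_roots() from the patch
below; error handling and the ve_owner/CGRP_VE_ROOT marking are
omitted):

        /*
         * Phase 1: under cgroup_mutex, collect the would-be virtual
         * roots and pin their dentries; cft_q_node links them on a
         * local list serialized by cgroup_cft_mutex.
         */
        mutex_lock(&cgroup_cft_mutex);
        mutex_lock(&cgroup_mutex);
        for_each_active_root(root) {
                cgrp = css_cgroup_from_root(ve->root_css_set, root);
                dget(cgrp->dentry);
                list_add_tail(&cgrp->cft_q_node, &pending);
        }
        mutex_unlock(&cgroup_mutex);

        /*
         * Phase 2: with cgroup_mutex dropped, i_mutex can be taken
         * first, matching the i_mutex -> cgroup_mutex order used by
         * cgroup_mount()/cgroup_remount().
         */
        list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
                struct inode *inode = cgrp->dentry->d_inode;

                mutex_lock(&inode->i_mutex);
                mutex_lock(&cgroup_mutex);
                if (!cgroup_is_removed(cgrp))
                        cgroup_add_file(cgrp, NULL, cft); /* "release_agent" */
                mutex_unlock(&cgroup_mutex);
                mutex_unlock(&inode->i_mutex);

                list_del_init(&cgrp->cft_q_node);
                dput(cgrp->dentry);
        }
        mutex_unlock(&cgroup_cft_mutex);
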
> 
> Destruction:
>   Destruction is done in the reverse order of the above, within
>   cgroup_unmark_ve_roots.
>   After the file is destroyed we must prevent further write operations
>   to it in case someone opened the file prior to the VE and cgroup
>   destruction. This is achieved by checking whether the cgroup passed
>   to the cgroup_file_write function is a host or virtual root.
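
Concretely, a write through such a stale descriptor is rejected in the
handler roughly as follows (condensed from the cgroup_release_agent_write()
hunk below):

        root_cgrp = cgroup_get_local_root(cgrp);
        if (root_cgrp != cgrp) {
                /*
                 * cgroup_unmark_ve_roots() has already cleared
                 * CGRP_VE_ROOT, so cgrp no longer resolves to itself
                 * as a local root: refuse the write.
                 */
                ret = -EPERM;
                goto out;
        }
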
> 
> https://jira.sw.ru/browse/PSBM-83887
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>
> ---
>  include/linux/cgroup.h |  2 +-
>  include/linux/ve.h     |  2 +-
>  kernel/cgroup.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
>  kernel/ve/ve.c         |  6 +++-
>  4 files changed, 87 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index fc138c0..645c9fd 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -671,7 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>  void cgroup_release_agent(struct work_struct *work);
>  
>  #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve);
> +int cgroup_mark_ve_roots(struct ve_struct *ve);
>  void cgroup_unmark_ve_roots(struct ve_struct *ve);
>  struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
>  #endif
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index b6662637..5bf275f 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -215,7 +215,7 @@ void do_update_load_avg_ve(void);
>  void ve_add_to_release_list(struct cgroup *cgrp);
>  void ve_rm_from_release_list(struct cgroup *cgrp);
>  
> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
> +int ve_set_release_agent_path(struct cgroup *cgroot,
>       const char *release_agent);
>  
>  const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 1d9c889..bb77804 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -2360,13 +2360,28 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>       if (!cgroup_lock_live_group(cgrp))
>               return -ENODEV;
>  
> +     /*
> +      * The call to cgroup_get_local_root is a way to make sure
> +      * that the cgrp passed as an argument is a valid "virtual root"
> +      * cgroup. If it is not, cgroup_unmark_ve_roots has already
> +      * cleared the CGRP_VE_ROOT flag from this cgroup while the file
> +      * was still open in some other process, and we should not
> +      * allow further writes via that file.
> +      */
>       root_cgrp = cgroup_get_local_root(cgrp);
>       BUG_ON(!root_cgrp);
> +
> +     if (root_cgrp != cgrp) {
> +             ret = -EPERM;
> +             goto out;
> +     }
> +
>       if (root_cgrp->ve_owner)
>               ret = ve_set_release_agent_path(root_cgrp, buffer);
>       else
>               return -ENODEV;
>  
> +out:
>       mutex_unlock(&cgroup_mutex);
>       return ret;
>  }
> @@ -4204,7 +4219,7 @@ static struct cftype files[] = {
>       },
>       {
>               .name = "release_agent",
> -             .flags = CFTYPE_ONLY_ON_ROOT,
> +             .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
>               .read_seq_string = cgroup_release_agent_show,
>               .write_string = cgroup_release_agent_write,
>               .max_write_len = PATH_MAX,
> @@ -4422,39 +4437,98 @@ static int subgroups_count(struct cgroup *cgroup)
>       return cgrps_count;
>  }
>  
> +static struct cftype *get_cftype_by_name(const char *name)
> +{
> +     struct cftype *cft;
> +     for (cft = files; cft->name[0] != '\0'; cft++) {
> +             if (!strcmp(cft->name, name))
> +                     return cft;
> +     }
> +     return NULL;
> +}
> +
>  #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve)
> +int cgroup_mark_ve_roots(struct ve_struct *ve)
>  {
> -     struct cgroup *cgrp;
> +     struct cgroup *cgrp, *tmp;
>       struct cgroupfs_root *root;
> +     int err = 0;
> +     struct cftype *cft;
> +     LIST_HEAD(pending);
>  
> +     cft = get_cftype_by_name("release_agent");
> +     BUG_ON(!cft);
> +
> +     mutex_lock(&cgroup_cft_mutex);
>       mutex_lock(&cgroup_mutex);
>       for_each_active_root(root) {
> -             cgrp = task_cgroup_from_root(ve->init_task, root);
> +             cgrp = css_cgroup_from_root(ve->root_css_set, root);
>               rcu_assign_pointer(cgrp->ve_owner, ve);
>               set_bit(CGRP_VE_ROOT, &cgrp->flags);
>  
> +             dget(cgrp->dentry);
> +             list_add_tail(&cgrp->cft_q_node, &pending);
>               if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
>                       link_ve_root_cpu_cgroup(cgrp);
>       }
>       mutex_unlock(&cgroup_mutex);
>       synchronize_rcu();
> +     list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> +             struct inode *inode = cgrp->dentry->d_inode;
> +
> +             if (err) {
> +                     list_del_init(&cgrp->cft_q_node);
> +                     dput(cgrp->dentry);
> +                     continue;
> +             }

How does this work?

If cgroup_add_file() sets an error on a previous iteration, then we do a
double dput() here?

> +
> +             mutex_lock(&inode->i_mutex);
> +             mutex_lock(&cgroup_mutex);
> +             if (!cgroup_is_removed(cgrp))
> +                     err = cgroup_add_file(cgrp, NULL, cft);
> +             mutex_unlock(&cgroup_mutex);
> +             mutex_unlock(&inode->i_mutex);
> +
> +             list_del_init(&cgrp->cft_q_node);
> +             dput(cgrp->dentry);
> +     }
> +     mutex_unlock(&cgroup_cft_mutex);
> +
> +     if (err)
> +             cgroup_unmark_ve_roots(ve);
> +
> +     return err;
>  }
>  
>  void cgroup_unmark_ve_roots(struct ve_struct *ve)
>  {
> -     struct cgroup *cgrp;
> +     struct cgroup *cgrp, *tmp;
>       struct cgroupfs_root *root;
> +     struct cftype *cft;
> +     LIST_HEAD(pending);
> +
> +     cft = get_cftype_by_name("release_agent");
>  
> +     mutex_lock(&cgroup_cft_mutex);
>       mutex_lock(&cgroup_mutex);
>       for_each_active_root(root) {
>               cgrp = css_cgroup_from_root(ve->root_css_set, root);
> +             dget(cgrp->dentry);
> +             list_add_tail(&cgrp->cft_q_node, &pending);
> +     }
> +     mutex_unlock(&cgroup_mutex);
> +     list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> +             struct inode *inode = cgrp->dentry->d_inode;
> +             mutex_lock(&inode->i_mutex);
> +             mutex_lock(&cgroup_mutex);
> +             cgroup_rm_file(cgrp, cft);
>               BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
>                               lockdep_is_held(&cgroup_mutex)));
>               rcu_assign_pointer(cgrp->ve_owner, NULL);
>               clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> +             mutex_unlock(&cgroup_mutex);
> +             mutex_unlock(&inode->i_mutex);
>       }
> -     mutex_unlock(&cgroup_mutex);
> +     mutex_unlock(&cgroup_cft_mutex);
>       /* ve_owner == NULL will be visible */
>       synchronize_rcu();
>  
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index f03f665..8d78270 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -718,7 +718,9 @@ static int ve_start_container(struct ve_struct *ve)
>       if (err < 0)
>               goto err_iterate;
>  
> -     cgroup_mark_ve_roots(ve);
> +     err = cgroup_mark_ve_roots(ve);
> +     if (err)
> +             goto err_mark_ve;
>  
>       ve->is_running = 1;
>  
> @@ -728,6 +730,8 @@ static int ve_start_container(struct ve_struct *ve)
>  
>       return 0;
>  
> +err_mark_ve:
> +     ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>  err_iterate:
>       ve_workqueue_stop(ve);
>  err_workqueue:
> 
