On 2012/10/17 6:28, Tejun Heo wrote:
> cgroup core has a bug which violates a basic rule about event
> notifications - when a new entity needs to be added, you add that to
> the notification list first and then make the new entity conform to
> the current state.  If done in the reverse order, an event happening
> inbetween will be lost.
> 
> cgroup_subsys->fork() is invoked way before the new task is added to
> the css_set.  Currently, cgroup_freezer is the only user of ->fork()
> and uses it to make new tasks conform to the current state of the
> freezer.  If FROZEN state is requested while fork is in progress
> between cgroup_fork_callbacks() and cgroup_post_fork(), the child
> could escape freezing - the cgroup isn't frozen when ->fork() is
> called and the freezer couldn't see the new task on the css_set.
> 
> This patch moves cgroup_subsys->fork() invocation to
> cgroup_post_fork() after the new task is added to the css_set.
> cgroup_fork_callbacks() is removed.
> 
> Because now a task may be migrated during cgroup_subsys->fork(),
> freezer_fork() is updated so that it adheres to the usual RCU locking
> and the rather pointless comment on why locking can be different there
> is removed (if it doesn't make anything simpler, why even bother?).
> 

I don't think rcu read section is sufficient. It guarantees the data you're
accessing is valid, but the data can be new or can be old.

So a case below is possible:

in freezer_fork():
rcu_read_lock();
freezer = task_freezer(task);
                                  move task from freezer to freezer2
                                  which is in FREEZING/FROZEN state
freezer is in THAWED state,
nothing to do.
rcu_read_unlock();

> Signed-off-by: Tejun Heo <[email protected]>
> Cc: Oleg Nesterov <[email protected]>
> Cc: Rafael J. Wysocki <[email protected]>
> Cc: [email protected]
> ---
>  include/linux/cgroup.h  |    1 -
>  kernel/cgroup.c         |   62 ++++++++++++++++++++++------------------------
>  kernel/cgroup_freezer.c |   13 +++-------
>  kernel/fork.c           |    9 +------
>  4 files changed, 35 insertions(+), 50 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index f8a030c..4cd1d0f 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -34,7 +34,6 @@ extern int cgroup_lock_is_held(void);
>  extern bool cgroup_lock_live_group(struct cgroup *cgrp);
>  extern void cgroup_unlock(void);
>  extern void cgroup_fork(struct task_struct *p);
> -extern void cgroup_fork_callbacks(struct task_struct *p);
>  extern void cgroup_post_fork(struct task_struct *p);
>  extern void cgroup_exit(struct task_struct *p, int run_callbacks);
>  extern int cgroupstats_build(struct cgroupstats *stats,
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 13774b3..b7a0171 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4844,44 +4844,19 @@ void cgroup_fork(struct task_struct *child)
>  }
>  
>  /**
> - * cgroup_fork_callbacks - run fork callbacks
> - * @child: the new task
> - *
> - * Called on a new task very soon before adding it to the
> - * tasklist. No need to take any locks since no-one can
> - * be operating on this task.
> - */
> -void cgroup_fork_callbacks(struct task_struct *child)
> -{
> -     if (need_forkexit_callback) {
> -             int i;
> -             for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> -                     struct cgroup_subsys *ss = subsys[i];
> -
> -                     /*
> -                      * forkexit callbacks are only supported for
> -                      * builtin subsystems.
> -                      */
> -                     if (!ss || ss->module)
> -                             continue;
> -
> -                     if (ss->fork)
> -                             ss->fork(child);
> -             }
> -     }
> -}
> -
> -/**
>   * cgroup_post_fork - called on a new task after adding it to the task list
>   * @child: the task in question
>   *
> - * Adds the task to the list running through its css_set if necessary.
> - * Has to be after the task is visible on the task list in case we race
> - * with the first call to cgroup_iter_start() - to guarantee that the
> - * new task ends up on its list.
> + * Adds the task to the list running through its css_set if necessary and
> + * call the subsystem fork() callbacks.  Has to be after the task is
> + * visible on the task list in case we race with the first call to
> + * cgroup_iter_start() - to guarantee that the new task ends up on its
> + * list.
>   */
>  void cgroup_post_fork(struct task_struct *child)
>  {
> +     int i;
> +
>       /*
>        * use_task_css_set_links is set to 1 before we walk the tasklist
>        * under the tasklist_lock and we read it here after we added the child
> @@ -4910,7 +4885,30 @@ void cgroup_post_fork(struct task_struct *child)
>               }
>               write_unlock(&css_set_lock);
>       }
> +
> +     /*
> +      * Call ss->fork().  This must happen after @child is linked on
> +      * css_set; otherwise, @child might change state between ->fork()
> +      * and addition to css_set.
> +      */
> +     if (need_forkexit_callback) {
> +             for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> +                     struct cgroup_subsys *ss = subsys[i];
> +
> +                     /*
> +                      * fork/exit callbacks are supported only for
> +                      * builtin subsystems and we don't need further
> +                      * synchronization as they never go away.
> +                      */
> +                     if (!ss || ss->module)
> +                             continue;
> +
> +                     if (ss->fork)
> +                             ss->fork(child);
> +             }
> +     }
>  }
> +
>  /**
>   * cgroup_exit - detach cgroup from exiting task
>   * @tsk: pointer to task_struct of exiting process
> diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
> index b1724ce..12bfedb 100644
> --- a/kernel/cgroup_freezer.c
> +++ b/kernel/cgroup_freezer.c
> @@ -186,23 +186,15 @@ static void freezer_fork(struct task_struct *task)
>  {
>       struct freezer *freezer;
>  
> -     /*
> -      * No lock is needed, since the task isn't on tasklist yet,
> -      * so it can't be moved to another cgroup, which means the
> -      * freezer won't be removed and will be valid during this
> -      * function call.  Nevertheless, apply RCU read-side critical
> -      * section to suppress RCU lockdep false positives.
> -      */
>       rcu_read_lock();
>       freezer = task_freezer(task);
> -     rcu_read_unlock();
>  
>       /*
>        * The root cgroup is non-freezable, so we can skip the
>        * following check.
>        */
>       if (!freezer->css.cgroup->parent)
> -             return;
> +             goto out;
>  
>       spin_lock_irq(&freezer->lock);
>       BUG_ON(freezer->state == CGROUP_FROZEN);
> @@ -210,7 +202,10 @@ static void freezer_fork(struct task_struct *task)
>       /* Locking avoids race with FREEZING -> THAWED transitions. */
>       if (freezer->state == CGROUP_FREEZING)
>               freeze_task(task);
> +
>       spin_unlock_irq(&freezer->lock);
> +out:
> +     rcu_read_unlock();
>  }
>  
>  /*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 8b20ab7..acc4cb6 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1135,7 +1135,6 @@ static struct task_struct *copy_process(unsigned long 
> clone_flags,
>  {
>       int retval;
>       struct task_struct *p;
> -     int cgroup_callbacks_done = 0;
>  
>       if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
>               return ERR_PTR(-EINVAL);
> @@ -1393,12 +1392,6 @@ static struct task_struct *copy_process(unsigned long 
> clone_flags,
>       INIT_LIST_HEAD(&p->thread_group);
>       p->task_works = NULL;
>  
> -     /* Now that the task is set up, run cgroup callbacks if
> -      * necessary. We need to run them before the task is visible
> -      * on the tasklist. */
> -     cgroup_fork_callbacks(p);
> -     cgroup_callbacks_done = 1;
> -
>       /* Need tasklist lock for parent etc handling! */
>       write_lock_irq(&tasklist_lock);
>  
> @@ -1503,7 +1496,7 @@ bad_fork_cleanup_cgroup:
>  #endif
>       if (clone_flags & CLONE_THREAD)
>               threadgroup_change_end(current);
> -     cgroup_exit(p, cgroup_callbacks_done);
> +     cgroup_exit(p, 0);
>       delayacct_tsk_free(p);
>       module_put(task_thread_info(p)->exec_domain->module);
>  bad_fork_cleanup_count:
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to