On Thu, Jun 21, 2012 at 10:23:02AM +0200, Mike Galbraith wrote:
> On Thu, 2012-06-21 at 11:54 +0400, Alexey Vlasov wrote: 
> > On Wed, Jun 20, 2012 at 02:28:18PM +0200, Mike Galbraith wrote:
> > > 
> > > kernel/cgroup.c::cgroup_attach_task()
> > > {
> > > ...
> > >   synchronize_rcu();
> > > ...
> > > }
> > 
> > So nothing can be done here? (I mean if only I knew how to fix it I
> > wouldn't ask about it ;)
> 
> Sure, kill the obnoxious thing, it's sitting right in the middle of the
> userspace interface.
> 
> I banged on it a while back (wrt explosive android patches), extracted
> RCU from the userspace interface.  It seemed to work great, much faster,
> couldn't make it explode.  I wouldn't bet anything I wasn't willing to
> immediately part with that the result was really really safe though ;-)
> 
> -Mike

JFYI,

I'm testing the following patch in a bunch of hosts and I wasn't able to
make any of them to explode, even running a multi-threaded
cgroup-intensive workload, but probably I was just lucky (or unlucky,
depending on the point of view).

It is basically the same Not-signed-off-by work posted by Mike a while
ago: https://lkml.org/lkml/2011/4/12/599.

In addition, I totally removed the synchronize_rcu() call from
cgroup_attach_task() and added the call_rcu -> schedule_work removal
also for css_set. The latter looks unnecessary to me from a logical
point of view, or maybe I'm missing something, because I can't explain
why with it I can't trigger any BUG / oops.

Mike, did you make any progress from your old patch?

Thanks,
-Andrea

---
 include/linux/cgroup.h |    2 ++
 kernel/cgroup.c        |   91 +++++++++++++++++++++++++++++-------------------
 2 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d3f5fba..2bab2d6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -212,6 +212,7 @@ struct cgroup {
 
        /* For RCU-protected deletion */
        struct rcu_head rcu_head;
+       struct work_struct work;
 
        /* List of events which userspace want to receive */
        struct list_head event_list;
@@ -260,6 +261,7 @@ struct css_set {
 
        /* For RCU-protected deletion */
        struct rcu_head rcu_head;
+       struct work_struct work;
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b303dfc..aa7cc06 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -396,6 +396,21 @@ static struct hlist_head *css_set_hash(struct 
cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
+static void free_css_set_work(struct work_struct *work)
+{
+       struct css_set *cg = container_of(work, struct css_set, work);
+
+       kfree(cg);
+}
+
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+       struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+       INIT_WORK(&cg->work, free_css_set_work);
+       schedule_work(&cg->work);
+}
+
 static void __put_css_set(struct css_set *cg, int taskexit)
 {
        struct cg_cgroup_link *link;
@@ -433,7 +448,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
        }
 
        write_unlock(&css_set_lock);
-       kfree_rcu(cg, rcu_head);
+       call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
 /*
@@ -875,44 +890,52 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
        return ret;
 }
 
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void free_cgroup_work(struct work_struct *work)
 {
-       /* is dentry a directory ? if so, kfree() associated cgroup */
-       if (S_ISDIR(inode->i_mode)) {
-               struct cgroup *cgrp = dentry->d_fsdata;
-               struct cgroup_subsys *ss;
-               BUG_ON(!(cgroup_is_removed(cgrp)));
-               /* It's possible for external users to be holding css
-                * reference counts on a cgroup; css_put() needs to
-                * be able to access the cgroup after decrementing
-                * the reference count in order to know if it needs to
-                * queue the cgroup to be handled by the release
-                * agent */
-               synchronize_rcu();
+       struct cgroup *cgrp = container_of(work, struct cgroup, work);
+       struct cgroup_subsys *ss;
 
-               mutex_lock(&cgroup_mutex);
-               /*
-                * Release the subsystem state objects.
-                */
-               for_each_subsys(cgrp->root, ss)
-                       ss->destroy(cgrp);
+       mutex_lock(&cgroup_mutex);
+       /*
+        * Release the subsystem state objects.
+        */
+       for_each_subsys(cgrp->root, ss)
+               ss->destroy(cgrp);
 
-               cgrp->root->number_of_cgroups--;
-               mutex_unlock(&cgroup_mutex);
+       cgrp->root->number_of_cgroups--;
+       mutex_unlock(&cgroup_mutex);
 
-               /*
-                * Drop the active superblock reference that we took when we
-                * created the cgroup
-                */
-               deactivate_super(cgrp->root->sb);
+       /*
+        * Drop the active superblock reference that we took when we
+        * created the cgroup
+        */
+       deactivate_super(cgrp->root->sb);
 
-               /*
-                * if we're getting rid of the cgroup, refcount should ensure
-                * that there are no pidlists left.
-                */
-               BUG_ON(!list_empty(&cgrp->pidlists));
+       /*
+        * if we're getting rid of the cgroup, refcount should ensure
+        * that there are no pidlists left.
+        */
+       BUG_ON(!list_empty(&cgrp->pidlists));
+
+       kfree(cgrp);
+}
+
+static void free_cgroup_rcu(struct rcu_head *obj)
+{
+       struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+
+       INIT_WORK(&cgrp->work, free_cgroup_work);
+       schedule_work(&cgrp->work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+       /* is dentry a directory ? if so, kfree() associated cgroup */
+       if (S_ISDIR(inode->i_mode)) {
+               struct cgroup *cgrp = dentry->d_fsdata;
 
-               kfree_rcu(cgrp, rcu_head);
+               BUG_ON(!(cgroup_is_removed(cgrp)));
+               call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
        } else {
                struct cfent *cfe = __d_cfe(dentry);
                struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -1990,8 +2013,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct 
task_struct *tsk)
                        ss->attach(cgrp, &tset);
        }
 
-       synchronize_rcu();
-
        /*
         * wake up rmdir() waiter. the rmdir should fail since the cgroup
         * is no longer empty.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to