[PATCH 2/4] cgroup: remove redundant code in cgroup_rmdir()

2014-09-17 Thread Li Zefan
We no longer clear kn->priv in cgroup_rmdir(), so we don't need
to get an extra refcnt.

Signed-off-by: Zefan Li <lize...@huawei.com>
---
 kernel/cgroup.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0ce9d9e..26b8cb9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4841,13 +4841,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
-   cgroup_get(cgrp);   /* for @kn->priv clearing */
 
ret = cgroup_destroy_locked(cgrp);
 
cgroup_kn_unlock(kn);
-
-   cgroup_put(cgrp);
return ret;
 }
 
-- 
1.8.0.2


[PATCH 4/4] cgroup: reuse css->destroy_work for release agent

2014-09-17 Thread Li Zefan
Currently we use a global work to schedule release agent on removable
cgroups. We can change to reuse css->destroy_work to do this, which
saves a few lines of code.
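
For reference, here is the ref-pinning pattern this change depends on, reduced to a self-contained userspace sketch with C11 atomics. All names below are hypothetical stand-ins, not the kernel API; the one kernel fact relied on is that queue_work() returns false when the work item is already pending, which is why the extra reference is dropped in that case (the real code pins the cgroup with cgroup_tryget(), since it may already be dying).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical object with an embedded, reusable work item. */
struct work { atomic_bool queued; };
struct obj  { atomic_int ref; struct work destroy_work; };

/* Mimics queue_work(): fails if the item is already pending. */
static bool queue_work_once(struct work *w)
{
	return !atomic_exchange(&w->queued, true);
}

static void get(struct obj *o) { atomic_fetch_add(&o->ref, 1); }

static void put(struct obj *o)
{
	if (atomic_fetch_sub(&o->ref, 1) == 1)
		printf("object freed\n");
}

/* The pattern the patch adopts: pin the object before queueing;
 * if the work was already queued, the earlier queuer's reference
 * is enough, so drop ours right away. */
static void check_for_release(struct obj *o)
{
	get(o);
	if (!queue_work_once(&o->destroy_work))
		put(o);
}

/* The worker clears the pending flag, does its job, then drops
 * the reference taken by the successful queuer. */
static void run_work(struct obj *o)
{
	atomic_store(&o->destroy_work.queued, false);
	put(o);
}

int main(void)
{
	struct obj o = { .ref = 1 };

	check_for_release(&o);	/* queued; worker now owns a reference */
	check_for_release(&o);	/* already queued; extra ref dropped */
	run_work(&o);
	put(&o);		/* drop the initial reference -> freed */
	return 0;
}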

Signed-off-by: Zefan Li <lize...@huawei.com>
---
 include/linux/cgroup.h |   7 
 kernel/cgroup.c        | 108 ++---
 2 files changed, 39 insertions(+), 76 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f7898e0..97da407 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -234,13 +234,6 @@ struct cgroup {
struct list_head e_csets[CGROUP_SUBSYS_COUNT];
 
/*
-* Linked list running through all cgroups that can
-* potentially be reaped by the release agent. Protected by
-* release_list_lock
-*/
-   struct list_head release_list;
-
-   /*
 * list of pidlists, up to two for each namespace (one for procs, one
 * for tasks); created on demand.
 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1abb554..5b6566c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -392,12 +392,7 @@ static int notify_on_release(const struct cgroup *cgrp)
;   \
else
 
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
 static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
 /*
@@ -1577,7 +1572,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
-   INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
@@ -1587,6 +1581,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
init_waitqueue_head(&cgrp->offline_waitq);
+   INIT_WORK(&cgrp->self.destroy_work, cgroup_release_agent);
 }
 
 static void init_cgroup_root(struct cgroup_root *root,
@@ -4804,12 +4799,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
 
-   /* CSS_ONLINE is clear, remove from ->release_list for the last time */
-   raw_spin_lock(&release_list_lock);
-   if (!list_empty(&cgrp->release_list))
-   list_del_init(&cgrp->release_list);
-   raw_spin_unlock(&release_list_lock);
-
/*
 * Remove @cgrp directory along with the base files.  @cgrp has an
 * extra ref on its kn.
@@ -5274,21 +5263,14 @@ static void check_for_release(struct cgroup *cgrp)
if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
!css_has_online_children(&cgrp->self)) {
/*
-* Control Group is currently removeable. If it's not
-* already queued for a userspace notification, queue
-* it now
+* get a reference, so the cgroup can only be freed
+* after the release work is done.
 */
-   int need_schedule_work = 0;
+   if (!cgroup_tryget(cgrp))
+   return;
 
-   raw_spin_lock(&release_list_lock);
-   if (!cgroup_is_dead(cgrp) &&
-   list_empty(&cgrp->release_list)) {
-   list_add(&cgrp->release_list, &release_list);
-   need_schedule_work = 1;
-   }
-   raw_spin_unlock(&release_list_lock);
-   if (need_schedule_work)
-   schedule_work(&release_agent_work);
+   if (!queue_work(cgroup_destroy_wq, &cgrp->self.destroy_work))
+   cgroup_put(cgrp);
}
 }
 
@@ -5317,52 +5299,40 @@ static void check_for_release(struct cgroup *cgrp)
  */
 static void cgroup_release_agent(struct work_struct *work)
 {
-   BUG_ON(work != &release_agent_work);
+   struct cgroup_subsys_state *css =
+   container_of(work, struct cgroup_subsys_state, destroy_work);
+   struct cgroup *cgrp = css->cgroup;
+   char *pathbuf = NULL, *agentbuf = NULL, *path;
+   char *argv[3], *envp[3];
+
mutex_lock(&cgroup_mutex);
-   raw_spin_lock(&release_list_lock);
-   while (!list_empty(&release_list)) {
-   char *argv[3], *envp[3];
-   int i;
-   char *pathbuf = NULL, *agentbuf = NULL, *path;
-   struct cgroup *cgrp = list_entry(release_list.next,
-   struct cgroup,
-   release_list);
-   list_del_init(&cgrp->release_list);
-   raw_spin_unlock(&release_list_lock);
-   pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-   if (!pathbuf)
-   goto 

[PATCH 3/4] cgroup: remove bogus comments

2014-09-17 Thread Li Zefan
We never grab cgroup mutex in fork and exit paths no matter whether
notify_on_release is set or not.

Signed-off-by: Zefan Li <lize...@huawei.com>
---
 kernel/cgroup.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 26b8cb9..1abb554 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -967,14 +967,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * knows that the cgroup won't be removed, as cgroup_rmdir()
  * needs that mutex.
  *
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex.  These are the two most performance
- * critical pieces of code here.  The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
  * A cgroup can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cgroups is empty.  Since all
  * tasks in the system use _some_ cgroup, and since there is always at
-- 
1.8.0.2


[PATCH 1/4] cgroup: remove some useless forward declarations

2014-09-17 Thread Li Zefan

Signed-off-by: Zefan Li <lize...@huawei.com>
---
 include/linux/cgroup.h | 1 -
 kernel/cgroup.c        | 2 --
 2 files changed, 3 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b5223c5..f7898e0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,7 +27,6 @@
 
 struct cgroup_root;
 struct cgroup_subsys;
-struct inode;
 struct cgroup;
 
 extern int cgroup_init_early(void);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 940aced..0ce9d9e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
-static void cgroup_put(struct cgroup *cgrp);
 static int rebind_subsystems(struct cgroup_root *dst_root,
 unsigned int ss_mask);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
  bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /* IDR wrappers which synchronize using cgroup_idr_lock */
 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
-- 
1.8.0.2


Re: Kernel crash in cgroup_pidlist_destroy_work_fn()

2014-09-17 Thread Li Zefan
On 2014/9/17 13:29, Li Zefan wrote:
> On 2014/9/17 7:56, Cong Wang wrote:
>> Hi, Tejun
>>
>>
>> We saw a kernel NULL pointer dereference in
>> cgroup_pidlist_destroy_work_fn(), more precisely at
>> __mutex_lock_slowpath(), on 3.14. I can show you the full stack trace
>> on request.
>>
> 
> Yes, please.
> 
>> Looking at the code, it seems flush_workqueue() doesn't care about new
>> incoming works, it only processes currently pending ones, if this is
>> correct, then we could have the following race condition:
>>
>> cgroup_pidlist_destroy_all():
>> //...
>> mutex_lock(&cgrp->pidlist_mutex);
>> list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
>> mod_delayed_work(cgroup_pidlist_destroy_wq,
>> &l->destroy_dwork, 0);
>> mutex_unlock(&cgrp->pidlist_mutex);
>>
>> // <--- another process calls cgroup_pidlist_start() here
>> since mutex is released
>>
>> flush_workqueue(cgroup_pidlist_destroy_wq); // <--- another
>> process adds a new pidlist and queues work in parallel
>> BUG_ON(!list_empty(&cgrp->pidlists)); // <--- This check is
>> passed, list_add() could happen after this
>>
> 
> Did you confirm this is what happened when the bug was triggered?
> 
> I don't think the race condition you described exists. In 3.14 kernel,
> cgroup_diput() won't be called if there is any thread running
> cgroup_pidlist_start(). This is guaranteed by vfs.
> 
> But newer kernels are different. Looks like the bug exists in those
> kernels.
> 

Newer kernels should also be fine.

If cgroup_pidlist_destroy_all() is called, it means kernfs has already
removed the tasks file, and even if you still have it open, when you
try to read it, it will immediately return an errno.

fd = open(cgrp/tasks)
cgroup_rmdir(cgrp)
  cgroup_destroy_locked(c)
kernfs_remove()
  ...
css_free_work_fn()
  cgroup_pidlist_destroy_all()
   read(fd of cgrp/tasks)
 return -ENODEV

So cgroup_pidlist_destroy_all() won't race with cgroup_pidlist_start().
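
The same can be observed from userspace. A small demo, assuming a writable cgroup v1 hierarchy (the mount path below is an assumption; adjust it for your system) and root privileges; the exact errno may differ between kernel versions, the point is that the read fails instead of reaching freed data:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *dir = "/sys/fs/cgroup/cpu/demo";	/* assumed v1 mount */
	char path[256], buf[64];
	int fd;
	ssize_t n;

	if (mkdir(dir, 0755) && errno != EEXIST) { perror("mkdir"); return 1; }
	snprintf(path, sizeof(path), "%s/tasks", dir);

	fd = open(path, O_RDONLY);	/* keep the tasks file open... */
	if (fd < 0) { perror("open"); return 1; }
	if (rmdir(dir)) { perror("rmdir"); return 1; }	/* ...then remove the cgroup */

	n = read(fd, buf, sizeof(buf));	/* kernfs fails this immediately */
	printf("read after rmdir: %zd (%s)\n", n, n < 0 ? strerror(errno) : "ok");
	close(fd);
	return 0;
}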


Re: Kernel crash in cgroup_pidlist_destroy_work_fn()

2014-09-16 Thread Li Zefan
On 2014/9/17 7:56, Cong Wang wrote:
> Hi, Tejun
> 
> 
> We saw a kernel NULL pointer dereference in
> cgroup_pidlist_destroy_work_fn(), more precisely at
> __mutex_lock_slowpath(), on 3.14. I can show you the full stack trace
> on request.
> 

Yes, please.

> Looking at the code, it seems flush_workqueue() doesn't care about new
> incoming works, it only processes currently pending ones, if this is
> correct, then we could have the following race condition:
> 
> cgroup_pidlist_destroy_all():
> //...
> mutex_lock(&cgrp->pidlist_mutex);
> list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
> mod_delayed_work(cgroup_pidlist_destroy_wq,
> &l->destroy_dwork, 0);
> mutex_unlock(&cgrp->pidlist_mutex);
> 
> // <--- another process calls cgroup_pidlist_start() here
> since mutex is released
> 
> flush_workqueue(cgroup_pidlist_destroy_wq); // <--- another
> process adds a new pidlist and queues work in parallel
> BUG_ON(!list_empty(&cgrp->pidlists)); // <--- This check is
> passed, list_add() could happen after this
> 

Did you confirm this is what happened when the bug was triggered?

I don't think the race condition you described exists. In 3.14 kernel,
cgroup_diput() won't be called if there is any thread running
cgroup_pidlist_start(). This is guaranteed by vfs.

But newer kernels are different. Looks like the bug exists in those
kernels.

> 
> Therefore, the newly added pidlist will point to a freed cgroup, and
> when it is freed in the delayed work we will crash.
> 
> The attached patch (compile test ONLY) could be a possible fix, since
> it will check and hold a refcount on this cgroup in
> cgroup_pidlist_start(). But I could very easily miss something here
> since there are many cgroup changes after 3.14 and I don't follow
> cgroup development.
> 
> What do you think?
> 


Re: cgroups/netfilter : kernel NULL pointer BUG at 00000038

2014-09-14 Thread Li Zefan
I think this is the same bug as the one you reported recently, which
has been fixed in mainline.

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a4189487da1b4f8260c6006b9dc47c3c4107a5ae

On 2014/9/13 0:19, Toralf Förster wrote:
> Today I observed within a 32 bit KVM machine (stable Gentoo x86 Linux) the 
> following :
> 
> Sep 12 18:14:37 n22kvmclone kernel: [   37.964900] ip_tables: (C) 2000-2006 
> Netfilter Core Team
> Sep 12 18:14:38 n22kvmclone kernel: [   38.412110] nf_conntrack version 0.5.0 
> (16384 buckets, 65536 max)
> Sep 12 18:14:38 n22kvmclone kernel: [   39.032978] [ cut here 
> ]
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033009] WARNING: CPU: 0 PID: 1632 
> at kernel/cgroup.c:1034 cgroup_get+0x91/0xb0()
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033013] Modules linked in: 
> xt_NFLOG xt_limit ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_recent 
> xt_conntrack nf_conntrack iptable_filter ip_tables af_packet dm_crypt dm_mod 
> usbhid mousedev uhci_hcd ehci_pci microcode psmouse ehci_hcd evdev usbcore 
> atkbd usb_common virtio_console processor button
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033104] CPU: 0 PID: 1632 Comm: 
> runscript.sh Not tainted 3.17.0-rc4 #18
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033107] Hardware name: QEMU 
> Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033110]    
> f4b33e54 cc04b292  f4b33e84 cbc46d64 cc18f11c
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033119]   0660 
> cc19bcc5 040a cbcb5841 cbcb5841 f56de400 f4963688
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033128]  f4983cb8 f4b33e94 
> cbc46da2 0009  f4b33eb0 cbcb5841 cbe59246
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033137] Call Trace:
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033163]  [<cc04b292>] 
> dump_stack+0x41/0x52
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033179]  [<cbc46d64>] 
> warn_slowpath_common+0x84/0xa0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033186]  [<cbcb5841>] ? 
> cgroup_get+0x91/0xb0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033191]  [<cbcb5841>] ? 
> cgroup_get+0x91/0xb0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033197]  [<cbc46da2>] 
> warn_slowpath_null+0x22/0x30
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033202]  [<cbcb5841>] 
> cgroup_get+0x91/0xb0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033223]  [<cbe59246>] ? 
> kstrtoll+0x16/0x70
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033230]  [<cbcb5d3d>] 
> cgroup_kn_lock_live+0x2d/0x70
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033235]  [<cbcb8386>] 
> __cgroup_procs_write.isra.26+0x56/0x240
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033240]  [<cbcb8570>] ? 
> __cgroup_procs_write.isra.26+0x240/0x240
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033245]  [<cbcb8587>] 
> cgroup_tasks_write+0x17/0x20
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033250]  [<cbcb5645>] 
> cgroup_file_write+0x45/0x140
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033256]  [<cbcb5600>] ? 
> kill_css+0xd0/0xd0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033275]  [<cbda8d21>] 
> kernfs_fop_write+0xd1/0x160
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033281]  [<cbda8c50>] ? 
> kernfs_vma_page_mkwrite+0x90/0x90
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033291]  [<cbd49bbd>] 
> vfs_write+0x9d/0x1e0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033297]  [<cbda8c50>] ? 
> kernfs_vma_page_mkwrite+0x90/0x90
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033306]  [<cbd64522>] ? 
> __fdget+0x12/0x20
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033311]  [<cbd4a0b2>] 
> SyS_write+0x52/0xa0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033321]  [<cc0522ab>] 
> sysenter_do_call+0x12/0x12
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033325] ---[ end trace 
> f3513225d53cf0f3 ]---
> Sep 12 18:14:38 n22kvmclone kernel: [   39.036277] BUG: unable to handle 
> kernel NULL pointer dereference at 00000038
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] IP: [<cbcb5c2c>] 
> cgroup_put+0xc/0x90
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] *pde = 
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] Oops:  [#1] SMP
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] Modules linked in: 
> xt_NFLOG xt_limit ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_recent 
> xt_conntrack nf_conntrack iptable_filter ip_tables af_packet dm_crypt dm_mod 
> usbhid mousedev uhci_hcd ehci_pci microcode psmouse ehci_hcd evdev usbcore 
> atkbd usb_common virtio_console processor button
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] CPU: 0 PID: 1632 Comm: 
> runscript.sh Tainted: GW  3.17.0-rc4 #18
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] Hardware name: QEMU 
> Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] task: f6216390 ti: 
> f4b32000 task.ti: f4b32000
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] EIP: 0060:[<cbcb5c2c>] 
> EFLAGS: 00010282 CPU: 0
> Sep 12 18:14:38 n22kvmclone 


Re: [kernel.org PATCH] Li Zefan is now the 3.4 stable maintainer

2014-09-09 Thread Li Zefan
On 2014/9/5 21:58, Guenter Roeck wrote:
> On 09/05/2014 12:55 AM, Li Zefan wrote:
>>>>> Li,
>>>>>
>>>>> it would be great if you can send me information about your -stable queue,
>>>>> ie how you maintain it and where it is located. This will enable me to
>>>>> continue testing the stable queue for the 3.4 kernel.
>>>>>
>>>>
>>>> Thanks for testing LTS kernels!
>>>>
>>>> This is my 3.4.y git tree:
>>>>
>>>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y.git/
>>>>
>>>> And this is the patch queue:
>>>>
>>>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y-queue.git/
>>>>
>>>> I use quilt. When I've added some patches to 3.4.y, I'll update this
>>>> queue. The patches and series file are under /patches. Currently there's
>>>> already a patch in the queue.
>>>>
>>>> When I release a new version, I'll clean up the queue by removing all
>>>> the files under /patches.
>>>>
>>>> Hope this is all the information you need. Please tell me if you need
>>>> me to slightly adjust my workflow so it's easier for you.
>>>
>>> It almost works.
>>>
>>> Problem is that the pending patch got converted to use <crlf> instead of
>>> just <lf> as common in Linux. When I try to apply it with "git quiltimport",
>>> it bails out with "trailing whitespace" errors. "git am" with the individual
>>> patch works fine for some reason, though.
>>>
>>> I can try to find a workaround, but it would be better to have the file in
>>> linux file format to start with. Would that be possible ?
>>>
>>
>> Yeah, I've fixed it. I'll run dos2unix for every patch file.
>>
> 
> Yes, it now works. Wonder how it comes that the patch in Greg's patch tree
> didn't have the problem. Any idea ?
> 

I saved the patch from my email client, and it turned out the lines
ended with CRLF.
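
For the record, the conversion itself is trivial; here is a minimal dos2unix-style filter in C, shown only to illustrate what the fix does. The real dos2unix tool is more careful (it strips CR only as part of a CRLF pair), while this sketch drops every CR:

#include <stdio.h>

int main(void)
{
	int c;

	while ((c = getchar()) != EOF)
		if (c != '\r')	/* drop CR so CRLF line endings become LF */
			putchar(c);
	return 0;
}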


Re: [kernel.org PATCH] Li Zefan is now the 3.4 stable maintainer

2014-09-05 Thread Li Zefan
>>> Li,
>>>
>>> it would be great if you can send me information about your -stable queue,
>>> ie how you maintain it and where it is located. This will enable me to
>>> continue testing the stable queue for the 3.4 kernel.
>>>
>>
>> Thanks for testing LTS kernels!
>>
>> This is my 3.4.y git tree:
>>
>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y.git/
>>
>> And this is the patch queue:
>>
>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y-queue.git/
>>
>> I use quilt. When I've added some patches to 3.4.y, I'll update this
>> queue. The patches and series file are under /patches. Currently there's
>> already a patch in the queue.
>>
>> When I release a new version, I'll clean up the queue by removing all
>> the files under /patches.
>>
>> Hope this is all the information you need. Please tell me if you need
>> me to slightly adjust my workflow so it's easier for you.
> 
> It almost works.
> 
> Problem is that the pending patch got converted to use <crlf> instead of
> just <lf> as common in Linux. When I try to apply it with "git quiltimport",
> it bails out with "trailing whitespace" errors. "git am" with the individual
> patch works fine for some reason, though.
> 
> I can try to find a workaround, but it would be better to have the file in
> linux file format to start with. Would that be possible ?
> 

Yeah, I've fixed it. I'll run dos2unix for every patch file.


Re: [kernel.org PATCH] Li Zefan is now the 3.4 stable maintainer

2014-09-04 Thread Li Zefan
Hi Guenter,

Sorry for my late reply.

On 2014/8/27 12:59, Guenter Roeck wrote:
> On Tue, Aug 26, 2014 at 04:08:58PM -0700, Greg KH wrote:
>> Li has agreed to continue to support the 3.4 stable kernel tree until
>> September 2016.  Update the releases.html page on kernel.org to reflect
>> this.
>>
> Li,
> 
> it would be great if you can send me information about your -stable queue,
> ie how you maintain it and where it is located. This will enable me to
> continue testing the stable queue for the 3.4 kernel.
> 

Thanks for testing LTS kernels!

This is my 3.4.y git tree:

https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y.git/

And this is the patch queue:

https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y-queue.git/

I use quilt. When I've added some patches to 3.4.y, I'll update this
queue. The patches and series file are under /patches. Currently there's
already a patch in the queue.

When I release a new version, I'll clean up the queue by removing all
the files under /patches.

Hope this is all the information you need. Please tell me if you need
me to slightly adjust my workflow so it's easier for you.


[PATCH v2 2/2] cgroup: check cgroup liveliness before unbreaking kernfs

2014-09-04 Thread Li Zefan
When cgroup_kn_lock_live() is called through some kernfs operation and
another thread is calling cgroup_rmdir(), we'll trigger the warning in
cgroup_get().

[ cut here ]
WARNING: CPU: 1 PID: 1228 at kernel/cgroup.c:1034 cgroup_get+0x89/0xa0()
...
Call Trace:
 [<c16ee73d>] dump_stack+0x41/0x52
 [<c10468ef>] warn_slowpath_common+0x7f/0xa0
 [<c104692d>] warn_slowpath_null+0x1d/0x20
 [<c10bb999>] cgroup_get+0x89/0xa0
 [<c10bbe58>] cgroup_kn_lock_live+0x28/0x70
 [<c10be3c1>] __cgroup_procs_write.isra.26+0x51/0x230
 [<c10be5b2>] cgroup_tasks_write+0x12/0x20
 [<c10bb7b0>] cgroup_file_write+0x40/0x130
 [<c11aee71>] kernfs_fop_write+0xd1/0x160
 [<c1148e58>] vfs_write+0x98/0x1e0
 [<c114934d>] SyS_write+0x4d/0xa0
 [<c16f656b>] sysenter_do_call+0x12/0x12
---[ end trace 6f2e0c38c2108a74 ]---

Fix this by calling css_tryget() instead of cgroup_get().
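
The tryget idea, reduced to a self-contained userspace sketch with C11 atomics. The names are hypothetical and the kernel's css_tryget() is actually built on percpu_ref rather than a plain counter, but the semantics illustrated are the same: a reference may be taken only while the refcount is still above zero.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only while the refcount is still above zero;
 * once it has hit zero the object is being torn down and must not
 * be pinned again. */
static bool tryget(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old > 0)
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;	/* reference acquired */
	return false;			/* object is dying; back off */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live:  %d\n", tryget(&live));	/* 1: safe to use */
	printf("dying: %d\n", tryget(&dying));	/* 0: caller must bail out */
	return 0;
}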

v2:
- move cgroup_tryget() right below cgroup_get() definition. (Tejun)

Cc: sta...@vger.kernel.org # 3.15+
Reported-by: Toralf Förster <toralf.foers...@gmx.de>
Signed-off-by: Zefan Li <lize...@huawei.com>
---
 kernel/cgroup.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 709a6a0..51dd46e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1031,6 +1031,11 @@ static void cgroup_get(struct cgroup *cgrp)
css_get(&cgrp->self);
 }
 
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+   return css_tryget(&cgrp->self);
+}
+
 static void cgroup_put(struct cgroup *cgrp)
 {
css_put(&cgrp->self);
@@ -1091,7 +1096,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
 * protection against removal.  Ensure @cgrp stays accessible and
 * break the active_ref protection.
 */
-   cgroup_get(cgrp);
+   if (!cgroup_tryget(cgrp))
+   return NULL;
kernfs_break_active_protection(kn);
 
mutex_lock(&cgroup_mutex);
-- 
1.8.0.2


[PATCH v2 1/2] cgroup: delay the clearing of cgrp->kn->priv

2014-09-04 Thread Li Zefan
Run these two scripts concurrently:

for ((; ;))
{
mkdir /cgroup/sub
rmdir /cgroup/sub
}

for ((; ;))
{
echo $$ > /cgroup/sub/cgroup.procs
echo $$ > /cgroup/cgroup.procs
}

A kernel bug will be triggered:

BUG: unable to handle kernel NULL pointer dereference at 0038
IP: [<c10bbd69>] cgroup_put+0x9/0x80
...
Call Trace:
 [<c10bbe19>] cgroup_kn_unlock+0x39/0x50
 [<c10bbe91>] cgroup_kn_lock_live+0x61/0x70
 [<c10be3c1>] __cgroup_procs_write.isra.26+0x51/0x230
 [<c10be5b2>] cgroup_tasks_write+0x12/0x20
 [<c10bb7b0>] cgroup_file_write+0x40/0x130
 [<c11aee71>] kernfs_fop_write+0xd1/0x160
 [<c1148e58>] vfs_write+0x98/0x1e0
 [<c114934d>] SyS_write+0x4d/0xa0
 [<c16f656b>] sysenter_do_call+0x12/0x12

We clear cgrp->kn->priv at the end of cgroup_rmdir(), but another
concurrent thread can access kn->priv after the clearing.

We should move the clearing to css_release_work_fn(). At that time,
no one is holding a reference to the cgroup and no one can gain a new
reference to access it.
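
Put differently, the ordering does the work: the backpointer is cleared only after the refcount can no longer rise again. A self-contained userspace sketch with C11 atomics (hypothetical names; the kernel additionally relies on RCU, so a reader may still safely dereference the stale pointer until a grace period elapses):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj { atomic_int ref; };

static _Atomic(struct obj *) backptr;	/* plays the role of kn->priv */

/* Take a reference only if the object is not already dying. */
static bool tryget(struct obj *o)
{
	int old = atomic_load(&o->ref);

	while (old > 0)
		if (atomic_compare_exchange_weak(&o->ref, &old, old + 1))
			return true;
	return false;
}

/* Release path: runs only once the last reference is gone, so no
 * reader can re-take a reference between the clear and the free.
 * (The kernel also waits for an RCU grace period before freeing.) */
static void release(struct obj *o)
{
	atomic_store(&backptr, NULL);
	free(o);
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->ref, 0);		/* refcount already hit zero */
	atomic_store(&backptr, o);

	struct obj *r = atomic_load(&backptr);	/* reader's view */
	if (r && tryget(r))
		printf("reader got a reference\n");
	else
		printf("reader backs off: object dying or pointer cleared\n");

	release(o);
	return 0;
}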

v2:
- move RCU_INIT_POINTER() into the else block. (Tejun)
- remove the cgroup_parent() check. (Tejun)
- update the comment in css_tryget_online_from_dir().

Cc: sta...@vger.kernel.org # 3.15+
Reported-by: Toralf Förster <toralf.foers...@gmx.de>
Signed-off-by: Zefan Li <lize...@huawei.com>
---
 kernel/cgroup.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c56924..205f793 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4181,6 +4181,15 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
+
+   /*
+* There are two control paths which try to determine
+* cgroup from dentry without going through kernfs -
+* cgroupstats_build() and css_tryget_online_from_dir().
+* Those are supported by RCU protecting clearing of
+* cgrp->kn->priv backpointer.
+*/
+   RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
}
 
mutex_unlock(&cgroup_mutex);
@@ -4601,16 +4610,6 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
cgroup_kn_unlock(kn);
 
-   /*
-* There are two control paths which try to determine cgroup from
-* dentry without going through kernfs - cgroupstats_build() and
-* css_tryget_online_from_dir().  Those are supported by RCU
-* protecting clearing of cgrp->kn->priv backpointer, which should
-* happen after all files under it have been removed.
-*/
-   if (!ret)
-   RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
cgroup_put(cgrp);
return ret;
 }
@@ -5175,7 +5174,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
/*
 * This path doesn't originate from kernfs and @kn could already
 * have been or be removed at any point.  @kn->priv is RCU
-* protected for this access.  See cgroup_rmdir() for details.
+* protected for this access.  See css_release_work_fn() for details.
 */
cgrp = rcu_dereference(kn->priv);
if (cgrp)
-- 
1.8.0.2


Re: [PATCH 1/2] cgroup: Delay the clearing of cgrp->kn->priv

2014-09-03 Thread Li Zefan
On 2014/9/2 23:33, Tejun Heo wrote:
> Hello, Li.
> 
> On Tue, Sep 02, 2014 at 06:56:58PM +0800, Li Zefan wrote:
>> for ((; ;))
>> {
>> echo $$ > /cgroup/sub/cgroup.procs
>> ech $$ > /cgce 6f2e0c38c2108a74 ]---
>   
>   copy & paste error?
> ...

oops

>> Reported-by: Toralf Förster <toralf.foers...@gmx.de>
>> Signed-off-by: Li Zefan <lize...@huawei.com>
>> ---
>>
>> Toralf, Thanks for reporting the bug. I'm not able to reply to your email,
>> because I was kicked out of the cgroup mailing list so didn't receive
>> emails from mailing list for a week.
>>
>> ---
>>  kernel/cgroup.c | 19 +--
>>  1 file changed, 9 insertions(+), 10 deletions(-)
>>
>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> index 1c56924..e03fc62 100644
>> --- a/kernel/cgroup.c
>> +++ b/kernel/cgroup.c
>> @@ -4185,6 +4185,15 @@ static void css_release_work_fn(struct work_struct *work)
>>  
>>  mutex_unlock(&cgroup_mutex);
>>  
>> +/*
>> + * There are two control paths which try to determine cgroup from
>> + * dentry without going through kernfs - cgroupstats_build() and
>> + * css_tryget_online_from_dir().  Those are supported by RCU
>> + * protecting clearing of cgrp->kn->priv backpointer.
>> + */
>> +if (!ss && cgroup_parent(cgrp))
>> +RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
> 
> Can we move the above into the preceding else block?  I don't think
> holding cgroup_mutex or not makes any difference here. 

> Also, why do
> we need the cgroup_parent() check?  Do we deref root's kn->priv in the
> destruction path?  If so, can you please note that in the comment?
> 

I think the check is not necessary. I was trying to keep the diff small
relative to the original code, and RCU_INIT_POINTER() was in cgroup_rmdir(),
which won't be called on the root cgroup.


[PATCH 2/2] cgroup: check cgroup liveliness before unbreaking kernfs protection

2014-09-02 Thread Li Zefan
When cgroup_kn_lock_live() is called through some kernfs operation and
another thread is calling cgroup_rmdir(), we may trigger the warning in
cgroup_get().
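
This can be reproduced with the same kind of racing loops as in the
companion patch 1/2 (a sketch; it assumes a cgroup hierarchy mounted
at /cgroup):

  # terminal 1
  while :; do mkdir /cgroup/sub; rmdir /cgroup/sub; done

  # terminal 2
  while :; do echo $$ > /cgroup/sub/cgroup.procs; done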

[ cut here ]
WARNING: CPU: 1 PID: 1228 at kernel/cgroup.c:1034 cgroup_get+0x89/0xa0()
...
Call Trace:
 [<c16ee73d>] dump_stack+0x41/0x52
 [<c10468ef>] warn_slowpath_common+0x7f/0xa0
 [<c104692d>] warn_slowpath_null+0x1d/0x20
 [<c10bb999>] cgroup_get+0x89/0xa0
 [<c10bbe58>] cgroup_kn_lock_live+0x28/0x70
 [<c10be3c1>] __cgroup_procs_write.isra.26+0x51/0x230
 [<c10be5b2>] cgroup_tasks_write+0x12/0x20
 [<c10bb7b0>] cgroup_file_write+0x40/0x130
 [<c11aee71>] kernfs_fop_write+0xd1/0x160
 [<c1148e58>] vfs_write+0x98/0x1e0
 [<c114934d>] SyS_write+0x4d/0xa0
 [<c16f656b>] sysenter_do_call+0x12/0x12
---[ end trace 6f2e0c38c2108a74 ]---

Fix this by calling css_tryget() instead of cgroup_get().

Reported-by: Toralf Förster 
Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e03fc62..c8d07e5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1025,6 +1025,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
 }
 
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+   return css_tryget(&cgrp->self);
+}
+
 static void cgroup_get(struct cgroup *cgrp)
 {
WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -1091,7 +1096,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
 * protection against removal.  Ensure @cgrp stays accessible and
 * break the active_ref protection.
 */
-   cgroup_get(cgrp);
+   if (!cgroup_tryget(cgrp))
+   return NULL;
kernfs_break_active_protection(kn);
 
mutex_lock(&cgroup_mutex);
-- 
1.8.0.2


[PATCH 1/2] cgroup: Delay the clearing of cgrp->kn->priv

2014-09-02 Thread Li Zefan
Run these two scripts concurrently:

for ((; ;))
{
mkdir /cgroup/sub
rmdir /cgroup/sub
}

for ((; ;))
{
echo $$ > /cgroup/sub/cgroup.procs
ech $$ > /cgce 6f2e0c38c2108a74 ]---
}

A kernel bug will be triggered:

BUG: unable to handle kernel NULL pointer dereference at 0038
IP: [<c10bbd69>] cgroup_put+0x9/0x80
...
Call Trace:
 [<c10bbe19>] cgroup_kn_unlock+0x39/0x50
 [<c10bbe91>] cgroup_kn_lock_live+0x61/0x70
 [<c10be3c1>] __cgroup_procs_write.isra.26+0x51/0x230
 [<c10be5b2>] cgroup_tasks_write+0x12/0x20
 [<c10bb7b0>] cgroup_file_write+0x40/0x130
 [<c11aee71>] kernfs_fop_write+0xd1/0x160
 [<c1148e58>] vfs_write+0x98/0x1e0
 [<c114934d>] SyS_write+0x4d/0xa0
 [<c16f656b>] sysenter_do_call+0x12/0x12

We clear cgrp->kn->priv at the end of cgroup_rmdir(), but another
concurrent thread can access kn->priv after the clearing.

We should move the clearing to css_release_work_fn(). At that time
no one is holding a reference to the cgroup, and no one can gain a new
reference to access it.

Reported-by: Toralf Förster 
Signed-off-by: Li Zefan 
---

Toralf, Thanks for reporting the bug. I'm not able to reply to your email,
because I was kicked out of the cgroup mailing list so didn't receive
emails from mailing list for a week.

---
 kernel/cgroup.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c56924..e03fc62 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4185,6 +4185,15 @@ static void css_release_work_fn(struct work_struct *work)
 
mutex_unlock(&cgroup_mutex);
 
+   /*
+* There are two control paths which try to determine cgroup from
+* dentry without going through kernfs - cgroupstats_build() and
+* css_tryget_online_from_dir().  Those are supported by RCU
+* protecting clearing of cgrp->kn->priv backpointer.
+*/
+   if (!ss && cgroup_parent(cgrp))
+   RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+
call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
 
@@ -4601,16 +4610,6 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
cgroup_kn_unlock(kn);
 
-   /*
-* There are two control paths which try to determine cgroup from
-* dentry without going through kernfs - cgroupstats_build() and
-* css_tryget_online_from_dir().  Those are supported by RCU
-* protecting clearing of cgrp->kn->priv backpointer, which should
-* happen after all files under it have been removed.
-*/
-   if (!ret)
-   RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
cgroup_put(cgrp);
return ret;
 }
-- 
1.8.0.2



Re: [PATCH V2] cgroup: Introduce cgroup_detach_task().

2014-08-25 Thread Li Zefan
On 2014/8/25 23:00, Dongsheng Yang wrote:
> On Mon, Aug 25, 2014 at 10:47 PM, Tejun Heo  wrote:
>> On Mon, Aug 25, 2014 at 10:46:03PM +0800, Dongsheng Yang wrote:
>>> My point here is that attaching and detaching are a pair of operations.
>>
>> There is no detaching from a cgroup.  A task is always attached to a
>> cgroup whether that's a root or non-root cgroup.
> 
> Okey, I should not think it as attaching and detaching. Just treat them as
> a move between root and non-root cgroup.
> 
> It sounds reasonable to me now.
> 

I have to explain this to other people from time to time.



Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-14 Thread Li Zefan
On 2014/8/12 5:05, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
> 
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1963,7 +1963,7 @@ zonelist_scan:
>>>  
>>> /*
>>>  * Scan zonelist, looking for a zone with enough free.
>>> -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
>>> +* See __cpuset_node_allowed() comment in kernel/cpuset.c.
>>>  */
>>> for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>> high_zoneidx, nodemask) {
>>> @@ -1974,7 +1974,7 @@ zonelist_scan:
>>> continue;
>>> if (cpusets_enabled() &&
>>> (alloc_flags & ALLOC_CPUSET) &&
>>> -   !cpuset_zone_allowed_softwall(zone, gfp_mask))
>>> +   !cpuset_zone_allowed(zone, gfp_mask))
>>> continue;
>>
>> So, this is get_page_from_freelist. It's called from
>> __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit
>> set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET
>> bit set only for __GFP_WAIT allocations. That said, w/o your patch we
>> try to respect cpusets for all allocations, including atomic, and only
>> ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT
>> allocations, while with your patch we always ignore cpusets for
>> !__GFP_WAIT allocations. Not sure if it really matters though, because
>> usually one uses cpuset.mems in conjunction with cpuset.cpus and it
>> won't make any difference then. It also doesn't conflict with any cpuset
>> documentation.
>>
> 
> Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this.  

I'm not quite sure. That code was there before I got involved in cpuset.

> The only thing that we get by falling back to the page allocator slowpath 
> is that kswapd gets woken up before the allocation is attempted without 
> ALLOC_CPUSET.  It seems pointless to wakeup kswapd when the allocation can 
> succeed on any node.  Even with the patch, if the allocation fails because 
> all nodes are below their min watermark, then we still fallback to the 
> slowpath and wake up kswapd but there's nothing much else we can do 
> because it's !__GFP_WAIT.
> .

I tend to agree with you. But if we want to do this, we should split this
change from the cleanup.

Regarding the cleanup, I found there used to be a single cpuset_node_allowed(),
and your cleanup is exactly a revert of that ancient commit:

commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6
Author: Paul Jackson 
Date:   Wed Dec 13 00:34:25 2006 -0800

[PATCH] cpuset: rework cpuset_zone_allowed api

Seems the major intention was to avoid accidental sleep-in-atomic bugs, because
callback_mutex might be held.

I don't see any reason callback_mutex can't be a spinlock. I thought
about this when Gu Zhen fixed the bug that callback_mutex is nested inside
rcu_read_lock().

--
 kernel/cpuset.c | 81 ++---
 1 file changed, 49 insertions(+), 32 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index baa155c..9d9e239 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
+   unsigned long flags;
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
@@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
continue;
rcu_read_unlock();
 
-   mutex_lock(&callback_mutex);
+   spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cp->effective_cpus, new_cpus);
-   mutex_unlock(&callback_mutex);
+   spin_unlock_irqrestore(&callback_lock, flags);
 
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  const char *buf)
 {
+   unsigned long flags;
int retval;
 
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
 
-   mutex_lock(&callback_mutex);
+   spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-   mutex_unlock(&callback_mutex);
+   spin_unlock_irqrestore(&callback_lock, flags);


[PATCH] cpuset: fix the WARN_ON() in update_nodemasks_hier()

2014-07-30 Thread Li Zefan
The WARN_ON() is used to check if we break the legacy hierarchy, on
which the effective mems should be equal to the configured mems.

Reported-by: Mike Qiu 
Tested-by: Mike Qiu 
Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 53a9bbf..baa155c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1136,7 +1136,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
mutex_unlock(&callback_mutex);
 
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
-   nodes_equal(cp->mems_allowed, cp->effective_mems));
+   !nodes_equal(cp->mems_allowed, cp->effective_mems));
 
update_tasks_nodemask(cp);
 
-- 
1.8.0.2



Re: WARNING: at kernel/cpuset.c:1139

2014-07-29 Thread Li Zefan
On 2014/7/29 3:20, Tejun Heo wrote:
> On Thu, Jul 24, 2014 at 08:27:40AM +0800, Li Zefan wrote:
>> On 2014/7/23 23:12, Tejun Heo wrote:
>>> On Wed, Jul 23, 2014 at 10:50:29AM +0800, Mike Qiu wrote:
>>>> commit 734d45130cb ("cpuset: update cs->effective_{cpus, mems} when config
>>>> changes") introduce the below warning in my server.
>>>>
>>>> [   35.652137] [ cut here ]
>>>> [   35.652141] WARNING: at kernel/cpuset.c:1139
>>>
>>> Hah, can you reproduce it?  If so, can you detail how?
>>>
>>
>> It's a typo.
>>
>> WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
>>  nodes_equal(cp->mems_allowed, cp->effective_mems));
>>
>> should be
>>
>> WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
>>  !nodes_equal(cp->mems_allowed, cp->effective_mems));
> 
> Care to post a patch?
> 

Sorry for the delay. I was out of the office for the last two weeks.
I'll do this tomorrow.



Re: WARNING: at kernel/cpuset.c:1139

2014-07-23 Thread Li Zefan
On 2014/7/23 23:12, Tejun Heo wrote:
> On Wed, Jul 23, 2014 at 10:50:29AM +0800, Mike Qiu wrote:
>> commit 734d45130cb ("cpuset: update cs->effective_{cpus, mems} when config
>> changes") introduce the below warning in my server.
>>
>> [   35.652137] [ cut here ]
>> [   35.652141] WARNING: at kernel/cpuset.c:1139
> 
> Hah, can you reproduce it?  If so, can you detail how?
> 

It's a typo.

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
nodes_equal(cp->mems_allowed, cp->effective_mems));

should be

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));



Re: [PATCHSET v2 cgroup/for-3.17] cgroup: distinguish the default and legacy hierarchies when handling cftypes

2014-07-15 Thread Li Zefan
On 2014/7/14 23:44, Tejun Heo wrote:
> Hello,
> 
> This is v2 of dfl_files patchset.  Changes from the last version [1]
> are
> 
> * Rebased on top of cgroup/for-3.17.
> 
> * 0004 updated so that CFTYPE_ONLY_ON_DFL and CFTYPE_INSANE are
>   cleared when cfts are removed as suggested by Li.
> 
> Until now, cftype arrays carried files for both the default and legacy
> hierarchies and the files which needed to be used on only one of them
> were flagged with either CFTYPE_ONLY_ON_DFL or CFTYPE_INSANE.  This
> gets confusing very quickly and we may end up exposing interface files
> to the default hierarchy without thinking it through.
> 
> This patchset makes cgroup core provide separate sets of interfaces
> for cftype handling so that the cftypes for the default and legacy
> hierarchies are clearly distinguished.  This makes all the existing
> subsystem interface files legacy-only by default and all subsystems
> will have no interface file created when enabled on the default
> hierarchy.  Each subsystem should explicitly review and compose the
> interface for the default hierarchy.
> 
> This patchset contains the following six patches.
> 
>  0001-cgroup-split-cgroup_base_files-into-cgroup_-dfl-lega.patch
>  0002-cgroup-rename-cgroup_subsys-base_cftypes-to-legacy_c.patch
>  0003-cgroup-replace-cgroup_add_cftypes-with-cgroup_add_le.patch
>  0004-cgroup-distinguish-the-default-and-legacy-hierarchie.patch
>  0005-cgroup-make-CFTYPE_ONLY_ON_DFL-and-CFTYPE_NO_-intern.patch
>  0006-cgroup-initialize-cgrp_dfl_root_inhibit_ss_mask-from.patch
> 
> This patchset is on top of afd1a8b3e0bc ("cpuset: export effective
> masks to userspace")
> 
> and available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-dfl_files-v2
> 
> diffstat follows.  Thanks.
> 
>  Documentation/cgroups/unified-hierarchy.txt |   18 ++-
>  block/blk-cgroup.c  |5
>  include/linux/cgroup.h  |   17 ++
>  kernel/cgroup.c |  160 
> +---
>  kernel/cgroup_freezer.c |2
>  kernel/cpuset.c |2
>  kernel/sched/core.c |2
>  kernel/sched/cpuacct.c  |2
>  mm/hugetlb_cgroup.c |5
>  mm/memcontrol.c |6 -
>  net/core/netclassid_cgroup.c|2
>  net/core/netprio_cgroup.c   |2
>  net/ipv4/tcp_memcontrol.c   |2
>  security/device_cgroup.c|2
>  14 files changed, 160 insertions(+), 67 deletions(-)
> 

Acked-by: Li Zefan 



Re: [PATCH 4/5] cgroup: distinguish the default and legacy hierarchies when handling cftypes

2014-07-13 Thread Li Zefan
> @@ -3085,8 +3091,37 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
>   return ret;
>  }
>  
> +/**
> + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
> + * @ss: target cgroup subsystem
> + * @cfts: zero-length name terminated array of cftypes
> + *
> + * Similar to cgroup_add_cftypes() but the added files are only used for
> + * the default hierarchy.
> + */
> +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
> +{
> + struct cftype *cft;
> +
> + for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
> + cft->flags |= CFTYPE_ONLY_ON_DFL;

I think we should remove this flag in cgroup_rm_cftypes_locked(). Otherwise
if we call cgroup_add_dfl_cftypes() and then cgroup_rm_cftypes() and then
cgroup_add_legacy_cftypes() for the same @cfts, both CFTYPE_ONLY_ON_DFL and
CFTYPE_INSANE are set.

> + return cgroup_add_cftypes(ss, cfts);
> +}
> +
> +/**
> + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
> + * @ss: target cgroup subsystem
> + * @cfts: zero-length name terminated array of cftypes
> + *
> + * Similar to cgroup_add_cftypes() but the added files are only used for
> + * the legacy hierarchies.
> + */
>  int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
>  {
> + struct cftype *cft;
> +
> + for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
> + cft->flags |= CFTYPE_INSANE;
>   return cgroup_add_cftypes(ss, cfts);
>  }



[PATCH v3 09/12] cpuset: refactor cpuset_hotplug_update_tasks()

2014-07-09 Thread Li Zefan
We mix the handling for both the default hierarchy and the legacy hierarchy
in the same function, and it's quite messy, so split it into two functions.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 121 ++--
 1 file changed, 66 insertions(+), 55 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4b409d2..41822e2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2080,6 +2080,65 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
 }
 
+static void hotplug_update_tasks_legacy(struct cpuset *cs,
+   struct cpumask *off_cpus,
+   nodemask_t *off_mems)
+{
+   bool is_empty;
+
+   mutex_lock(&callback_mutex);
+   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, off_cpus);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
+   nodes_andnot(cs->mems_allowed, cs->mems_allowed, *off_mems);
+   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   mutex_unlock(&callback_mutex);
+
+   /*
+* Don't call update_tasks_cpumask() if the cpuset becomes empty,
+* as the tasks will be migrated to an ancestor.
+*/
+   if (!cpumask_empty(off_cpus) && !cpumask_empty(cs->cpus_allowed))
+   update_tasks_cpumask(cs);
+   if (!nodes_empty(*off_mems) && !nodes_empty(cs->mems_allowed))
+   update_tasks_nodemask(cs);
+
+   is_empty = cpumask_empty(cs->cpus_allowed) ||
+  nodes_empty(cs->mems_allowed);
+
+   mutex_unlock(&cpuset_mutex);
+
+   /*
+* Move tasks to the nearest ancestor with execution resources. This
+* is a full cgroup operation which will also call back into
+* cpuset. Should be done outside any lock.
+*/
+   if (is_empty)
+   remove_tasks_in_empty_cpuset(cs);
+
+   mutex_lock(&cpuset_mutex);
+}
+
+static void hotplug_update_tasks(struct cpuset *cs,
+struct cpumask *off_cpus,
+nodemask_t *off_mems)
+{
+   mutex_lock(&callback_mutex);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
+   if (cpumask_empty(cs->effective_cpus))
+   cpumask_copy(cs->effective_cpus,
+parent_cs(cs)->effective_cpus);
+
+   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   if (nodes_empty(cs->effective_mems))
+   cs->effective_mems = parent_cs(cs)->effective_mems;
+   mutex_unlock(&callback_mutex);
+
+   if (!cpumask_empty(off_cpus))
+   update_tasks_cpumask(cs);
+   if (!nodes_empty(*off_mems))
+   update_tasks_nodemask(cs);
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -2092,9 +2151,6 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
static cpumask_t off_cpus;
static nodemask_t off_mems;
-   bool is_empty;
-   bool on_dfl = cgroup_on_dfl(cs->css.cgroup);
-
 retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2109,61 +2165,16 @@ retry:
goto retry;
}
 
-   cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
-   nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
-   mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-
-   /* Inherit the effective mask of the parent, if it becomes empty. */
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
-   if (on_dfl && cpumask_empty(cs->effective_cpus))
-   cpumask_copy(cs->effective_cpus, parent_cs(cs)->effective_cpus);
-   mutex_unlock(&callback_mutex);
-
-   /*
-* If on_dfl, we need to update tasks' cpumask for empty cpuset to
-* take on ancestor's cpumask. Otherwise, don't call
-* update_tasks_cpumask() if the cpuset becomes empty, as the tasks
-* in it will be migrated to an ancestor.
-*/
-   if ((on_dfl && cpumask_empty(cs->cpus_allowed)) ||
-   (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))
-   update_tasks_cpumask(cs);
-
-   mutex_lock(&callback_mutex);
-   nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+   cpumask_andnot(&off_cpus, cs->effective_cpus,
+  top_cpuset.effective_cpus);
+   nodes_andnot(off_mems, cs->effective_mems, top_cpuset.effective_mems);
 
-   /* Inherit the effective mask of the parent, if it becomes empty */
-   nodes_andnot(cs->effective_mems, cs->effective_mems, off_mems);
-   if (on_dfl && nodes_empty(cs->effective_mems))
-   cs->effective_mems = parent_cs(cs)->effective_mems;

[PATCH v3 10/12] cpuset: enable onlined cpu/node in effective masks

2014-07-09 Thread Li Zefan
First, offline cpu1:

  # echo 0-1 > cpuset.cpus
  # echo 0 > /sys/devices/system/cpu/cpu1/online
  # cat cpuset.cpus
  0-1
  # cat cpuset.effective_cpus
  0

Then online it:

  # echo 1 > /sys/devices/system/cpu/cpu1/online
  # cat cpuset.cpus
  0-1
  # cat cpuset.effective_cpus
  0-1

And cpuset will bring it back to the effective mask.

The implementation is quite straightforward. Instead of calculating the
offlined cpus/mems and doing updates, we just set the new effective_mask
to online_mask & configured_mask.

This is a behavior change for default hierarchy, so legacy hierarchy
won't be affected.

v2:
- make the refactoring of cpuset_hotplug_update_tasks() a separate patch,
  suggested by Tejun.
- make hotplug_update_tasks_insane() use @new_cpus and @new_mems as
  hotplug_update_tasks_sane() does.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 65 -
 1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 41822e2..c47cb94 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2080,26 +2080,27 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
 }
 
-static void hotplug_update_tasks_legacy(struct cpuset *cs,
-   struct cpumask *off_cpus,
-   nodemask_t *off_mems)
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+   struct cpumask *new_cpus, nodemask_t *new_mems,
+   bool cpus_updated, bool mems_updated)
 {
bool is_empty;
 
mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, off_cpus);
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
-   nodes_andnot(cs->mems_allowed, cs->mems_allowed, *off_mems);
-   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   cpumask_copy(cs->cpus_allowed, new_cpus);
+   cpumask_copy(cs->effective_cpus, new_cpus);
+   cs->mems_allowed = *new_mems;
+   cs->effective_mems = *new_mems;
mutex_unlock(&callback_mutex);
 
/*
 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
 * as the tasks will be migrated to an ancestor.
 */
-   if (!cpumask_empty(off_cpus) && !cpumask_empty(cs->cpus_allowed))
+   if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
-   if (!nodes_empty(*off_mems) && !nodes_empty(cs->mems_allowed))
+   if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
 
is_empty = cpumask_empty(cs->cpus_allowed) ||
@@ -2118,24 +2119,24 @@ static void hotplug_update_tasks_legacy(struct cpuset *cs,
mutex_lock(&cpuset_mutex);
 }
 
-static void hotplug_update_tasks(struct cpuset *cs,
-struct cpumask *off_cpus,
-nodemask_t *off_mems)
+static void
+hotplug_update_tasks(struct cpuset *cs,
+struct cpumask *new_cpus, nodemask_t *new_mems,
+bool cpus_updated, bool mems_updated)
 {
+   if (cpumask_empty(new_cpus))
+   cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+   if (nodes_empty(*new_mems))
+   *new_mems = parent_cs(cs)->effective_mems;
+
mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
-   if (cpumask_empty(cs->effective_cpus))
-   cpumask_copy(cs->effective_cpus,
-parent_cs(cs)->effective_cpus);
-
-   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
-   if (nodes_empty(cs->effective_mems))
-   cs->effective_mems = parent_cs(cs)->effective_mems;
+   cpumask_copy(cs->effective_cpus, new_cpus);
+   cs->effective_mems = *new_mems;
mutex_unlock(&callback_mutex);
 
-   if (!cpumask_empty(off_cpus))
+   if (cpus_updated)
update_tasks_cpumask(cs);
-   if (!nodes_empty(*off_mems))
+   if (mems_updated)
update_tasks_nodemask(cs);
 }
 
@@ -2149,8 +2150,10 @@ static void hotplug_update_tasks(struct cpuset *cs,
  */
 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
-   static cpumask_t off_cpus;
-   static nodemask_t off_mems;
+   static cpumask_t new_cpus;
+   static nodemask_t new_mems;
+   bool cpus_updated;
+   bool mems_updated;
 retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2165,14 +2168,18 @@ retry:
goto retry;
}
 
-   cpumask_andnot(&off_cpus, cs->effective_cpus,
-  top_cpuset.effective_cpus);
-   nodes_andnot(off_mems, cs->effective_mems, top_cpuset.effective_mems);
+

[PATCH v3 12/12] cpuset: export effective masks to userspace

2014-07-09 Thread Li Zefan
cpuset.cpus and cpuset.mems are the configured masks, and we need
to export effective masks to userspace, so users know the real
cpus_allowed and mems_allowed that apply to the tasks in a cpuset.
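
For example (a sketch, assuming cpuset is mounted at /cpuset and a child
group "sub" exists; the two file names are the ones added by this patch):

  cat /cpuset/sub/cpuset.effective_cpus   # the CPUs the tasks actually run on
  cat /cpuset/sub/cpuset.effective_mems   # the memory nodes they actually use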

v2:
- export those masks unconditionally, suggested by Tejun.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 65878a7..53a9bbf 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1535,6 +1535,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+   FILE_EFFECTIVE_CPULIST,
+   FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1701,6 +1703,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+   case FILE_EFFECTIVE_CPULIST:
+   s += cpulist_scnprintf(s, count, cs->effective_cpus);
+   break;
+   case FILE_EFFECTIVE_MEMLIST:
+   s += nodelist_scnprintf(s, count, cs->effective_mems);
+   break;
default:
ret = -EINVAL;
goto out_unlock;
@@ -1786,6 +1794,18 @@ static struct cftype files[] = {
},
 
{
+   .name = "effective_cpus",
+   .seq_show = cpuset_common_seq_show,
+   .private = FILE_EFFECTIVE_CPULIST,
+   },
+
+   {
+   .name = "effective_mems",
+   .seq_show = cpuset_common_seq_show,
+   .private = FILE_EFFECTIVE_MEMLIST,
+   },
+
+   {
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
-- 
1.8.0.2



[PATCH v3 11/12] cpuset: allow writing offlined masks to cpuset.cpus/mems

2014-07-09 Thread Li Zefan
As the configured masks won't be limited by their parent, and the top
cpuset's masks won't change when hotplug happens, it's natural to
allow writing offlined masks to the configured masks.

If on default hierarchy:

# echo 0 > /sys/devices/system/cpu/cpu1/online
# mkdir /cpuset/sub
# echo 1 > /cpuset/sub/cpuset.cpus
# cat /cpuset/sub/cpuset.cpus
1

If on legacy hierarchy:

# echo 0 > /sys/devices/system/cpu/cpu1/online
# mkdir /cpuset/sub
# echo 1 > /cpuset/sub/cpuset.cpus
-bash: echo: write error: Invalid argument

Note the checks don't need to be gated by cgroup_on_dfl, because we've
initialized top_cpuset.{cpus,mems}_allowed accordingly in cpuset_bind().

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c47cb94..65878a7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -929,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
 
-   if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+   if (!cpumask_subset(trialcs->cpus_allowed,
+   top_cpuset.cpus_allowed))
return -EINVAL;
}
 
@@ -1186,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
 
if (!nodes_subset(trialcs->mems_allowed,
-   node_states[N_MEMORY])) {
-   retval =  -EINVAL;
+ top_cpuset.mems_allowed)) {
+   retval = -EINVAL;
goto done;
}
}
-- 
1.8.0.2



[PATCH v3 07/12] cpuset: apply cs->effective_{cpus,mems}

2014-07-09 Thread Li Zefan
Now we can use cs->effective_{cpus,mems} as the effective masks. They're
used whenever:

- we update tasks' cpus_allowed/mems_allowed,
- we want to retrieve task_cs(tsk)'s cpus_allowed/mems_allowed.

They actually replace effective_{cpu,node}mask_cpuset().

effective_mask == configured_mask & parent effective_mask, except when
the result is empty, in which case it inherits the parent effective_mask.
The result equals the mask computed from effective_{cpu,node}mask_cpuset().
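
For example, on the default hierarchy (a sketch; it assumes a 4-CPU
machine, cpuset mounted at /cpuset, and the cpuset.effective_cpus file
exported later in this series):

  mkdir -p /cpuset/parent/sub
  echo 0-1 > /cpuset/parent/cpuset.cpus
  echo 2-3 > /cpuset/parent/sub/cpuset.cpus      # disjoint from the parent
  cat /cpuset/parent/sub/cpuset.effective_cpus   # intersection is empty,
                                                 # so it inherits the parent's 0-1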

This won't affect the original legacy hierarchy, because in this case we
make sure the effective masks are always the same with user-configured
masks.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 83 ++---
 1 file changed, 14 insertions(+), 69 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e4c31e6..820870a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -313,9 +313,9 @@ static struct file_system_type cpuset_fs_type = {
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-   while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+   while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
-   cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+   cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
 /*
@@ -331,9 +331,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
-   while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+   while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
-   nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+   nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 }
 
 /*
@@ -795,45 +795,6 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
 }
 
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- *   if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
-   while (cpumask_empty(cs->cpus_allowed))
-   cs = parent_cs(cs);
-   return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- *   if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
-   while (nodes_empty(cs->mems_allowed))
-   cs = parent_cs(cs);
-   return cs;
-}
-
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -844,13 +805,12 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
  */
 static void update_tasks_cpumask(struct cpuset *cs)
 {
-   struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
 
css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next()))
-   set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+   set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end();
 }
 
@@ -988,15 +948,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
 {
struct task_struct *tsk = current;
-   struct cpuset *mems_cs;
 
tsk->mems_allowed = *to;
 
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
rcu_read_lock();
-   mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-   guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+   guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
 }
 
@@ -1065,13 +1023,12 @@ static void *cpuset_being_rebound;
 static void update_tasks_nodemask(struct cpuset *cs)
 {
static nodemask_t newmems;  /* protected by cpuset_mutex */
-   struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
 
  

[PATCH v3 08/12] cpuset: make cs->{cpus,mems}_allowed as user-configured masks

2014-07-09 Thread Li Zefan
Now that we use effective cpumasks to enforce the hierarchical behavior,
we can use cs->{cpus,mems}_allowed as the configured masks.

Configured masks can be changed by writing cpuset.cpus and cpuset.mems
only. The new behaviors are:

- They won't be changed by hotplug anymore.
- They won't be limited by its parent's masks.

This is a behavior change, but it won't take effect unless mounted with
sane_behavior.

v2:
- Add comments to explain the differences between configured masks and
effective masks.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 820870a..4b409d2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -77,6 +77,26 @@ struct cpuset {
 
unsigned long flags;/* "unsigned long" so bitops work */
 
+   /*
+* On default hierarchy:
+*
+* The user-configured masks can only be changed by writing to
+* cpuset.cpus and cpuset.mems, and won't be limited by the
+* parent masks.
+*
+* The effective masks are the real masks that apply to the tasks
+* in the cpuset. They may be changed if the configured masks are
+* changed or hotplug happens.
+*
+* effective_mask == configured_mask & parent's effective_mask,
+* and if it ends up empty, it will inherit the parent's mask.
+*
+*
+* On legacy hierachy:
+*
+* The user-configured masks are always the same with effective masks.
+*/
+
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
nodemask_t mems_allowed;
@@ -450,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct 
cpuset *trial)
 
par = parent_cs(cur);
 
-   /* We must be a subset of our parent cpuset */
+   /* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
-   if (!is_cpuset_subset(trial, par))
+   if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;
 
/*
@@ -2167,6 +2187,7 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+   bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
 
mutex_lock(&cpuset_mutex);
 
@@ -2174,13 +2195,14 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
 
-   cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-   mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+   cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+   mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
-   cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+   if (!on_dfl)
+   cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2189,7 +2211,8 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
mutex_lock(&callback_mutex);
-   top_cpuset.mems_allowed = new_mems;
+   if (!on_dfl)
+   top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
-- 
1.8.0.2



[PATCH v3 03/12] cpuset: update cs->effective_{cpus,mems} when config changes

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually the configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by the parent cpuset, while
the effective masks reflect cpu/memory hotplug and hierarchical restriction;
these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.
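
The calculation above can be sketched as follows (an illustration using
the kernel's cpumask helpers and a made-up function name, not code from
this patch; the empty-result fallback is only wired up by patch 04 of
this series):

        /* illustrative helper, not in the kernel */
        static void compute_effective_cpus(struct cpumask *eff,
                                           const struct cpumask *configured,
                                           const struct cpumask *parent_eff)
        {
                /* effective = configured & parent's effective mask ... */
                cpumask_and(eff, configured, parent_eff);
                /* ... and an empty result inherits the parent's mask */
                if (cpumask_empty(eff))
                        cpumask_copy(eff, parent_eff);
        }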

To make cs->effective_{cpus,mems} the effective masks, we need to
  - update the effective masks at hotplug
  - update the effective masks at config change
  - take on ancestor's mask when the effective mask is empty

The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().

This won't introduce behavior change.

v3:
- add a WARN_ON() to check that the effective masks are the same as the
  configured masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
  it. Similar change for update_nodemasks_hier(). Suggested by Tejun.

v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 88 +++--
 1 file changed, 54 insertions(+), 34 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 94f651d..da766c3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -855,36 +855,45 @@ static void update_tasks_cpumask(struct cpuset *cs)
 }
 
 /*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When the configured cpumask is changed, the effective cpumasks of this
+ * cpuset and all its descendants need to be updated.
  *
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
  *
  * Called with cpuset_mutex held
  */
-static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
 
rcu_read_lock();
-   cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-   if (cp == root_cs) {
-   if (!update_root)
-   continue;
-   } else {
-   /* skip the whole subtree if @cp have some CPU */
-   if (!cpumask_empty(cp->cpus_allowed)) {
-   pos_css = css_rightmost_descendant(pos_css);
-   continue;
-   }
+   cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+   struct cpuset *parent = parent_cs(cp);
+
+   cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+   /* Skip the whole subtree if the cpumask remains the same. */
+   if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+   pos_css = css_rightmost_descendant(pos_css);
+   continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
 
+   mutex_lock(&callback_mutex);
+   cpumask_copy(cp->effective_cpus, new_cpus);
+   mutex_unlock(&callback_mutex);
+
+   WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+   !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
update_tasks_cpumask(cp);
 
rcu_read_lock();
@@ -940,7 +949,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset 
*trialcs,
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
 
-   update_tasks_cpumask_hier(cs, true);
+   /* use trialcs->cpus_allowed as a temp variable */
+   update_cpumasks_hier(cs, trialcs->cpus_allowed);
 
if (is_load_balanced)
rebuild_sched_domains_locked();
@@ -1091,36 +1101,45 @@ static void update_tasks_nodemask(struct cpuset *cs)
 }
 
 /*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cp

[PATCH v3 06/12] cpuset: initialize top_cpuset's configured masks at mount

2014-07-09 Thread Li Zefan
We now have to support different behaviors for the default hierarchy and
the legacy hierarchy, so top_cpuset's configured masks need to be
initialized accordingly.

Suppose we've offlined cpu1.

On default hierarchy:

# mount -t cgroup -o __DEVEL__sane_behavior xxx /cpuset
# cat /cpuset/cpuset.cpus
0-15

On legacy hierarchy:

# mount -t cgroup xxx /cpuset
# cat /cpuset/cpuset.cpus
0,2-15

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 37 -
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 60577cc..e4c31e6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2015,16 +2015,35 @@ static void cpuset_css_free(struct cgroup_subsys_state 
*css)
kfree(cs);
 }
 
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+   mutex_lock(&cpuset_mutex);
+   mutex_lock(&callback_mutex);
+
+   if (cgroup_on_dfl(root_css->cgroup)) {
+   cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+   top_cpuset.mems_allowed = node_possible_map;
+   } else {
+   cpumask_copy(top_cpuset.cpus_allowed,
+top_cpuset.effective_cpus);
+   top_cpuset.mems_allowed = top_cpuset.effective_mems;
+   }
+
+   mutex_unlock(&callback_mutex);
+   mutex_unlock(&cpuset_mutex);
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
-   .css_alloc = cpuset_css_alloc,
-   .css_online = cpuset_css_online,
-   .css_offline = cpuset_css_offline,
-   .css_free = cpuset_css_free,
-   .can_attach = cpuset_can_attach,
-   .cancel_attach = cpuset_cancel_attach,
-   .attach = cpuset_attach,
-   .base_cftypes = files,
-   .early_init = 1,
+   .css_alloc  = cpuset_css_alloc,
+   .css_online = cpuset_css_online,
+   .css_offline= cpuset_css_offline,
+   .css_free   = cpuset_css_free,
+   .can_attach = cpuset_can_attach,
+   .cancel_attach  = cpuset_cancel_attach,
+   .attach = cpuset_attach,
+   .bind   = cpuset_bind,
+   .base_cftypes   = files,
+   .early_init = 1,
 };
 
 /**
-- 
1.8.0.2



[PATCH v3 04/12] cpuset: inherit ancestor's masks if effective_{cpus,mems} becomes empty

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually the configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by the parent cpuset, while
the effective masks reflect cpu/memory hotplug and hierarchical restriction;
these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

To make cs->effective_{cpus,mems} the effective masks, we need to
  - update the effective masks at hotplug
  - update the effective masks at config change
  - take on ancestor's mask when the effective mask is empty

The last item is done here.

This won't introduce behavior change.

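Since update_{cpu,node}masks_hier() walks the hierarchy top-down, a
parent's effective mask has already been recomputed (and is guaranteed
non-empty) by the time its children are visited, so the fallback reduces
to the following sketch (mirroring the diff below, assuming the kernel
cpumask API):

        cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
        if (cpumask_empty(new_cpus))    /* inherit parent's effective mask */
                cpumask_copy(new_cpus, parent->effective_cpus);
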
Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index da766c3..f834002 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -877,6 +877,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
 
cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
 
+   /*
+* If it becomes empty, inherit the effective mask of the
+* parent, which is guaranteed to have some CPUs.
+*/
+   if (cpumask_empty(new_cpus))
+   cpumask_copy(new_cpus, parent->effective_cpus);
+
/* Skip the whole subtree if the cpumask remains the same. */
if (cpumask_equal(new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
@@ -1123,6 +1130,13 @@ static void update_nodemasks_hier(struct cpuset *cs, 
nodemask_t *new_mems)
 
nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
 
+   /*
+* If it becomes empty, inherit the effective mask of the
+* parent, which is guaranteed to have some MEMs.
+*/
+   if (nodes_empty(*new_mems))
+   *new_mems = parent->effective_mems;
+
/* Skip the whole subtree if the nodemask remains the same. */
if (nodes_equal(*new_mems, cp->effective_mems)) {
pos_css = css_rightmost_descendant(pos_css);
@@ -2102,7 +2116,11 @@ retry:
 
mutex_lock(&callback_mutex);
cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+
+   /* Inherit the effective mask of the parent, if it becomes empty. */
cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
+   if (on_dfl && cpumask_empty(cs->effective_cpus))
+   cpumask_copy(cs->effective_cpus, parent_cs(cs)->effective_cpus);
mutex_unlock(&callback_mutex);
 
/*
@@ -2117,7 +2135,11 @@ retry:
 
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+
+   /* Inherit the effective mask of the parent, if it becomes empty */
nodes_andnot(cs->effective_mems, cs->effective_mems, off_mems);
+   if (on_dfl && nodes_empty(cs->effective_mems))
+   cs->effective_mems = parent_cs(cs)->effective_mems;
mutex_unlock(&callback_mutex);
 
/*
-- 
1.8.0.2



[PATCH v3 05/12] cpuset: use effective cpumask to build sched domains

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually the configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by the parent cpuset, while
the effective masks reflect cpu/memory hotplug and hierarchical restriction;
these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

We should partition sched domains according to effective_cpus, which
is the real cpulist that takes effect on the tasks in the cpuset.

This won't introduce behavior change.

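As a side effect, rebuild_sched_domains_locked() moves out of
update_cpumask() and into the hierarchy walk, so the (expensive) rebuild
runs at most once per update. In sketch form (illustrative, mirroring
the diff below):

        bool need_rebuild_sched_domains = false;

        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                /* ... recompute cp->effective_cpus and update its tasks ... */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    is_sched_load_balance(cp))
                        need_rebuild_sched_domains = true;
        }
        if (need_rebuild_sched_domains)
                rebuild_sched_domains_locked();
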
v2:
- Add a comment for the call of rebuild_sched_domains(), suggested
by Tejun.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f834002..60577cc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -494,11 +494,11 @@ out:
 #ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
  */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
-   return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+   return cpumask_intersects(a->effective_cpus, b->effective_cpus);
 }
 
 static void
@@ -615,7 +615,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
-   cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+   cpumask_copy(doms[0], top_cpuset.effective_cpus);
 
goto done;
}
@@ -719,7 +719,7 @@ restart:
struct cpuset *b = csa[j];
 
if (apn == b->pn) {
-   cpumask_or(dp, dp, b->cpus_allowed);
+   cpumask_or(dp, dp, b->effective_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, 
b);
 
@@ -771,7 +771,7 @@ static void rebuild_sched_domains_locked(void)
 * passing doms with offlined cpu to partition_sched_domains().
 * Anyways, hotplug work item will rebuild sched domains.
 */
-   if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+   if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
 
/* Generate domain masks and attrs */
@@ -870,6 +870,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
 {
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+   bool need_rebuild_sched_domains = false;
 
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
@@ -903,10 +904,21 @@ static void update_cpumasks_hier(struct cpuset *cs, 
struct cpumask *new_cpus)
 
update_tasks_cpumask(cp);
 
+   /*
+* If the effective cpumask of any non-empty cpuset is changed,
+* we need to rebuild sched domains.
+*/
+   if (!cpumask_empty(cp->cpus_allowed) &&
+   is_sched_load_balance(cp))
+   need_rebuild_sched_domains = true;
+
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+   if (need_rebuild_sched_domains)
+   rebuild_sched_domains_locked();
 }
 
 /**
@@ -919,7 +931,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset 
*trialcs,
  const char *buf)
 {
int retval;
-   int is_load_balanced;
 
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -950,17 +961,12 @@ static int update_cpumask(struct cpuset *cs, struct 
cpuset *trialcs,
if (retval < 0)
return retval;
 
-   is_load_balanced = is_sched_load_balance(trialcs);
-
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
 
/* use trialcs->cpus_allowed as a temp variable */
update_cpumasks_hier(cs, trialcs->cpus_allowed);
-
-   if (is_load_balanced)
-   rebuild_sched_domains_locked();
return 0;
 }
 
-- 
1.8.0.2


[PATCH v3 01/12] cpuset: add cs->effective_cpus and cs->effective_mems

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually the configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by the parent cpuset, while
the effective masks reflect cpu/memory hotplug and hierarchical restriction;
these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset are the same as its configured
masks, and a child cpuset inherits its parent's effective masks.

This won't introduce behavior change.

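With a second cpumask per cpuset to allocate, the allocation paths
switch to the usual goto-unwind style, freeing in reverse order only
what has already been allocated. Roughly (a sketch mirroring the diff
below, with a made-up function name):

        /* illustrative, not the kernel function */
        static struct cpuset *example_alloc(void)
        {
                struct cpuset *cs = kzalloc(sizeof(*cs), GFP_KERNEL);

                if (!cs)
                        return NULL;
                if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
                        goto free_cs;
                if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
                        goto free_cpus;
                return cs;

        free_cpus:
                free_cpumask_var(cs->cpus_allowed);
        free_cs:
                kfree(cs);
                return NULL;
        }
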
v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 59 ++---
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f9d4807..ef0974c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,14 @@ struct cpuset {
struct cgroup_subsys_state css;
 
unsigned long flags;/* "unsigned long" so bitops work */
-   cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
-   nodemask_t mems_allowed;/* Memory Nodes allowed to tasks */
+
+   /* user-configured CPUs and Memory Nodes allowed to tasks */
+   cpumask_var_t cpus_allowed;
+   nodemask_t mems_allowed;
+
+   /* effective CPUs and Memory Nodes allowed to tasks */
+   cpumask_var_t effective_cpus;
+   nodemask_t effective_mems;
 
/*
 * This is old Memory Nodes tasks took on.
@@ -376,13 +382,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset 
*cs)
if (!trial)
return NULL;
 
-   if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
-   kfree(trial);
-   return NULL;
-   }
-   cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+   if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+   goto free_cs;
+   if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+   goto free_cpus;
 
+   cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+   cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+free_cpus:
+   free_cpumask_var(trial->cpus_allowed);
+free_cs:
+   kfree(trial);
+   return NULL;
 }
 
 /**
@@ -391,6 +404,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
  */
 static void free_trial_cpuset(struct cpuset *trial)
 {
+   free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
 }
@@ -1848,18 +1862,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
-   if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
-   kfree(cs);
-   return ERR_PTR(-ENOMEM);
-   }
+   if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+   goto free_cs;
+   if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+   goto free_cpus;
 
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+   cpumask_clear(cs->effective_cpus);
+   nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
 
return &cs->css;
+
+free_cpus:
+   free_cpumask_var(cs->cpus_allowed);
+free_cs:
+   kfree(cs);
+   return ERR_PTR(-ENOMEM);
 }
 
 static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1882,6 +1904,13 @@ static int cpuset_css_online(struct cgroup_subsys_state 
*css)
 
cpuset_inc();
 
+   mutex_lock(&callback_mutex);
+   if (cgroup_on_dfl(cs->css.cgroup)) {
+   cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+   cs->effective_mems = parent->effective_mems;
+   }
+   mutex_unlock(&callback_mutex);
+
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
 
@@ -1941,6 +1970,7 @@ static void cpuset_css_free(struct cgroup_subsys_state 
*css)
 {
struct cpuset *cs = css_cs(css);
 
+   free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
 }
@@ -1969,9 +1999,13 @@ int __init cpuset_init(void)
 
  

[PATCH v3 02/12] cpuset: update cpuset->effective_{cpus,mems} at hotplug

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually the configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by the parent cpuset, while
the effective masks reflect cpu/memory hotplug and hierarchical restriction;
these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

To make cs->effective_{cpus,mems} the effective masks, we need to
  - update the effective masks at hotplug
  - update the effective masks at config change
  - take on ancestor's mask when the effective mask is empty

The first item is done here.

This won't introduce behavior change.

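In sketch form, the hotplug update is just a mask subtraction on the
effective state (illustrative; off_cpus/off_mems are the sets that just
went offline, as in the diff below):

        cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
        nodes_andnot(cs->effective_mems, cs->effective_mems, off_mems);
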
Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ef0974c..94f651d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2082,6 +2082,7 @@ retry:
 
mutex_lock(&callback_mutex);
cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
mutex_unlock(&callback_mutex);
 
/*
@@ -2096,6 +2097,7 @@ retry:
 
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+   nodes_andnot(cs->effective_mems, cs->effective_mems, off_mems);
mutex_unlock(&callback_mutex);
 
/*
@@ -2159,6 +2161,7 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
if (cpus_updated) {
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+   cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
@@ -2167,6 +2170,7 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
if (mems_updated) {
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = new_mems;
+   top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
}
-- 
1.8.0.2



[PATCH v3 00/12] cpuset: separate configured masks and effective masks

2014-07-09 Thread Li Zefan
This patchset introduces behavior changes, but only for the default hierarchy.

- We introduce new interfaces cpuset.effective_cpus and cpuset.effective_mems,
  while cpuset.cpus and cpuset.mems will be configured masks.

- The configured masks can be changed by writing cpuset.cpus/mems only. They
  won't be changed when hotplug happens.

- Users can configure cpus and mems without restriction from the parent cpuset.
  The effective masks will enforce the hierarchical behavior.

- Users can also configure cpus and mems to include already-offlined CPUs/nodes.

- When a CPU/node is onlined, it will be brought back to the effective masks
  if it's in the configured masks.

- We build sched domains based on effective cpumask but not configured cpumask.

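For example, on the default hierarchy one would now expect something
like the following (hypothetical session in the style of the examples
elsewhere in this series; suppose cpu1 is offline, and the output values
are illustrative):

# echo 0-3 > /cpuset/A/cpuset.cpus      # configured mask, kept verbatim
# cat /cpuset/A/cpuset.cpus
0-3
# cat /cpuset/A/cpuset.effective_cpus   # what tasks in A actually use
0,2-3
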
v3:
- rebased against "cgroup: remove sane_behavior support on non-default 
hierarchies"
- addressed previous review comments
- adjusted some code, comment and changelog slightly

v2:
- fixed two bugs
- made changelogs more verbose
- added more comments
- changed cs->real_{mems,cpus}_allowed to cs->effective_{mems, cpus}
- split "cpuset: enable onlined cpu/node in effective masks" into 2 patches
- exported cpuset.effective_{cpus,mems} unconditionally


Li Zefan (12):
  cpuset: add cs->effective_cpus and cs->effective_mems
  cpuset: update cpuset->effective_{cpus,mems} at hotplug
  cpuset: update cs->effective_{cpus,mems} when config changes
  cpuset: inherit ancestor's masks if effective_{cpus,mems} becomes
empty
  cpuset: use effective cpumask to build sched domains
  cpuset: initialize top_cpuset's configured masks at mount
  cpuset: apply cs->effective_{cpus,mems}
  cpuset: make cs->{cpus,mems}_allowed as user-configured masks
  cpuset: refactor cpuset_hotplug_update_tasks()
  cpuset: enable onlined cpu/node in effective masks
  cpuset: allow writing offlined masks to cpuset.cpus/mems
  cpuset: export effective masks to userspace

 kernel/cpuset.c | 493 ++--
 1 file changed, 304 insertions(+), 189 deletions(-)

-- 
1.8.0.2




Re: [PATCHSET cgroup/for-3.17] cgroup: remove sane_behavior support on non-default hierarchies

2014-07-09 Thread Li Zefan
On 2014/7/3 7:45, Tejun Heo wrote:
> Hello,
> 
> sane_behavior has been used as a development vehicle for the default
> unified hierarchy.  Now that the default hierarchy is in place, the
> flag became redundant and confusing as its usage is allowed on all
> hierarchies.  There are gonna be either the default hierarchy or
> legacy ones.  Let's make that clear by removing sane_behavior support
> on non-default hierarchies.
> 
> This patchset contains the following four patches.
> 
>  0001-cgroup-remove-CGRP_ROOT_OPTION_MASK.patch
>  0002-cgroup-make-interface-file-cgroup.sane_behavior-lega.patch
>  0003-cgroup-remove-sane_behavior-support-on-non-default-h.patch
>  0004-cgroup-clean-up-sane_behavior-handling.patch
> 
> 0001 is a trivial cleanup.
> 
> 0002 removes "cgroup.sane_behavior" from the default hierarchy.
> 
> 0003 removes sane_behavior support on non-default hierarchies.
> 
> 0004 cleans up sane_behavior handling.
> 
> This patchset is on top of a497c3ba1d97 ("Linux 3.16-rc2") and
> available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-dfl-instead-of-sane
> 
> diffstat follows.  Thanks.
> 
>  block/blk-throttle.c   |6 +-
>  include/linux/cgroup.h |  128 
> -
>  kernel/cgroup.c|   96 +++-
>  kernel/cpuset.c        |   33 +---
>  mm/memcontrol.c|7 +-
>  5 files changed, 117 insertions(+), 153 deletions(-)
> 

Acked-by: Li Zefan 

I'm rebasing my cpuset patchset against this.



[PATCH v3 07/12] cpuset: apply cs-effective_{cpus,mems}

2014-07-09 Thread Li Zefan
Now we can use cs-effective_{cpus,mems} as effective masks. It's
used whenever:

- we update tasks' cpus_allowed/mems_allowed,
- we want to retrieve tasks_cs(tsk)'s cpus_allowed/mems_allowed.

They actually replace effective_{cpu,node}mask_cpuset().

effective_mask == configured_mask  parent effective_mask except when
the reault is empty, in which case it inherits parent effective_mask.
The result equals the mask computed from effective_{cpu,node}mask_cpuset().

This won't affect the original legacy hierarchy, because in this case we
make sure the effective masks are always the same with user-configured
masks.

Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cpuset.c | 83 ++---
 1 file changed, 14 insertions(+), 69 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e4c31e6..820870a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -313,9 +313,9 @@ static struct file_system_type cpuset_fs_type = {
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-   while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+   while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
-   cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+   cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
 /*
@@ -331,9 +331,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
-   while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+   while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
-   nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+   nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 }
 
 /*
@@ -795,45 +795,6 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
 }
 
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- *   if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
-   while (cpumask_empty(cs->cpus_allowed))
-   cs = parent_cs(cs);
-   return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty mems. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- *   if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
-   while (nodes_empty(cs->mems_allowed))
-   cs = parent_cs(cs);
-   return cs;
-}
-
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -844,13 +805,12 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
  */
 static void update_tasks_cpumask(struct cpuset *cs)
 {
-   struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
 
css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next(&it)))
-   set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+   set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
 }
 
@@ -988,15 +948,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
 {
struct task_struct *tsk = current;
-   struct cpuset *mems_cs;
 
tsk->mems_allowed = *to;
 
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
rcu_read_lock();
-   mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-   guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+   guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
 }
 
@@ -1065,13 +1023,12 @@ static void *cpuset_being_rebound;
 static void update_tasks_nodemask(struct cpuset *cs)
 {
static nodemask_t newmems;  /* protected by cpuset_mutex */
-   struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
 
cpuset_being_rebound = cs

[PATCH v3 12/12] cpuset: export effective masks to userspace

2014-07-09 Thread Li Zefan
cpuset.cpus and cpuset.mems are the configured masks, and we need
to export effective masks to userspace, so users know the real
cpus_allowed and mems_allowed that apply to the tasks in a cpuset.
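
For example, on a legacy-hierarchy mount (hypothetical masks; assumes one
cpu has been offlined, as in the earlier patches in this series):

  # cat /cpuset/sub/cpuset.cpus
  0-3
  # cat /cpuset/sub/cpuset.effective_cpus
  0-2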

v2:
- export those masks unconditionally, suggested by Tejun.

Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cpuset.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 65878a7..53a9bbf 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1535,6 +1535,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+   FILE_EFFECTIVE_CPULIST,
+   FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1701,6 +1703,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+   case FILE_EFFECTIVE_CPULIST:
+   s += cpulist_scnprintf(s, count, cs->effective_cpus);
+   break;
+   case FILE_EFFECTIVE_MEMLIST:
+   s += nodelist_scnprintf(s, count, cs->effective_mems);
+   break;
default:
ret = -EINVAL;
goto out_unlock;
@@ -1786,6 +1794,18 @@ static struct cftype files[] = {
},
 
{
.name = "effective_cpus",
+   .seq_show = cpuset_common_seq_show,
+   .private = FILE_EFFECTIVE_CPULIST,
+   },
+
+   {
.name = "effective_mems",
+   .seq_show = cpuset_common_seq_show,
+   .private = FILE_EFFECTIVE_MEMLIST,
+   },
+
+   {
.name = cpu_exclusive,
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
-- 
1.8.0.2



[PATCH v3 11/12] cpuset: allow writing offlined masks to cpuset.cpus/mems

2014-07-09 Thread Li Zefan
As the configured masks won't be limited by their parent, and the top
cpuset's masks won't change when hotplug happens, it's natural to
allow writing offlined masks to the configured masks.

If on default hierarchy:

# echo 0 > /sys/devices/system/cpu/cpu1/online
# mkdir /cpuset/sub
# echo 1 > /cpuset/sub/cpuset.cpus
# cat /cpuset/sub/cpuset.cpus
1

If on legacy hierarchy:

# echo 0 > /sys/devices/system/cpu/cpu1/online
# mkdir /cpuset/sub
# echo 1 > /cpuset/sub/cpuset.cpus
-bash: echo: write error: Invalid argument

Note the checks don't need to be gated by cgroup_on_dfl, because we've
initialized top_cpuset.{cpus,mems}_allowed accordingly in cpuset_bind().
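
For reference, a rough sketch of what cpuset_bind() does there, from the
companion patch in this series (abbreviated, locking omitted, details may
differ slightly):

	if (cgroup_on_dfl(root_css->cgroup)) {
		/* default hierarchy: configured masks span everything possible */
		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
		top_cpuset.mems_allowed = node_possible_map;
	} else {
		/* legacy hierarchy: configured masks track the effective ones */
		cpumask_copy(top_cpuset.cpus_allowed, top_cpuset.effective_cpus);
		top_cpuset.mems_allowed = top_cpuset.effective_mems;
	}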

Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cpuset.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c47cb94..65878a7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -929,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
 
-   if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+   if (!cpumask_subset(trialcs->cpus_allowed,
+   top_cpuset.cpus_allowed))
return -EINVAL;
}
 
@@ -1186,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
 
if (!nodes_subset(trialcs->mems_allowed,
-   node_states[N_MEMORY])) {
-   retval =  -EINVAL;
+ top_cpuset.mems_allowed)) {
+   retval = -EINVAL;
goto done;
}
}
-- 
1.8.0.2



[PATCH v3 10/12] cpuset: enable onlined cpu/node in effective masks

2014-07-09 Thread Li Zefan
Firstly offline cpu1:

  # echo 0-1 > cpuset.cpus
  # echo 0 > /sys/devices/system/cpu/cpu1/online
  # cat cpuset.cpus
  0-1
  # cat cpuset.effective_cpus
  0

Then online it:

  # echo 1 > /sys/devices/system/cpu/cpu1/online
  # cat cpuset.cpus
  0-1
  # cat cpuset.effective_cpus
  0-1

And cpuset will bring it back to the effective mask.

The implementation is quite straightforward. Instead of calculating the
offlined cpus/mems and doing updates, we just set the new effective_mask
to online_mask & configured_mask.
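
In code, the recomputation amounts to this pair of calls (a sketch that
mirrors the new lines in the hunk below):

	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);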

This is a behavior change for default hierarchy, so legacy hierarchy
won't be affected.

v2:
- make refactoring of cpuset_hotplug_update_tasks() as separate patch,
  suggested by Tejun.
- make hotplug_update_tasks_insane() use @new_cpus and @new_mems as
  hotplug_update_tasks_sane() does.

Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cpuset.c | 65 -
 1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 41822e2..c47cb94 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2080,26 +2080,27 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
 }
 
-static void hotplug_update_tasks_legacy(struct cpuset *cs,
-   struct cpumask *off_cpus,
-   nodemask_t *off_mems)
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+   struct cpumask *new_cpus, nodemask_t *new_mems,
+   bool cpus_updated, bool mems_updated)
 {
bool is_empty;
 
mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, off_cpus);
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
-   nodes_andnot(cs->mems_allowed, cs->mems_allowed, *off_mems);
-   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   cpumask_copy(cs->cpus_allowed, new_cpus);
+   cpumask_copy(cs->effective_cpus, new_cpus);
+   cs->mems_allowed = *new_mems;
+   cs->effective_mems = *new_mems;
mutex_unlock(&callback_mutex);
 
/*
 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
 * as the tasks will be migrated to an ancestor.
 */
-   if (!cpumask_empty(off_cpus) && !cpumask_empty(cs->cpus_allowed))
+   if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
-   if (!nodes_empty(*off_mems) && !nodes_empty(cs->mems_allowed))
+   if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
 
is_empty = cpumask_empty(cs->cpus_allowed) ||
@@ -2118,24 +2119,24 @@ static void hotplug_update_tasks_legacy(struct cpuset *cs,
mutex_lock(&cpuset_mutex);
 }
 
-static void hotplug_update_tasks(struct cpuset *cs,
-struct cpumask *off_cpus,
-nodemask_t *off_mems)
+static void
+hotplug_update_tasks(struct cpuset *cs,
+struct cpumask *new_cpus, nodemask_t *new_mems,
+bool cpus_updated, bool mems_updated)
 {
+   if (cpumask_empty(new_cpus))
+   cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+   if (nodes_empty(*new_mems))
+   *new_mems = parent_cs(cs)->effective_mems;
+
mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
-   if (cpumask_empty(cs->effective_cpus))
-   cpumask_copy(cs->effective_cpus,
-parent_cs(cs)->effective_cpus);
-
-   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
-   if (nodes_empty(cs->effective_mems))
-   cs->effective_mems = parent_cs(cs)->effective_mems;
+   cpumask_copy(cs->effective_cpus, new_cpus);
+   cs->effective_mems = *new_mems;
mutex_unlock(&callback_mutex);
 
-   if (!cpumask_empty(off_cpus))
+   if (cpus_updated)
update_tasks_cpumask(cs);
-   if (!nodes_empty(*off_mems))
+   if (mems_updated)
update_tasks_nodemask(cs);
 }
 
@@ -2149,8 +2150,10 @@ static void hotplug_update_tasks(struct cpuset *cs,
  */
 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
-   static cpumask_t off_cpus;
-   static nodemask_t off_mems;
+   static cpumask_t new_cpus;
+   static nodemask_t new_mems;
+   bool cpus_updated;
+   bool mems_updated;
 retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2165,14 +2168,18 @@ retry:
goto retry;
}
 
-   cpumask_andnot(&off_cpus, cs->effective_cpus,
-  top_cpuset.effective_cpus);
-   nodes_andnot(off_mems, cs->effective_mems, top_cpuset.effective_mems);
+   cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+   nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems

[PATCH v3 09/12] cpuset: refactor cpuset_hotplug_update_tasks()

2014-07-09 Thread Li Zefan
We mix the handling for both default hierarchy and legacy hierarchy in
the same function, and it's quite messy, so split into two functions.

Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cpuset.c | 121 ++--
 1 file changed, 66 insertions(+), 55 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4b409d2..41822e2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2080,6 +2080,65 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
 }
 
+static void hotplug_update_tasks_legacy(struct cpuset *cs,
+   struct cpumask *off_cpus,
+   nodemask_t *off_mems)
+{
+   bool is_empty;
+
+   mutex_lock(&callback_mutex);
+   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, off_cpus);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
+   nodes_andnot(cs->mems_allowed, cs->mems_allowed, *off_mems);
+   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   mutex_unlock(&callback_mutex);
+
+   /*
+* Don't call update_tasks_cpumask() if the cpuset becomes empty,
+* as the tasks will be migrated to an ancestor.
+*/
+   if (!cpumask_empty(off_cpus) && !cpumask_empty(cs->cpus_allowed))
+   update_tasks_cpumask(cs);
+   if (!nodes_empty(*off_mems) && !nodes_empty(cs->mems_allowed))
+   update_tasks_nodemask(cs);
+
+   is_empty = cpumask_empty(cs->cpus_allowed) ||
+  nodes_empty(cs->mems_allowed);
+
+   mutex_unlock(&cpuset_mutex);
+
+   /*
+* Move tasks to the nearest ancestor with execution resources.
+* This is a full cgroup operation which will also call back into
+* cpuset. Should be done outside any lock.
+*/
+   if (is_empty)
+   remove_tasks_in_empty_cpuset(cs);
+
+   mutex_lock(&cpuset_mutex);
+}
+
+static void hotplug_update_tasks(struct cpuset *cs,
+struct cpumask *off_cpus,
+nodemask_t *off_mems)
+{
+   mutex_lock(&callback_mutex);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
+   if (cpumask_empty(cs->effective_cpus))
+   cpumask_copy(cs->effective_cpus,
+parent_cs(cs)->effective_cpus);
+
+   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   if (nodes_empty(cs->effective_mems))
+   cs->effective_mems = parent_cs(cs)->effective_mems;
+   mutex_unlock(&callback_mutex);
+
+   if (!cpumask_empty(off_cpus))
+   update_tasks_cpumask(cs);
+   if (!nodes_empty(*off_mems))
+   update_tasks_nodemask(cs);
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -2092,9 +2151,6 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
static cpumask_t off_cpus;
static nodemask_t off_mems;
-   bool is_empty;
-   bool on_dfl = cgroup_on_dfl(cs->css.cgroup);
-
 retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2109,61 +2165,16 @@ retry:
goto retry;
}
 
-   cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
-   nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
-   mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-
-   /* Inherit the effective mask of the parent, if it becomes empty. */
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
-   if (on_dfl && cpumask_empty(cs->effective_cpus))
-   cpumask_copy(cs->effective_cpus, parent_cs(cs)->effective_cpus);
-   mutex_unlock(&callback_mutex);
-
-   /*
-* If on_dfl, we need to update tasks' cpumask for empty cpuset to
-* take on ancestor's cpumask. Otherwise, don't call
-* update_tasks_cpumask() if the cpuset becomes empty, as the tasks
-* in it will be migrated to an ancestor.
-*/
-   if ((on_dfl && cpumask_empty(cs->cpus_allowed)) ||
-   (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
-   update_tasks_cpumask(cs);
-
-   mutex_lock(&callback_mutex);
-   nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+   cpumask_andnot(&off_cpus, cs->effective_cpus,
+  top_cpuset.effective_cpus);
+   nodes_andnot(off_mems, cs->effective_mems, top_cpuset.effective_mems);
 
-   /* Inherit the effective mask of the parent, if it becomes empty */
-   nodes_andnot(cs->effective_mems, cs->effective_mems, off_mems);
-   if (on_dfl && nodes_empty(cs->effective_mems))
-   cs->effective_mems = parent_cs(cs)->effective_mems;
-   mutex_unlock(&callback_mutex);
-
-   /*
-* If on_dfl, we need to update tasks' nodemask for empty cpuset

Re: [PATCHSET cgroup/for-3.17] cgroup: remove sane_behavior support on non-default hierarchies

2014-07-09 Thread Li Zefan
On 2014/7/3 7:45, Tejun Heo wrote:
> Hello,
> 
> sane_behavior has been used as a development vehicle for the default
> unified hierarchy.  Now that the default hierarchy is in place, the
> flag became redundant and confusing as its usage is allowed on all
> hierarchies.  There are gonna be either the default hierarchy or
> legacy ones.  Let's make that clear by removing sane_behavior support
> on non-default hierarchies.
> 
> This patchset contains the following four patches.
> 
>  0001-cgroup-remove-CGRP_ROOT_OPTION_MASK.patch
>  0002-cgroup-make-interface-file-cgroup.sane_behavior-lega.patch
>  0003-cgroup-remove-sane_behavior-support-on-non-default-h.patch
>  0004-cgroup-clean-up-sane_behavior-handling.patch
> 
> 0001 is a trivial cleanup.
> 
> 0002 removes cgroup.sane_behavior from the default hierarchy.
> 
> 0003 removes sane_behavior support on non-default hierarchies.
> 
> 0004 cleans up sane_behavior handling.
> 
> This patchset is on top of a497c3ba1d97 ("Linux 3.16-rc2") and
> available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-dfl-instead-of-sane
> 
> diffstat follows.  Thanks.
> 
>  block/blk-throttle.c   |6 +-
>  include/linux/cgroup.h |  128 
> -
>  kernel/cgroup.c|   96 +++-
>  kernel/cpuset.c|   33 +---
>  mm/memcontrol.c|7 +-
>  5 files changed, 117 insertions(+), 153 deletions(-)
> 

Acked-by: Li Zefan lize...@huawei.com

I'm rebasing my cpuset patchset against this.



Re: [PATCH] sched/rt: overrun could happen in start_hrtick_dl

2014-07-07 Thread Li Zefan
On 2014/7/8 9:10, xiaofeng.yan wrote:
> On 2014/7/7 16:41, Peter Zijlstra wrote:
>> On Fri, Jul 04, 2014 at 12:02:21PM +, xiaofeng.yan wrote:
>>> It could be wrong for the precision of runtime and deadline
>>> when the precision is within microsecond level. For example:
>>> Task runtime deadline period
>>>   P1   200us   500us   500us
>>>
>>> This case needs the HRTICK feature enabled, via the following commands:
>>> PC#echo "HRTICK" > /sys/kernel/debug/sched_features
>>> PC#./schedtool -E -t 20:50 -e ./test&
>>> PC#trace-cmd record -e sched_switch
>> Are you actually using HRTICK ?
> yes, if HRTICK is disabled, then all of the runtime and deadline values will be wrong.

I think what peter meant is, do you use HRTICK in products or
just use it for testing/experiment?



Re: [PATCH] MAINTAINERS:ARM:hisi: add Hisilicon SoC family

2014-07-04 Thread Li Zefan
On 2014/7/4 15:11, xuwei wrote:
> 
> Introduce a new mach-hisi that will support Hisilicon SoCs based on ARMv7
> and I am taking maintainership for it.
> 
> Signed-off-by: Wei Xu xuw...@hisilicon.com
> ---
>  MAINTAINERS | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 134483f..c11c89b 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -972,6 +972,14 @@ F:   arch/arm/mach-pxa/hx4700.c
>  F:   arch/arm/mach-pxa/include/mach/hx4700.h
>  F:   sound/soc/pxa/hx4700.c
>  
> +ARM/Hisilicon SoC support
> +M:   Wei Xu xuw...@hisilicon.com
> +L:   linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
> +W:   www.hisilicon.com
> +S:   Maintained

S:  Supported ?

Supported:   Someone is actually paid to look after this.
Maintained:  Someone actually looks after it.

> +T:   git git://github.com/hisilicon/linux-hisi.git
> +F:   arch/arm/mach-hisi/
> +
>  ARM/HP JORNADA 7XX MACHINE SUPPORT
>  M:   Kristoffer Ericson kristoffer.eric...@gmail.com
>  W:   www.jlime.com
> 



Re: [PATCHSET cgroup/for-3.17] cgroup, blkcg, memcg: make blkcg depend on memcg on unified hierarchy

2014-07-04 Thread Li Zefan
Hi Tejun,

On 2014/6/28 9:03, Tejun Heo wrote:
> Hello, guys.
> 
> Currently, the blkio subsystem attributes all of writeback IOs to the
> root.  One of the issues is that there's no way to tell who originated
> a writeback IO from block layer.  Those IOs are usually issued
> asynchronously from a task which didn't have anything to do with
> actually generating the dirty pages.  The memory subsystem, when
> enabled, already keeps track of the ownership of each dirty page and
> it's desirable for blkio to piggyback instead of adding its own
> per-page tag.

It's great to see this being worked on!

> 
> This can be achieved on the unified hierarchy without too much
> difficulty.  This patchset implements a dependency mechanism in the
> cgroup such that a subsystem can depend on other subsystems.  If
> available, the depended-upon subsystems are enabled together
> implicitly when the subsystem is turned on.  Implicitly enabled
> subsystems are invisible and the dependencies are transparent to
> userland.
> 
> This patchset implements the dependency mechanism in cgroup core and
> make blkcg depend on memcg.  This doesn't actually solve the writeback
> problem yet but is an important step.
> 
> This patchset contains the following six patches.
> 
>  0001-cgroup-reorganize-cgroup_subtree_control_write.patch
>  0002-cgroup-introduce-cgroup-subtree_control.patch
>  0003-cgroup-make-interface-files-visible-iff-enabled-on-c.patch
>  0004-cgroup-implement-cgroup_subsys-css_reset.patch
>  0005-cgroup-implement-cgroup_subsys-depends_on.patch
>  0006-blkcg-memcg-make-blkcg-depend-on-memcg-on-the-defaul.patch
> 
> 0001-0005 gradually implement the dependency mechanism.
> 
> 0006 makes blkcg depend on memcg.
> 
> This patchset is on top of a497c3ba1d97 ("Linux 3.16-rc2") and
> available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-cgroup-dependency
> 
> diffstat follows.  Thanks.
> 
>  Documentation/cgroups/cgroups.txt   |   14 +
>  Documentation/cgroups/unified-hierarchy.txt |   23 ++-
>  block/blk-cgroup.c  |7
>  include/linux/cgroup.h  |   20 ++
>  kernel/cgroup.c |  201 
> ++--
>  mm/memcontrol.c |   24 +++
>  6 files changed, 243 insertions(+), 46 deletions(-)
> 

Acked-by: Li Zefan lize...@huawei.com
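
The dependency declaration itself ends up being a single field in the
subsystem definition; roughly (a sketch, not the full initializer from
patch 0006):

	struct cgroup_subsys blkio_cgrp_subsys = {
		...
		/* implicitly enable memcg together with blkcg on the
		 * default hierarchy */
		.depends_on = 1 << memory_cgrp_id,
	};

Because implicitly enabled controllers stay invisible to userland, the
dependency adds no new interface files.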



Re: [PATCH cgroup/for-3.16-fixes] cpuset: break kernfs active protection in cpuset_write_resmask()

2014-06-30 Thread Li Zefan
On 2014/7/1 3:47, Tejun Heo wrote:
> Hey, Li.
> 
> Can you please test this patch and ack it?
> 
...
> Signed-off-by: Tejun Heo t...@kernel.org
> Reported-by: Li Zefan lize...@huawei.com

Tested-by: Li Zefan lize...@huawei.com

Thanks!

> ---
>  kernel/cpuset.c |   12 
>  1 file changed, 12 insertions(+)



[PATCH v3 3/3] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-29 Thread Li Zefan
We've converted cgroup to kernfs so cgroup won't be intertwined with
vfs objects and locking, but there are dark areas.

Run two instances of this script concurrently:

for ((; ;))
{
mount -t cgroup -o cpuacct xxx /cgroup
umount /cgroup
}

After a while, I saw two mount processes were stuck at retrying, because
they were waiting for a subsystem to become free, but the root associated
with this subsystem never got freed.

This can happen, if thread A is in the process of killing superblock but
hasn't called percpu_ref_kill(), and at this time thread B is mounting
the same cgroup root and finds the root in the root list and performs
percpu_ref_tryget_live().
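
Roughly, the problematic interleaving looks like this (illustrative):

  Thread A (umount)                     Thread B (mount)
  -----------------                     ----------------
  cgroup_kill_sb() starts
  (percpu_ref_kill() not yet called)
                                        finds the root on the root list
                                        percpu_ref_tryget_live() succeeds,
                                        pinning a root whose superblock
                                        is already on its way out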

To fix this, we try to increase both the refcnt of the superblock and the
percpu refcnt of cgroup root.

v2:
- we should try to get both the superblock refcnt and cgroup_root refcnt,
  because cgroup_root may have no superblock associated with it.
- adjust/add comments.

Cc: sta...@vger.kernel.org # 3.15
Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cgroup.c | 28 ++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3662ac..11e40cf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1655,6 +1655,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int ret;
int i;
bool new_sb;
+   struct super_block *sb = NULL;
 
/*
 * The first time anyone tries to mount a cgroup, enable the list
@@ -1739,14 +1740,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
/*
 * A root's lifetime is governed by its root cgroup.
-* tryget_live failure indicate that the root is being
-* destroyed.  Wait for destruction to complete so that the
-* subsystems are free.  We can use wait_queue for the wait
-* but this path is super cold.  Let's just sleep for a bit
-* and retry.
+* pin_sb and tryget_live failure indicate that the root is
+* being destroyed.  Wait for destruction to complete so that
+* the subsystems are free.  We can use wait_queue for the
+* wait but this path is super cold.  Let's just sleep for
+* a bit and retry.
 */
-   if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+   sb = kernfs_pin_sb(root->kf_root, NULL);
+   if (IS_ERR(sb) ||
+   !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
+   if (!IS_ERR_OR_NULL(sb))
+   deactivate_super(sb);
msleep(10);
ret = restart_syscall();
goto out_free;
@@ -1790,6 +1795,17 @@ out_free:
dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+
+   if (sb) {
+   /*
+* On success kernfs_mount() returns with sb->s_umount held,
+* but kernfs_mount() also increases the superblock's refcnt,
+* so calling deactivate_super() to drop the refcnt we got when
+* looking up cgroup root won't acquire sb->s_umount again.
+*/
+   WARN_ON(new_sb);
+   deactivate_super(sb);
+   }
return dentry;
 }
 
-- 
1.8.0.2



[PATCH v3 2/3] kernfs: introduce kernfs_pin_sb()

2014-06-29 Thread Li Zefan
kernfs_pin_sb() tries to get a refcnt of the superblock.

This will be used by cgroupfs.
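
A minimal usage sketch (mirroring how cgroup_mount() uses it in the 3/3
patch; error handling abbreviated):

	struct super_block *sb;

	sb = kernfs_pin_sb(root->kf_root, NULL);
	if (!IS_ERR_OR_NULL(sb)) {
		/* the superblock can't be destroyed while we hold the ref */
		...
		deactivate_super(sb);	/* drop the pinned refcnt */
	}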

v2:
- make kernfs_pin_sb() return the superblock.
- drop kernfs_drop_sb().

[ This is a prerequisite for a bugfix. ]
Cc: sta...@vger.kernel.org # 3.15
Acked-by: Greg Kroah-Hartman gre...@linuxfoundation.org
Signed-off-by: Li Zefan lize...@huawei.com
---
 fs/kernfs/mount.c  | 27 +++
 include/linux/kernfs.h |  1 +
 2 files changed, 28 insertions(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f25a7c0..616c5c4 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -210,6 +210,33 @@ void kernfs_kill_sb(struct super_block *sb)
kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. Return NULL if there's no superblock associated with this
+ * kernfs_root, or -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+   struct kernfs_super_info *info;
+   struct super_block *sb = NULL;
+
+   mutex_lock(&kernfs_mutex);
+   list_for_each_entry(info, &root->supers, node) {
+   if (info->ns == ns) {
+   sb = info->sb;
+   if (!atomic_inc_not_zero(&info->sb->s_active))
+   sb = ERR_PTR(-EINVAL);
+   break;
+   }
+   }
+   mutex_unlock(&kernfs_mutex);
+   return sb;
+}
+
 void __init kernfs_init(void)
 {
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 589318b..9096296 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -287,6 +287,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
   struct kernfs_root *root, bool *new_sb_created,
   const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
-- 
1.8.0.2



[PATCH v3 1/3] cgroup: fix mount failure in a corner case

2014-06-29 Thread Li Zefan
  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /cgroup busy
  mount: according to mtab, xxx is already mounted on /cgroup

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v3:
- put the refcnt immediately after getting it. (Tejun)

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Cc: sta...@vger.kernel.org # 3.15
Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cgroup.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..d3662ac 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i;
bool new_sb;
 
/*
@@ -1677,6 +1679,25 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)) ||
+   ss->root == &cgrp_dfl_root)
+   continue;
+
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall();
+   goto out_free;
+   }
+   cgroup_put(&ss->root->cgrp);
+   }
+
for_each_root(root) {
bool name_match = false;
 
-- 
1.8.0.2



[BUG] cpuset: lockdep warning

2014-06-29 Thread Li Zefan
Hi Tejun,

In this lockdep warning kernfs and workqueue are involved, so I'm not sure what's happening here.

This was triggered when tasks were being moved to parent cpuset due to hotplug.
The kernel is 3.16-rc1, with no modification.

localhost:/ # mount -t cgroup -o cpuset xxx /cpuset
localhost:/ # mkdir /cpuset/tmp
localhost:/ # echo 1 > /cpuset/tmp/cpuset.cpus
localhost:/ # echo 0 > /cpuset/tmp/cpuset.mems
localhost:/ # echo $$ > /cpuset/tmp/tasks
localhost:/ # echo 0 > /sys/devices/system/cpu/cpu1/online


[ 1810.292243] ==
[ 1810.292251] [ INFO: possible circular locking dependency detected ]
[ 1810.292259] 3.16.0-rc1-0.1-default+ #7 Not tainted
[ 1810.292266] ---
[ 1810.292273] kworker/1:0/32649 is trying to acquire lock:
[ 1810.292280]  (cgroup_mutex){+.+.+.}, at: [] 
cgroup_transfer_tasks+0x37/0x150
[ 1810.292300]
[ 1810.292300] but task is already holding lock:
[ 1810.292309]  (cpuset_hotplug_work){+.+...}, at: [] 
process_one_work+0x192/0x520
[ 1810.292327]
[ 1810.292327] which lock already depends on the new lock.
[ 1810.292327]
[ 1810.292339]
[ 1810.292339] the existing dependency chain (in reverse order) is:
[ 1810.292348]
[ 1810.292348] -> #2 (cpuset_hotplug_work){+.+...}:
[ 1810.292360][] validate_chain+0x656/0x7c0
[ 1810.292371][] __lock_acquire+0x382/0x660
[ 1810.292380][] lock_acquire+0xf9/0x170
[ 1810.292389][] flush_work+0x39/0x90
[ 1810.292398][] cpuset_write_resmask+0x51/0x120
[ 1810.292409][] cgroup_file_write+0x49/0x1f0
[ 1810.292419][] kernfs_fop_write+0xfd/0x190
[ 1810.292431][] vfs_write+0xe5/0x190
[ 1810.292443][] SyS_write+0x5c/0xc0
[ 1810.292452][] system_call_fastpath+0x16/0x1b
[ 1810.292464]
[ 1810.292464] -> #1 (s_active#175){.+}:
[ 1810.292476][] validate_chain+0x656/0x7c0
[ 1810.292486][] __lock_acquire+0x382/0x660
[ 1810.292495][] lock_acquire+0xf9/0x170
[ 1810.292504][] kernfs_drain+0x13b/0x1c0
[ 1810.292513][] __kernfs_remove+0xc8/0x220
[ 1810.292523][] kernfs_remove_by_name_ns+0x50/0xb0
[ 1810.292533][] cgroup_addrm_files+0x16e/0x290
[ 1810.292543][] cgroup_clear_dir+0x6d/0xa0
[ 1810.292552][] rebind_subsystems+0x10f/0x350
[ 1810.292562][] cgroup_setup_root+0x1bf/0x290
[ 1810.292571][] cgroup_mount+0x123/0x3d0
[ 1810.292581][] mount_fs+0x4d/0x1a0
[ 1810.292591][] vfs_kern_mount+0x79/0x160
[ 1810.292602][] do_new_mount+0xd9/0x200
[ 1810.292611][] do_mount+0x1dc/0x220
[ 1810.292621][] SyS_mount+0xbc/0xe0
[ 1810.292630][] system_call_fastpath+0x16/0x1b
[ 1810.292640]
[ 1810.292640] -> #0 (cgroup_mutex){+.+.+.}:
[ 1810.292651][] check_prev_add+0x43e/0x4b0
[ 1810.292660][] validate_chain+0x656/0x7c0
[ 1810.292669][] __lock_acquire+0x382/0x660
[ 1810.292678][] lock_acquire+0xf9/0x170
[ 1810.292687][] mutex_lock_nested+0x6f/0x380
[ 1810.292697][] cgroup_transfer_tasks+0x37/0x150
[ 1810.292707][] 
hotplug_update_tasks_insane+0x110/0x1d0
[ 1810.292718][] 
cpuset_hotplug_update_tasks+0x13d/0x180
[ 1810.292729][] cpuset_hotplug_workfn+0x18c/0x630
[ 1810.292739][] process_one_work+0x254/0x520
[ 1810.292748][] worker_thread+0x13d/0x3d0
[ 1810.292758][] kthread+0xf8/0x100
[ 1810.292768][] ret_from_fork+0x7c/0xb0
[ 1810.292778]
[ 1810.292778] other info that might help us debug this:
[ 1810.292778]
[ 1810.292789] Chain exists of:
[ 1810.292789]   cgroup_mutex --> s_active#175 --> cpuset_hotplug_work
[ 1810.292789]
[ 1810.292807]  Possible unsafe locking scenario:
[ 1810.292807]
[ 1810.292816]CPU0CPU1
[ 1810.292822]
[ 1810.292827]   lock(cpuset_hotplug_work);
[ 1810.292835]lock(s_active#175);
[ 1810.292845]lock(cpuset_hotplug_work);
[ 1810.292855]   lock(cgroup_mutex);
[ 1810.292862]
[ 1810.292862]  *** DEADLOCK ***
[ 1810.292862]
[ 1810.292872] 2 locks held by kworker/1:0/32649:
[ 1810.292878]  #0:  ("events"){.+.+.+}, at: [] 
process_one_work+0x192/0x520
[ 1810.292895]  #1:  (cpuset_hotplug_work){+.+...}, at: [] 
process_one_work+0x192/0x520
[ 1810.292911]
[ 1810.292911] stack backtrace:
[ 1810.292920] CPU: 1 PID: 32649 Comm: kworker/1:0 Not tainted 
3.16.0-rc1-0.1-default+ #7
[ 1810.292929] Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285
  /BC11BTSA  , BIOS CTSAV036 04/27/2011
[ 1810.292943] Workqueue: events cpuset_hotplug_workfn
[ 1810.292951]  824b01e0 8800afdd3918 815a5f78 
8800afdd3958
[ 1810.292964]  810c263f 1d1fa490 8800afdd3978 
88061d1fa490
[ 1810.292976]   88061d1fad08 88061d1fad40 
8800afdd39f8
[ 1810.292989] Call 

Re: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-29 Thread Li Zefan
On 2014/6/28 19:58, Tejun Heo wrote:
> Hello, Li.
> 
> On Fri, Jun 27, 2014 at 05:13:12PM +0800, Li Zefan wrote:
>> +for_each_subsys(ss, i) {
>> +if (!(opts.subsys_mask & (1 << i)) ||
>> +ss->root == &cgrp_dfl_root)
>> +continue;
>> +
>> +if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
>> +mutex_unlock(&cgroup_mutex);
>> +msleep(10);
>> +ret = restart_syscall();
>> +goto out_free;
>> +}
> 
> Why not just put it immediately?  We know that it's not gonna be
> destroyed while holding cgroup_mutex.  It may look a bit weird but
> this is a pretty special case anyway and deferring put doesn't buy
> anything.
> 

Yeah, this is better. :)





Re: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-27 Thread Li Zefan
Made a mistake again.. :(


==

From: Li Zefan lize...@huawei.com
Subject: [PATCH 1/3] cgroup: fix mount failure in a corner case

  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /cgroup busy
  mount: according to mtab, xxx is already mounted on /cgroup

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Signed-off-by: Li Zefan lize...@huawei.com
---
 kernel/cgroup.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..b94449f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i, j = -1;
bool new_sb;
 
/*
@@ -1677,6 +1679,25 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)) ||
+   ss->root == &cgrp_dfl_root)
+   continue;
+
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall();
+   goto out_free;
+   }
+   j = i;
+   }
+
for_each_root(root) {
bool name_match = false;
 
@@ -1763,6 +1784,14 @@ out_free:
kfree(opts.release_agent);
kfree(opts.name);
 
+   for_each_subsys(ss, i) {
+   if (i > j)
+   break;
+   if (!(opts.subsys_mask & (1 << i)))
+   continue;
+   cgroup_put(&ss->root->cgrp);
+   }
+
if (ret)
return ERR_PTR(ret);
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-27 Thread Li Zefan
Oh sorry the cut was incomplete. Here's the complete one:



From: Li Zefan 
Date: Thu, 12 Jun 2014 09:11:00 +0800
Subject: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /cgroup busy
  mount: according to mtab, xxx is already mounted on /cgroup

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..6826227 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i, j = -1;
bool new_sb;
 
/*
@@ -1677,6 +1679,25 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)) ||
+       ss->root == &cgrp_dfl_root)
+   continue;
+
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall();
+   goto out_free;
+   }
+   j = i;
+   }
+
for_each_root(root) {
bool name_match = false;
 
@@ -1763,6 +1784,14 @@ out_free:
kfree(opts.release_agent);
kfree(opts.name);
 
+   for_each_subsys(ss, i) {
+   if (i > j)
+   break;
+   if (!(opts.subsys_mask & (1 << i)))
+   continue;
+   cgroup_put(&ss->root->cgrp);
+   }
+
if (ret)
return ERR_PTR(ret);
 
-- 
1.8.0.2


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/3] kernfs: introduce kernfs_pin_sb()

2014-06-27 Thread Li Zefan
kernfs_pin_sb() tries to get a refcnt of the superblock.

This will be used by cgroupfs.

v2:
- make kernfs_pin_sb() return pointer to the superblock.
- drop kernfs_drop_sb().
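
A minimal caller-side sketch of the intended pairing (editorial; "kf_root"
is a placeholder, and the real consumer is cgroup_mount() in patch 3/3
below):

	struct super_block *sb;

	sb = kernfs_pin_sb(kf_root, NULL);
	if (IS_ERR(sb)) {
		/* a superblock exists but is already being freed: back off */
	} else if (sb) {
		/* sb->s_active is pinned; the superblock can't be destroyed
		 * until we drop the ref with deactivate_super() */
		deactivate_super(sb);
	} else {
		/* no superblock is currently associated with kf_root */
	}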

Signed-off-by: Li Zefan 
---
 fs/kernfs/mount.c  | 27 +++
 include/linux/kernfs.h |  1 +
 2 files changed, 28 insertions(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f25a7c0..616c5c4 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -210,6 +210,33 @@ void kernfs_kill_sb(struct super_block *sb)
kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. Return NULL if there's no superblock associated with this
+ * kernfs_root, or -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+   struct kernfs_super_info *info;
+   struct super_block *sb = NULL;
+
+   mutex_lock(&kernfs_mutex);
+   list_for_each_entry(info, &root->supers, node) {
+   if (info->ns == ns) {
+   sb = info->sb;
+   if (!atomic_inc_not_zero(&info->sb->s_active))
+   sb = ERR_PTR(-EINVAL);
+   break;
+   }
+   }
+   mutex_unlock(&kernfs_mutex);
+   return sb;
+}
+
 void __init kernfs_init(void)
 {
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 589318b..9096296 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -287,6 +287,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
   struct kernfs_root *root, bool *new_sb_created,
   const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/3] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-27 Thread Li Zefan
We've converted cgroup to kernfs so cgroup won't be intertwined with
vfs objects and locking, but there are dark areas.

Run two instances of this script concurrently:

for ((; ;))
{
mount -t cgroup -o cpuacct xxx /cgroup
umount /cgroup
}

After a while, I saw two mount processes were stuck at retrying, because
they were waiting for a subsystem to become free, but the root associated
with this subsystem never got freed.

This can happen if thread A is in the process of killing the superblock
but hasn't yet called percpu_ref_kill(), and at that moment thread B is
mounting the same cgroup root: B finds the root in the root list and its
percpu_ref_tryget_live() succeeds.
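
[Editorial sketch of that window; the thread labels are illustrative:]

	/* thread A: umount */             /* thread B: mount */
	cgroup_kill_sb()
	  /* superblock teardown has begun,
	   * but percpu_ref_kill() has not
	   * been called yet... */
	                                   cgroup_mount()
	                                     /* root is still on the list */
	                                     percpu_ref_tryget_live() /* ok */
	  percpu_ref_kill()
	                                   /* B proceeds against a superblock
	                                    * that is going away; the root is
	                                    * never freed and later mounts
	                                    * retry forever */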

To fix this, we try to increase both the refcnt of the superblock and the
percpu refcnt of cgroup root.

v2:
- we should try to get both the superblock refcnt and cgroup_root refcnt,
  because cgroup_root may have no superblock associated with it.
- adjust/add comments.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 28 ++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ae2b382..111b7c3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1655,6 +1655,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int ret;
int i, j = -1;
bool new_sb;
+   struct super_block *sb = NULL;
 
/*
 * The first time anyone tries to mount a cgroup, enable the list
@@ -1737,14 +1738,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
/*
 * A root's lifetime is governed by its root cgroup.
-* tryget_live failure indicate that the root is being
-* destroyed.  Wait for destruction to complete so that the
-* subsystems are free.  We can use wait_queue for the wait
-* but this path is super cold.  Let's just sleep for a bit
-* and retry.
+* pin_sb and tryget_live failure indicate that the root is
+* being destroyed.  Wait for destruction to complete so that
+* the subsystems are free.  We can use wait_queue for the
+* wait but this path is super cold.  Let's just sleep for
+* a bit and retry.
 */
-   if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+   sb = kernfs_pin_sb(root->kf_root, NULL);
+   if (IS_ERR(sb) ||
+   !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
    mutex_unlock(&cgroup_mutex);
+   if (!IS_ERR_OR_NULL(sb))
+   deactivate_super(sb);
msleep(10);
ret = restart_syscall();
goto out_free;
@@ -1796,6 +1801,17 @@ out_free:
dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+
+   if (sb) {
+   /*
+* On success kernfs_mount() returns with sb->s_umount held,
+* but kernfs_mount() also increases the superblock's refcnt,
+* so calling deactivate_super() to drop the refcnt we got when
+* looking up cgroup root won't acquire sb->s_umount again.
+*/
+   WARN_ON(new_sb);
+   deactivate_super(sb);
+   }
return dentry;
 }
 
-- 
1.8.0.2
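
Why the deactivate_super() above won't take sb->s_umount on this path: a
simplified rendering of deactivate_super() from fs/super.c of that era
(editorial sketch, for reference):

	void deactivate_super(struct super_block *s)
	{
		/* s_umount is only acquired when dropping the *last*
		 * active reference */
		if (!atomic_add_unless(&s->s_active, -1, 1)) {
			down_write(&s->s_umount);
			deactivate_locked_super(s);
		}
	}

Since kernfs_mount() took its own s_active reference on success, the
reference dropped here is never the last one, so s_umount is untouched.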

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-27 Thread Li Zefan
  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /mnt busy
  mount: according to mtab, xxx is already mounted on /mnt

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..ae2b382 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i, j = -1;
bool new_sb;
 
/*
@@ -1677,6 +1679,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)))
+   continue;
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/5] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-27 Thread Li Zefan
On 2014/6/25 23:00, Tejun Heo wrote:
> Hey,
> 
> On Wed, Jun 25, 2014 at 09:56:31AM +0800, Li Zefan wrote:
>>> Hmmm?  Why does that matter?  The only region in cgroup_mount() which
>>> needs to be put inside such mutex would be root lookup, no?
>>
>> unfortunately that won't help. I think what you suggest is:
>>
>> cgroup_mount()
>> {
>>  mutex_lock();
>>  lookup_cgroup_root();
>>  mutex_unlock();
>>  kernfs_mount();
>> }
>>
>> cgroup_kill_sb()
>> {
>>  mutex_lock();
>>  percpu_ref_kill();
>>  mutex_unlock();
>>  kernfs_kill_sb();
>> }
>>
>> See, we may still be destroying the superblock after we've succeeded
>> in getting the refcnt of cgroup root.
> 
> Sure, but now the decision to kill is synchronized so the other side
> can interlock with it.  e.g.
> 
> cgroup_mount()
> {
>   mutex_lock();
>   lookup_cgroup_root();
>   if (root isn't killed yet)
>   root->this_better_stay_alive++;
>   mutex_unlock();
>   kernfs_mount();
> }
> 
> cgroup_kill_sb()
> {
>   mutex_lock();
>   if (check whether root can be killed)
>   percpu_ref_kill();
>   mutex_unlock();
>   if (the above condition was true)
>   kernfs_kill_sb();
> }
> 

This looks nasty, and I don't think it's correct. If we skip the call
to kernfs_kill_sb(), kernfs_super_info won't be freed but super_block
will, so we will end up either leaking memory or accessing invalid
memory. There are other problems like returning with sb->s_umount still
held.
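
[For reference, a simplified sketch of kernfs_kill_sb() as it looked at the
time (editorial; details approximate). Its tail is visible in the patch 2/3
diff context above, and skipping it would leak the kernfs_super_info that
only it frees:]

	void kernfs_kill_sb(struct super_block *sb)
	{
		struct kernfs_super_info *info = kernfs_info(sb);
		struct kernfs_node *root_kn = sb->s_root->d_fsdata;

		mutex_lock(&kernfs_mutex);
		list_del(&info->node);
		mutex_unlock(&kernfs_mutex);

		/* the superblock itself is freed by the VFS; the
		 * kernfs_super_info must be freed here */
		kill_anon_super(sb);
		kfree(info);
		kernfs_put(root_kn);
	}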

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

