Re: [PATCH stable 3.10] mm: memcontrol: factor out reclaim iterator loading and updating

2017-07-27 Thread Michal Hocko
On Thu 27-07-17 21:59:06, Wenwei Tao wrote:
> From: Johannes Weiner 
> 
> commit 519ebea3bf6df45439e79c54bda1d9e29fe13a64 upstream
> 
> It turned out that this is actually a bug fix which prevents a double
> css_put on last_visited memcg which results in kernel BUG at 
> kernel/cgroup.c:893!
> See http://lkml.kernel.org/r/20170726130742.5976-1-wenwei@gmail.com
> for more details.
> 
> mem_cgroup_iter() is too hard to follow.  Factor out the lockless reclaim
> iterator loading and updating so it's easier to follow the big picture.
> 
> Also document the iterator invalidation mechanism a bit more extensively.
> 
> Signed-off-by: Johannes Weiner 
> Reported-by: Tejun Heo 
> Reviewed-by: Tejun Heo 
> Acked-by: Michal Hocko 
> Cc: KAMEZAWA Hiroyuki 
> Cc: Glauber Costa 
> Signed-off-by: Andrew Morton 
> Signed-off-by: Linus Torvalds 
> [wt: Backported to linux-3.10.y: adjusted context]
> Signed-off-by: Wenwei Tao 

Yes, this looks OK and rebases properly on top of the ecc736fc3c71 ("memcg:
fix endless loop caused by mem_cgroup_iter") backport.

Thanks!
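
As a reading aid, the reference-count contract that the refactoring below
restores can be modelled in a few lines of userspace C. This is a
hypothetical sketch, not kernel code: try_get_ref()/put_ref() stand in for
css_tryget()/css_put(), and the assert plays the role of the BUG at
kernel/cgroup.c:893 that the double css_put on last_visited used to trip.

/* Hypothetical model of the css reference-count contract; not kernel code. */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct css { atomic_int refcnt; };	/* stands in for cgroup_subsys_state */

/* Like css_tryget(): take a reference only if the object is still live. */
static bool try_get_ref(struct css *c)
{
	int old = atomic_load(&c->refcnt);

	while (old > 0)
		if (atomic_compare_exchange_weak(&c->refcnt, &old, old + 1))
			return true;
	return false;	/* lost the race against the final put */
}

/* Like css_put(): dropping more references than were taken is a bug. */
static void put_ref(struct css *c)
{
	int old = atomic_fetch_sub(&c->refcnt, 1);

	assert(old > 0);	/* analogue of the BUG at kernel/cgroup.c:893 */
}

int main(void)
{
	struct css c = { .refcnt = 1 };

	if (try_get_ref(&c))	/* load side: one successful tryget... */
		put_ref(&c);	/* ...pairs with exactly one put */
	put_ref(&c);		/* drop the initial reference */
	/*
	 * A second put_ref(&c) here would underflow the count and trip
	 * the assert -- the same imbalance as the double css_put.
	 */
	printf("balanced\n");
	return 0;
}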

> ---
>  mm/memcontrol.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
>  1 file changed, 68 insertions(+), 29 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 437ae2c..fcde430 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1149,6 +1149,68 @@ skip_node:
>  	return NULL;
>  }
>  
> +static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
> +{
> +	/*
> +	 * When a group in the hierarchy below root is destroyed, the
> +	 * hierarchy iterator can no longer be trusted since it might
> +	 * have pointed to the destroyed group.  Invalidate it.
> +	 */
> +	atomic_inc(&root->dead_count);
> +}
> +
> +static struct mem_cgroup *
> +mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
> +		     struct mem_cgroup *root,
> +		     int *sequence)
> +{
> +	struct mem_cgroup *position = NULL;
> +	/*
> +	 * A cgroup destruction happens in two stages: offlining and
> +	 * release.  They are separated by a RCU grace period.
> +	 *
> +	 * If the iterator is valid, we may still race with an
> +	 * offlining.  The RCU lock ensures the object won't be
> +	 * released, tryget will fail if we lost the race.
> +	 */
> +	*sequence = atomic_read(&root->dead_count);
> +	if (iter->last_dead_count == *sequence) {
> +		smp_rmb();
> +		position = iter->last_visited;
> +
> +		/*
> +		 * We cannot take a reference to root because we might race
> +		 * with root removal and returning NULL would end up in
> +		 * an endless loop on the iterator user level when root
> +		 * would be returned all the time.
> +		 */
> +		if (position && position != root &&
> +		    !css_tryget(&position->css))
> +			position = NULL;
> +	}
> +	return position;
> +}
> +
> +static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
> +				   struct mem_cgroup *last_visited,
> +				   struct mem_cgroup *new_position,
> +				   struct mem_cgroup *root,
> +				   int sequence)
> +{
> +	/* root reference counting symmetric to mem_cgroup_iter_load */
> +	if (last_visited && last_visited != root)
> +		css_put(&last_visited->css);
> +	/*
> +	 * We store the sequence count from the time @last_visited was
> +	 * loaded successfully instead of rereading it here so that we
> +	 * don't lose destruction events in between.  We could have
> +	 * raced with the destruction of @new_position after all.
> +	 */
> +	iter->last_visited = new_position;
> +	smp_wmb();
> +	iter->last_dead_count = sequence;
> +}
> +
>  /**
>   * mem_cgroup_iter - iterate over memory cgroup hierarchy
>   * @root: hierarchy root
> @@ -1172,7 +1234,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  {
>  	struct mem_cgroup *memcg = NULL;
>  	struct mem_cgroup *last_visited = NULL;
> -	unsigned long uninitialized_var(dead_count);
>  
>  	if (mem_cgroup_disabled())
>  		return NULL;
> @@ -1192,6 +1253,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  	rcu_read_lock();
>  	while (!memcg) {
>  		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> +		int uninitialized_var(seq);
>  
>  		if (reclaim) {
>  			int nid = zone_to_nid(reclaim->zone);
> @@ -1205,37 +1267,14 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  				goto out_unlock;
>  			}
>  
> -			/*
> -			 * If the dead_count mismatches, a destruction
> -			 * has happened or is happening concurrently.
> -			 * If the dead_count matches, a destruction
> -			 * might still happen concurrently, but since
> -			 * we checked under RCU, that destruction
> -			 * won't free the object until we release the
> -			 * RCU reader lock.  Thus, the dead_count
> -			 * check verifies the pointer is still valid,
> -			 * css_tryget() verifies the cgroup pointed to
> -			 * is alive.
> -			 */
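
For completeness, the publish/validate ordering that mem_cgroup_iter_load()
and mem_cgroup_iter_update() rely on can be sketched in userspace C11. This
is a hypothetical model, not the kernel code: the smp_wmb()/smp_rmb() pair is
expressed as release/acquire fences, dead_count is the invalidation sequence,
and all names are invented. The point it demonstrates is that the cached
pointer is published before the sequence number that validates it, so a
reader that observes a matching sequence also observes the pointer that was
stored with it.

/* Hypothetical model of the iterator's publish/validate scheme. */
#include <stdatomic.h>
#include <stddef.h>

struct iter {
	void *last_visited;		/* cached position (plain field) */
	atomic_int last_dead_count;	/* sequence it was stored under */
};

static atomic_int dead_count;		/* bumped to invalidate all caches */

/* mem_cgroup_iter_invalidate() analogue: one bump stales every cache. */
static void invalidate(void)
{
	atomic_fetch_add(&dead_count, 1);
}

/* mem_cgroup_iter_update() analogue: publish pointer, then sequence. */
static void update(struct iter *it, void *pos, int seq)
{
	it->last_visited = pos;
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	atomic_store_explicit(&it->last_dead_count, seq,
			      memory_order_relaxed);
}

/* mem_cgroup_iter_load() analogue: validate sequence, then read pointer. */
static void *load(struct iter *it, int *seq)
{
	void *pos = NULL;

	*seq = atomic_load(&dead_count);
	if (atomic_load_explicit(&it->last_dead_count,
				 memory_order_relaxed) == *seq) {
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
		pos = it->last_visited;
		/* the real code additionally css_tryget()s pos here */
	}
	return pos;
}

int main(void)
{
	struct iter it = { 0 };
	int seq, dummy;

	update(&it, &dummy, atomic_load(&dead_count));
	invalidate();			/* stales the cached pointer */
	return load(&it, &seq) != NULL;	/* mismatch -> NULL -> exit 0 */
}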
