From: Vladimir Davydov <[email protected]>
Subject: mm: memcontrol: fix possible memcg leak due to interrupted reclaim
Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break() once
enough pages have been reclaimed, in which case, in contrast to a full
round-trip over a cgroup sub-tree, the current position stored in
mem_cgroup_reclaim_iter of the target cgroup does not get invalidated and
so is left holding the reference to the last scanned cgroup. If the
target cgroup does not get scanned again (we might have just reclaimed the
last page or all processes might exit and free their memory voluntary), we
will leak it, because there is nobody to put the reference held by the
iterator.
The problem is easy to reproduce by running the following command sequence
in a loop:
mkdir /sys/fs/cgroup/memory/test
echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
memhog 150M
echo $$ > /sys/fs/cgroup/memory/cgroup.procs
rmdir test
The cgroups generated by it will never get freed.
This patch fixes this issue by making mem_cgroup_iter avoid taking
reference to the current position. In order not to hit use-after-free bug
while running reclaim in parallel with cgroup deletion, we make use of
->css_released cgroup callback to clear references to the dying cgroup in
all reclaim iterators that might refer to it. This callback is called
right before scheduling rcu work which will free css, so if we access
iter->position from rcu read section, we might be sure it won't go away
under us.
[[email protected]: clean up css ref handling]
Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css
refcounting")
Signed-off-by: Vladimir Davydov <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Cc: <[email protected]> [3.19+]
Signed-off-by: Andrew Morton <[email protected]>
---
mm/memcontrol.c | 60 +++++++++++++++++++++++++++++++++++-----------
1 file changed, 46 insertions(+), 14 deletions(-)
diff -puN
mm/memcontrol.c~mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim
mm/memcontrol.c
---
a/mm/memcontrol.c~mm-memcontrol-fix-possible-memcg-leak-due-to-interrupted-reclaim
+++ a/mm/memcontrol.c
@@ -903,14 +903,20 @@ struct mem_cgroup *mem_cgroup_iter(struc
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
- do {
+ while (1) {
pos = READ_ONCE(iter->position);
+ if (!pos || css_tryget(&pos->css))
+ break;
/*
- * A racing update may change the position and
- * put the last reference, hence css_tryget(),
- * or retry to see the updated position.
+ * css reference reached zero, so iter->position will
+ * be cleared by ->css_released. However, we should not
+ * rely on this happening soon, because ->css_released
+ * is called from a work queue, and by busy-waiting we
+ * might block it. So we clear iter->position right
+ * away.
*/
- } while (pos && !css_tryget(&pos->css));
+ (void)cmpxchg(&iter->position, pos, NULL);
+ }
}
if (pos)
@@ -956,17 +962,13 @@ struct mem_cgroup *mem_cgroup_iter(struc
}
if (reclaim) {
- if (cmpxchg(&iter->position, pos, memcg) == pos) {
- if (memcg)
- css_get(&memcg->css);
- if (pos)
- css_put(&pos->css);
- }
-
/*
- * pairs with css_tryget when dereferencing iter->position
- * above.
+ * The position could have already been updated by a competing
+ * thread, so check that the value hasn't changed since we read
+ * it to avoid reclaiming from the same cgroup twice.
*/
+ (void)cmpxchg(&iter->position, pos, memcg);
+
if (pos)
css_put(&pos->css);
@@ -999,6 +1001,28 @@ void mem_cgroup_iter_break(struct mem_cg
css_put(&prev->css);
}
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup_reclaim_iter *iter;
+ struct mem_cgroup_per_zone *mz;
+ int nid, zid;
+ int i;
+
+ while ((memcg = parent_mem_cgroup(memcg))) {
+ for_each_node(nid) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
+ }
+ }
+ }
+ }
+}
+
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -4324,6 +4348,13 @@ static void mem_cgroup_css_offline(struc
wb_memcg_offline(memcg);
}
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ invalidate_reclaim_iterators(memcg);
+}
+
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5185,6 +5216,7 @@ struct cgroup_subsys memory_cgrp_subsys
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
+ .css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
_
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html