[PATCH v11 12/15] memcg: allow kmem limit to be resized down

2013-10-24 Thread Vladimir Davydov
From: Glauber Costa 

The userspace memory limit can be freely resized down: upon such an
attempt, reclaim is called to flush pages away until we either reach
the limit we want or give up.

So far this was not possible with the kmem limit, since we had no way
to shrink the kmem buffers other than the big hammer of shrink_slab,
which frees data from all over the system.

The situation flips now that we have a per-memcg shrinker
infrastructure. We will proceed analogously to our user memory
counterpart and try to shrink our buffers until we either reach the
limit we want or give up.
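
The logic boils down to a "try, reclaim, retry" loop. A minimal
userspace model of that strategy follows; try_set_limit(), shrink_some()
and the numbers are made-up illustrations, not kernel API:

#include <stdio.h>
#include <stdbool.h>

static unsigned long usage = 100;       /* toy stand-in for the kmem res_counter */
static unsigned long limit = 200;

static int try_set_limit(unsigned long val)
{
        if (val < usage)                /* like res_counter_set_limit(): fail if */
                return -1;              /* the new limit is below current usage  */
        limit = val;
        return 0;
}

static bool shrink_some(void)           /* stand-in for try_to_free_mem_cgroup_kmem() */
{
        if (usage < 10)
                return false;           /* nothing left to flush */
        usage -= 10;
        return true;
}

static int resize_down(unsigned long val)
{
        for (;;) {
                if (!try_set_limit(val))
                        return 0;       /* reached the limit we want */
                if (!shrink_some())
                        return -1;      /* can't free anything, give up */
        }
}

int main(void)
{
        printf("resize to 50: %d, usage now %lu, limit %lu\n",
               resize_down(50), usage, limit);
        return 0;
}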

Signed-off-by: Glauber Costa 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Kamezawa Hiroyuki 
---
 mm/memcontrol.c |   43 ++-
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 03178d0..7bf4dc7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5574,10 +5574,39 @@ static ssize_t mem_cgroup_read(struct 
cgroup_subsys_state *css,
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * This is slightly different than res or memsw reclaim.  We already have
+ * vmscan behind us to drive the reclaim, so we can basically keep trying until
+ * all buffers that can be flushed are flushed. We have a very clear signal
+ * about it in the form of the return value of try_to_free_mem_cgroup_kmem.
+ */
+static int mem_cgroup_resize_kmem_limit(struct mem_cgroup *memcg,
+   unsigned long long val)
+{
+   int ret = -EBUSY;
+
+   for (;;) {
+   if (signal_pending(current)) {
+   ret = -EINTR;
+   break;
+   }
+
+   ret = res_counter_set_limit(&memcg->kmem, val);
+   if (!ret)
+   break;
+
+   /* Can't free anything, pointless to continue */
+   if (!try_to_free_mem_cgroup_kmem(memcg, GFP_KERNEL))
+   break;
+   }
+
+   return ret;
+}
+
 static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
 {
int ret = -EINVAL;
-#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/*
 * For simplicity, we won't allow this to be disabled.  It also can't
@@ -5612,16 +5641,15 @@ static int memcg_update_kmem_limit(struct 
cgroup_subsys_state *css, u64 val)
 * starts accounting before all call sites are patched
 */
memcg_kmem_set_active(memcg);
-   } else
-   ret = res_counter_set_limit(&memcg->kmem, val);
+   } else {
+   ret = mem_cgroup_resize_kmem_limit(memcg, val);
+   }
 out:
mutex_unlock(&set_limit_mutex);
mutex_unlock(&memcg_create_mutex);
-#endif
return ret;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
int ret = 0;
@@ -5658,6 +5686,11 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 out:
return ret;
 }
+#else
+static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
+{
+   return -EINVAL;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 /*
-- 
1.7.10.4



[PATCH v11 09/15] memcg,list_lru: add per-memcg LRU list infrastructure

2013-10-24 Thread Vladimir Davydov
FS-shrinkers, which shrink dcaches and icaches, keep dentries and inodes
in list_lru structures in order to evict least recently used objects.
With the per-memcg kmem shrinking infrastructure introduced, we have to
make those LRU lists per-memcg in order to allow shrinking FS caches
that belong to different memory cgroups independently.

This patch addresses the issue by introducing struct memcg_list_lru.
This struct aggregates list_lru objects for each kmem-active memcg and
keeps them up to date whenever a memcg is created or destroyed. Its
interface is very simple: it only allows getting the pointer to the
appropriate list_lru object for a memcg or a kmem pointer; the returned
object is then operated on with the conventional list_lru methods.
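
For illustration, this is how a memcg-aware cache is expected to use the
interface; the dentry_lru variable and the call site below are
hypothetical sketches, not part of this patch (only struct
memcg_list_lru, mem_cgroup_kmem_list_lru() and the existing
list_lru_add() come from this series):

static struct memcg_list_lru dentry_lru;        /* hypothetical memcg-aware LRU */

static void dentry_lru_add(struct dentry *dentry)
{
        struct list_lru *lru;

        /* pick the list_lru of the memcg the dentry's kmem is accounted to */
        lru = mem_cgroup_kmem_list_lru(&dentry_lru, dentry);
        list_lru_add(lru, &dentry->d_lru);
}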

Signed-off-by: Vladimir Davydov 
Cc: Glauber Costa 
Cc: Dave Chinner 
Cc: Mel Gorman 
Cc: Rik van Riel 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Hugh Dickins 
Cc: Kamezawa Hiroyuki 
Cc: Andrew Morton 
---
 include/linux/list_lru.h |   56 +++
 mm/memcontrol.c  |  251 --
 2 files changed, 301 insertions(+), 6 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 3ce5417..b3b3b86 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -10,6 +10,8 @@
 #include 
 #include 
 
+struct mem_cgroup;
+
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
LRU_REMOVED,/* item removed from list */
@@ -31,6 +33,27 @@ struct list_lru {
nodemask_t  active_nodes;
 };
 
+struct memcg_list_lru {
+   struct list_lru global_lru;
+
+#ifdef CONFIG_MEMCG_KMEM
+   struct list_lru **memcg_lrus;   /* rcu-protected array of per-memcg
+  lrus, indexed by memcg_cache_id() */
+
+   struct list_head list;  /* list of all memcg-aware lrus */
+
+   /*
+* The memcg_lrus array is rcu protected, so we can only free it after
+* a call to synchronize_rcu(). To avoid multiple calls to
+* synchronize_rcu() when many lrus get updated at the same time, which
+* is a typical scenario, we will store the pointer to the previous
+* version of the array in the old_lrus variable for each lru, and then
+* free them all at once after a single call to synchronize_rcu().
+*/
+   void *old_lrus;
+#endif
+};
+
 void list_lru_destroy(struct list_lru *lru);
 int list_lru_init(struct list_lru *lru);
 
@@ -128,4 +151,37 @@ list_lru_walk(struct list_lru *lru, list_lru_walk_cb 
isolate,
}
return isolated;
 }
+
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_list_lru_init(struct memcg_list_lru *lru);
+void memcg_list_lru_destroy(struct memcg_list_lru *lru);
+
+struct list_lru *
+mem_cgroup_list_lru(struct memcg_list_lru *lru, struct mem_cgroup *memcg);
+struct list_lru *
+mem_cgroup_kmem_list_lru(struct memcg_list_lru *lru, void *ptr);
+#else
+static inline int memcg_list_lru_init(struct memcg_list_lru *lru)
+{
+   return list_lru_init(&lru->global_lru);
+}
+
+static inline void memcg_list_lru_destroy(struct memcg_list_lru *lru)
+{
+   list_lru_destroy(&lru->global_lru);
+}
+
+static inline struct list_lru *
+mem_cgroup_list_lru(struct memcg_list_lru *lru, struct mem_cgroup *memcg)
+{
+   return &lru->global_lru;
+}
+
+static inline struct list_lru *
+mem_cgroup_kmem_list_lru(struct memcg_list_lru *lru, void *ptr)
+{
+   return &lru->global_lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
 #endif /* _LRU_LIST_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f5a777..39e4772 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -54,6 +54,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "internal.h"
 #include 
 #include 
@@ -3233,6 +3234,8 @@ void memcg_cache_list_add(struct mem_cgroup *memcg, 
struct kmem_cache *cachep)
mutex_unlock(&memcg->slab_caches_mutex);
 }
 
+static int memcg_update_all_lrus(int num_groups);
+
 /*
  * This ends up being protected by the set_limit mutex, during normal
  * operation, because that is its main call site.
@@ -3257,15 +3260,28 @@ int memcg_update_cache_sizes(struct mem_cgroup *memcg)
 */
memcg_kmem_set_activated(memcg);
 
-   ret = memcg_update_all_caches(num+1);
-   if (ret) {
-   ida_simple_remove(&kmem_limited_groups, num);
-   memcg_kmem_clear_activated(memcg);
-   return ret;
-   }
+   /*
+* We need to update the memcg lru lists before we update the caches.
+* Once the caches are updated, they will be able to start hosting
+* objects. If a cache is created very quickly and an element is used
+* and disposed to the lru quickly as well, we can end up with a NULL
+* pointer dereference while trying to add a new element to a memcg
+* lru.
+*/
+   ret = memcg_update_all_lrus(num + 1);
+  

[PATCH v11 07/15] memcg: scan cache objects hierarchically

2013-10-24 Thread Vladimir Davydov
From: Glauber Costa 

When reaching shrink_slab, we should descend into child memcgs searching
for objects that could be shrunk. This is true even if the memcg does
not have a kmem limit set, since the kmem res_counter is also billed
against the user res_counter of the parent.

It is possible that we will free objects without freeing any pages,
which would just harm the child groups without helping the parent group
at all. But at this point, we are basically prepared to pay that price.

Signed-off-by: Glauber Costa 
Cc: Dave Chinner 
Cc: Mel Gorman 
Cc: Rik van Riel 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Hugh Dickins 
Cc: Kamezawa Hiroyuki 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |6 
 mm/memcontrol.c|   13 +
 mm/vmscan.c|   65 
 3 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d16ba51..a513fad 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -488,6 +488,7 @@ static inline bool memcg_kmem_enabled(void)
return static_key_false(&memcg_kmem_enabled_key);
 }
 
+bool memcg_kmem_should_reclaim(struct mem_cgroup *memcg);
 bool memcg_kmem_is_active(struct mem_cgroup *memcg);
 
 /*
@@ -624,6 +625,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 }
 #else
 
+static inline bool memcg_kmem_should_reclaim(struct mem_cgroup *memcg)
+{
+   return false;
+}
+
 static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f5b9921..2f5a777 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2972,6 +2972,19 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup 
*memcg,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+bool memcg_kmem_should_reclaim(struct mem_cgroup *memcg)
+{
+   struct mem_cgroup *iter;
+
+   for_each_mem_cgroup_tree(iter, memcg) {
+   if (memcg_kmem_is_active(iter)) {
+   mem_cgroup_iter_break(memcg, iter);
+   return true;
+   }
+   }
+   return false;
+}
+
 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
 {
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cdfc364..36fc133 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -149,7 +149,7 @@ static bool global_reclaim(struct scan_control *sc)
 static bool has_kmem_reclaim(struct scan_control *sc)
 {
return !sc->target_mem_cgroup ||
-   memcg_kmem_is_active(sc->target_mem_cgroup);
+   memcg_kmem_should_reclaim(sc->target_mem_cgroup);
 }
 
 static unsigned long
@@ -360,12 +360,35 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct 
shrinker *shrinker,
  *
  * Returns the number of slab objects which we shrunk.
  */
+static unsigned long
+shrink_slab_one(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+   unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+   unsigned long freed = 0;
+
+   for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+   if (!node_online(shrinkctl->nid))
+   continue;
+
+   if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+   (shrinkctl->nid != 0))
+   break;
+
+   freed += shrink_slab_node(shrinkctl, shrinker,
+nr_pages_scanned, lru_pages);
+
+   }
+
+   return freed;
+}
+
 unsigned long shrink_slab(struct shrink_control *shrinkctl,
  unsigned long nr_pages_scanned,
  unsigned long lru_pages)
 {
struct shrinker *shrinker;
unsigned long freed = 0;
+   struct mem_cgroup *root = shrinkctl->target_mem_cgroup;
 
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
@@ -390,19 +413,39 @@ unsigned long shrink_slab(struct shrink_control 
*shrinkctl,
if (shrinkctl->target_mem_cgroup &&
!(shrinker->flags & SHRINKER_MEMCG_AWARE))
continue;
+   /*
+* In a hierarchical chain, it might be that not all memcgs are
+* kmem active. kmemcg design mandates that when one memcg is
+* active, its children will be active as well. But it is
+* perfectly possible that its parent is not.
+*
+* We also need to make sure we scan at least once, for the
+* global case. So if we don't have a target memcg (saved in
+* root), we proceed normally and expect to break in the next
+* round.
+*/
+   do {
+   struct mem_cgroup *memcg = shrinkctl->target_mem_cgroup;
 
-   for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) 

[PATCH v11 10/15] memcg,list_lru: add function walking over all lists of a per-memcg LRU

2013-10-24 Thread Vladimir Davydov
Sometimes it is necessary to iterate over all memcgs' lists of the same
memcg-aware LRU. For example, shrink_dcache_sb() should prune all
dentries no matter what memory cgroup they belong to. The current
interface to struct memcg_list_lru, however, only allows per-memcg LRU
walks. This patch adds the special method memcg_list_lru_walk_all(),
which provides the required functionality. Note that this function does
not guarantee that the elements will be processed in true
least-recently-used order; in fact, it simply enumerates all kmem-active
memcgs and calls list_lru_walk() for each of them. shrink_dcache_sb(),
which is going to be the only user of this function, does not need that
guarantee.
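
A sketch of the intended use, modelled on what shrink_dcache_sb() would
do; the callback and the dispose-list handling are simplified
illustrations, not the real dcache code:

static enum lru_status dentry_prune_one(struct list_head *item,
                                        spinlock_t *lock, void *cb_arg)
{
        struct list_head *dispose = cb_arg;

        /* move the dentry to a private dispose list; the caller frees it later */
        list_move(item, dispose);
        return LRU_REMOVED;
}

static void prune_all_dentries(struct memcg_list_lru *lru)
{
        LIST_HEAD(dispose);

        /* walks the global list and every kmem-active memcg's list */
        memcg_list_lru_walk_all(lru, dentry_prune_one, &dispose, ULONG_MAX);
        /* ... then dispose of everything on the private list ... */
}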

Signed-off-by: Vladimir Davydov 
Cc: Glauber Costa 
Cc: Dave Chinner 
Cc: Mel Gorman 
Cc: Rik van Riel 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Hugh Dickins 
Cc: Kamezawa Hiroyuki 
Cc: Andrew Morton 
---
 include/linux/list_lru.h |   21 ++
 mm/memcontrol.c  |   55 ++
 2 files changed, 76 insertions(+)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index b3b3b86..ce815cc 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -40,6 +40,16 @@ struct memcg_list_lru {
struct list_lru **memcg_lrus;   /* rcu-protected array of per-memcg
   lrus, indexed by memcg_cache_id() */
 
+   /*
+* When a memory cgroup is removed, all pointers to its list_lru
+* objects stored in memcg_lrus arrays are first marked as dead by
+* setting the lowest bit of the address while the actual data free
+* happens only after an rcu grace period. If a memcg_lrus reader,
+* which should be rcu-protected, faces a dead pointer, it won't
+* dereference it. This ensures there will be no use-after-free.
+*/
+#define MEMCG_LIST_LRU_DEAD1
+
struct list_head list;  /* list of all memcg-aware lrus */
 
/*
@@ -160,6 +170,10 @@ struct list_lru *
 mem_cgroup_list_lru(struct memcg_list_lru *lru, struct mem_cgroup *memcg);
 struct list_lru *
 mem_cgroup_kmem_list_lru(struct memcg_list_lru *lru, void *ptr);
+
+unsigned long
+memcg_list_lru_walk_all(struct memcg_list_lru *lru, list_lru_walk_cb isolate,
+   void *cb_arg, unsigned long nr_to_walk);
 #else
 static inline int memcg_list_lru_init(struct memcg_list_lru *lru)
 {
@@ -182,6 +196,13 @@ mem_cgroup_kmem_list_lru(struct memcg_list_lru *lru, void 
*ptr)
 {
return &lru->global_lru;
 }
+
+static inline unsigned long
+memcg_list_lru_walk_all(struct memcg_list_lru *lru, list_lru_walk_cb isolate,
+   void *cb_arg, unsigned long nr_to_walk)
+{
+   return list_lru_walk(&lru->global_lru, isolate, cb_arg, nr_to_walk);
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #endif /* _LRU_LIST_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 39e4772..03178d0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3899,16 +3899,30 @@ static int alloc_memcg_lru(struct memcg_list_lru *lru, 
int memcg_id)
return err;
}
 
+   smp_wmb();
VM_BUG_ON(lru->memcg_lrus[memcg_id]);
lru->memcg_lrus[memcg_id] = memcg_lru;
return 0;
 }
 
+static void memcg_lru_mark_dead(struct memcg_list_lru *lru, int memcg_id)
+{
+   struct list_lru *memcg_lru;
+   
+   BUG_ON(!lru->memcg_lrus);
+   memcg_lru = lru->memcg_lrus[memcg_id];
+   if (memcg_lru)
+   lru->memcg_lrus[memcg_id] = (void *)((unsigned long)memcg_lru |
+MEMCG_LIST_LRU_DEAD);
+}
+
 static void free_memcg_lru(struct memcg_list_lru *lru, int memcg_id)
 {
struct list_lru *memcg_lru = NULL;
 
swap(lru->memcg_lrus[memcg_id], memcg_lru);
+   memcg_lru = (void *)((unsigned long)memcg_lru &
+~MEMCG_LIST_LRU_DEAD);
if (memcg_lru) {
list_lru_destroy(memcg_lru);
kfree(memcg_lru);
@@ -3942,6 +3956,17 @@ static void __memcg_destroy_all_lrus(int memcg_id)
 {
struct memcg_list_lru *lru;
 
+   /*
+* Mark all lru lists of this memcg as dead and free them only after a
+* grace period. This is to prevent functions iterating over memcg_lrus
+* arrays (e.g. memcg_list_lru_walk_all()) from dereferencing pointers
+* pointing to already freed data.
+*/
+   list_for_each_entry(lru, &memcg_lrus_list, list)
+   memcg_lru_mark_dead(lru, memcg_id);
+
+   synchronize_rcu();
+
list_for_each_entry(lru, &memcg_lrus_list, list)
free_memcg_lru(lru, memcg_id);
 }
@@ -4103,6 +4128,36 @@ mem_cgroup_kmem_list_lru(struct memcg_list_lru *lru, 
void *ptr)
}
return mem_cgroup_list_lru(lru, memcg);
 }
+
+unsigned long
+memcg_list_lru_walk_al

[PATCH v11 05/15] memcg: move stop and resume accounting functions

2013-10-24 Thread Vladimir Davydov
From: Glauber Costa 

I need to move this up a bit, and I am doing it in a separate patch just to
reduce churn in the patch that needs it.

Signed-off-by: Glauber Costa 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Hugh Dickins 
Cc: Kamezawa Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   62 +++
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5104d1f..bb38596 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2987,6 +2987,37 @@ static struct kmem_cache *memcg_params_to_cache(struct 
memcg_cache_params *p)
return cachep->memcg_params->memcg_caches[memcg_cache_idx(p->memcg)];
 }
 
+/*
+ * During the creation a new cache, we need to disable our accounting mechanism
+ * altogether. This is true even if we are not creating, but rather just
+ * enqueing new caches to be created.
+ *
+ * This is because that process will trigger allocations; some visible, like
+ * explicit kmallocs to auxiliary data structures, name strings and internal
+ * cache structures; some well concealed, like INIT_WORK() that can allocate
+ * objects during debug.
+ *
+ * If any allocation happens during memcg_kmem_get_cache, we will recurse back
+ * to it. This may not be a bounded recursion: since the first cache creation
+ * failed to complete (waiting on the allocation), we'll just try to create the
+ * cache again, failing at the same point.
+ *
+ * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
+ * memcg_kmem_skip_account. So we enclose anything that might allocate memory
+ * inside the following two functions.
+ */
+static inline void memcg_stop_kmem_account(void)
+{
+   VM_BUG_ON(!current->mm);
+   current->memcg_kmem_skip_account++;
+}
+
+static inline void memcg_resume_kmem_account(void)
+{
+   VM_BUG_ON(!current->mm);
+   current->memcg_kmem_skip_account--;
+}
+
 #ifdef CONFIG_SLABINFO
 static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
@@ -3262,37 +3293,6 @@ out:
kfree(s->memcg_params);
 }
 
-/*
- * During the creation a new cache, we need to disable our accounting mechanism
- * altogether. This is true even if we are not creating, but rather just
- * enqueing new caches to be created.
- *
- * This is because that process will trigger allocations; some visible, like
- * explicit kmallocs to auxiliary data structures, name strings and internal
- * cache structures; some well concealed, like INIT_WORK() that can allocate
- * objects during debug.
- *
- * If any allocation happens during memcg_kmem_get_cache, we will recurse back
- * to it. This may not be a bounded recursion: since the first cache creation
- * failed to complete (waiting on the allocation), we'll just try to create the
- * cache again, failing at the same point.
- *
- * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
- * memcg_kmem_skip_account. So we enclose anything that might allocate memory
- * inside the following two functions.
- */
-static inline void memcg_stop_kmem_account(void)
-{
-   VM_BUG_ON(!current->mm);
-   current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
-   VM_BUG_ON(!current->mm);
-   current->memcg_kmem_skip_account--;
-}
-
 static void kmem_cache_destroy_work_func(struct work_struct *w)
 {
struct kmem_cache *cachep;
-- 
1.7.10.4



[PATCH v11 02/15] memcg: consolidate callers of memcg_cache_id

2013-10-24 Thread Vladimir Davydov
From: Glauber Costa 

Each caller of memcg_cache_id ends up sanitizing its parameters in its
own way. Now that memcg_cache_id itself is more robust, we can
consolidate this.

Also, as suggested by Michal, a special helper memcg_cache_idx is used
when the result is expected to be used directly as an array index, to
make sure we never access a negative index.

Signed-off-by: Glauber Costa 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Kamezawa Hiroyuki 
---
 mm/memcontrol.c |   49 +
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0712277..0a5cc30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2937,6 +2937,30 @@ static inline bool memcg_can_account_kmem(struct 
mem_cgroup *memcg)
 }
 
 /*
+ * helper for acessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+int memcg_cache_id(struct mem_cgroup *memcg)
+{
+   if (!memcg || !memcg_can_account_kmem(memcg))
+   return -1;
+   return memcg->kmemcg_id;
+}
+
+/*
+ * This helper around memcg_cache_id is not intented for use outside memcg
+ * core. It is meant for places where the cache id is used directly as an array
+ * index
+ */
+static int memcg_cache_idx(struct mem_cgroup *memcg)
+{
+   int ret = memcg_cache_id(memcg);
+   BUG_ON(ret < 0);
+   return ret;
+}
+
+/*
  * This is a bit cumbersome, but it is rarely used and avoids a backpointer
  * in the memcg_cache_params struct.
  */
@@ -2946,7 +2970,7 @@ static struct kmem_cache *memcg_params_to_cache(struct 
memcg_cache_params *p)
 
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
-   return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+   return cachep->memcg_params->memcg_caches[memcg_cache_idx(p->memcg)];
 }
 
 #ifdef CONFIG_SLABINFO
@@ -3051,18 +3075,6 @@ void memcg_cache_list_add(struct mem_cgroup *memcg, 
struct kmem_cache *cachep)
 }
 
 /*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
-   if (!memcg || !memcg_can_account_kmem(memcg))
-   return -1;
-   return memcg->kmemcg_id;
-}
-
-/*
  * This ends up being protected by the set_limit mutex, during normal
  * operation, because that is its main call site.
  *
@@ -3224,7 +3236,7 @@ void memcg_release_cache(struct kmem_cache *s)
goto out;
 
memcg = s->memcg_params->memcg;
-   id  = memcg_cache_id(memcg);
+   id = memcg_cache_idx(memcg);
 
root = s->memcg_params->root_cache;
root->memcg_params->memcg_caches[id] = NULL;
@@ -3387,9 +3399,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct 
mem_cgroup *memcg,
struct kmem_cache *new_cachep;
int idx;
 
-   BUG_ON(!memcg_can_account_kmem(memcg));
-
-   idx = memcg_cache_id(memcg);
+   idx = memcg_cache_idx(memcg);
 
mutex_lock(&memcg_cache_mutex);
new_cachep = cachep->memcg_params->memcg_caches[idx];
@@ -3562,10 +3572,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
 
-   if (!memcg_can_account_kmem(memcg))
-   goto out;
-
idx = memcg_cache_id(memcg);
+   if (idx < 0)
+   return cachep;
 
/*
 * barrier to mare sure we're always seeing the up to date value.  The
-- 
1.7.10.4



[PATCH v11 04/15] memcg: move initialization to memcg creation

2013-10-24 Thread Vladimir Davydov
From: Glauber Costa 

Those structures are only used by memcgs that are actually using kmemcg.
However, in a later patch I intend to scan that list unconditionally
(an empty list meaning no kmem caches are present), which simplifies the
code a lot.

So move the initialization to early kmem creation.

Signed-off-by: Glauber Costa 
Cc: Dave Chinner 
Cc: Mel Gorman 
Cc: Rik van Riel 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Hugh Dickins 
Cc: Kamezawa Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 41c216a..5104d1f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3120,8 +3120,6 @@ int memcg_update_cache_sizes(struct mem_cgroup *memcg)
}
 
memcg->kmemcg_id = num;
-   INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-   mutex_init(&memcg->slab_caches_mutex);
return 0;
 }
 
@@ -5916,6 +5914,8 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, 
struct cgroup_subsys *ss)
 {
int ret;
 
+   INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+   mutex_init(&memcg->slab_caches_mutex);
memcg->kmemcg_id = -1;
ret = memcg_propagate_kmem(memcg);
if (ret)
-- 
1.7.10.4



[PATCH v11 03/15] vmscan: also shrink slab in memcg pressure

2013-10-24 Thread Vladimir Davydov
From: Glauber Costa 

Without the surrounding infrastructure, this patch is a bit of a hammer:
it will basically shrink objects from all memcgs under memcg pressure.
At least, however, we will keep the scan limited to the shrinkers marked
as per-memcg.

Future patches will implement the in-shrinker logic to filter objects
based on their memcg association.
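
For reference, a shrinker that opts in would look roughly like this;
count_my_objects()/scan_my_objects() are hypothetical helpers, and only
the SHRINKER_MEMCG_AWARE flag and shrink_control::target_mem_cgroup are
added by this patch, the rest is the existing shrinker API:

static unsigned long my_count_objects(struct shrinker *shrink,
                                      struct shrink_control *sc)
{
        /* sc->target_mem_cgroup == NULL means global reclaim */
        return count_my_objects(sc->target_mem_cgroup, sc->nid);
}

static unsigned long my_scan_objects(struct shrinker *shrink,
                                     struct shrink_control *sc)
{
        return scan_my_objects(sc->target_mem_cgroup, sc->nid, sc->nr_to_scan);
}

static struct shrinker my_shrinker = {
        .count_objects  = my_count_objects,
        .scan_objects   = my_scan_objects,
        .seeks          = DEFAULT_SEEKS,
        .flags          = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};

/* register_shrinker(&my_shrinker) at init time as usual */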

Signed-off-by: Glauber Costa 
Cc: Dave Chinner 
Cc: Mel Gorman 
Cc: Rik van Riel 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Hugh Dickins 
Cc: Kamezawa Hiroyuki 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |   17 +++
 include/linux/shrinker.h   |6 +-
 mm/memcontrol.c|   16 +-
 mm/vmscan.c|   50 +++-
 4 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3e7a66..d16ba51 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -231,6 +231,9 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 bool mem_cgroup_bad_page_check(struct page *page);
 void mem_cgroup_print_bad_page(struct page *page);
 #endif
+
+unsigned long
+memcg_zone_reclaimable_pages(struct mem_cgroup *memcg, struct zone *zone);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
@@ -427,6 +430,12 @@ static inline void mem_cgroup_replace_page_cache(struct 
page *oldpage,
struct page *newpage)
 {
 }
+
+static inline unsigned long
+memcg_zone_reclaimable_pages(struct mem_cgroup *memcg, struct zone *zone)
+{
+   return 0;
+}
 #endif /* CONFIG_MEMCG */
 
 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
@@ -479,6 +488,8 @@ static inline bool memcg_kmem_enabled(void)
return static_key_false(&memcg_kmem_enabled_key);
 }
 
+bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+
 /*
  * In general, we'll do everything in our power to not incur in any overhead
  * for non-memcg users for the kmem functions. Not even a function call, if we
@@ -612,6 +623,12 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
return __memcg_kmem_get_cache(cachep, gfp);
 }
 #else
+
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+   return false;
+}
+
 #define for_each_memcg_cache_index(_idx)   \
for (; NULL; )
 
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 68c0970..7d462b1 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -22,6 +22,9 @@ struct shrink_control {
nodemask_t nodes_to_scan;
/* current node being shrunk (for NUMA aware shrinkers) */
int nid;
+
+   /* reclaim from this memcg only (if not NULL) */
+   struct mem_cgroup *target_mem_cgroup;
 };
 
 #define SHRINK_STOP (~0UL)
@@ -63,7 +66,8 @@ struct shrinker {
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
 /* Flags */
-#define SHRINKER_NUMA_AWARE (1 << 0)
+#define SHRINKER_NUMA_AWARE(1 << 0)
+#define SHRINKER_MEMCG_AWARE   (1 << 1)
 
 extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0a5cc30..41c216a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -356,7 +356,7 @@ static inline void memcg_kmem_set_active(struct mem_cgroup 
*memcg)
set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
 
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
@@ -938,6 +938,20 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int 
nid, int zid,
return ret;
 }
 
+unsigned long
+memcg_zone_reclaimable_pages(struct mem_cgroup *memcg, struct zone *zone)
+{
+   int nid = zone_to_nid(zone);
+   int zid = zone_idx(zone);
+   unsigned long val;
+
+   val = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, LRU_ALL_FILE);
+   if (do_swap_account)
+   val += mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+   LRU_ALL_ANON);
+   return val;
+}
+
 static unsigned long
 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eea668d..652dfa3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -140,11 +140,41 @@ static bool global_reclaim(struct scan_control *sc)
 {
return !sc->target_mem_cgroup;
 }
+
+/*
+ * kmem reclaim should usually not be triggered when we are doing targetted
+ * reclaim. It is only valid when global reclaim is triggered, or when the
+ * underlying memcg has kmem objects.
+ */
+static bool has_kmem_reclaim(struct scan_control *sc)
+{
+   return !sc->target_mem_cgroup ||
+   memcg_kmem_is_active(sc->target_mem_cgroup);
+}
+
+static unsigned long
+zone_nr_reclaimable_pages(struct scan_control *sc, str

Re: [Devel] Race in memcg kmem?

2013-12-17 Thread Vladimir Davydov
On 12/12/2013 05:39 PM, Vladimir Davydov wrote:
> On 12/12/2013 05:21 PM, Michal Hocko wrote:
>> On Wed 11-12-13 10:22:06, Vladimir Davydov wrote:
>>> On 12/11/2013 03:13 AM, Glauber Costa wrote:
>>>> On Tue, Dec 10, 2013 at 5:59 PM, Vladimir Davydov
>> [...]
>>>>> -- memcg_update_cache_size(s, num_groups) --
>>>>> grows s->memcg_params to accomodate data for num_groups memcg's
>>>>> @s is the root cache whose memcg_params we want to grow
>>>>> @num_groups is the new number of kmem-active cgroups (defines the new
>>>>> size of memcg_params array).
>>>>>
>>>>> The function:
>>>>>
>>>>> B1) allocates and assigns a new cache:
>>>>> cur_params = s->memcg_params;
>>>>> s->memcg_params = kzalloc(size, GFP_KERNEL);
>>>>>
>>>>> B2) copies per-memcg cache ptrs from the old memcg_params array to the
>>>>> new one:
>>>>> for (i = 0; i < memcg_limited_groups_array_size; i++) {
>>>>> if (!cur_params->memcg_caches[i])
>>>>> continue;
>>>>> s->memcg_params->memcg_caches[i] =
>>>>> cur_params->memcg_caches[i];
>>>>> }
>>>>>
>>>>> B3) frees the old array:
>>>>> kfree(cur_params);
>>>>>
>>>>>
>>>>> Since these two functions do not share any mutexes, we can get the
>>>> They do share a mutex, the slab mutex.
>> Worth sticking in a lock_dep_assert?
> AFAIU, lockdep_assert_held() is not applicable here:
> memcg_create_kmem_cache() is called w/o the slab_mutex held, but it
> calls kmem_cache_create_memcg(), which takes and releases this mutex,
> working as a barrier. Placing lockdep_assert_held() into the latter
> won't make things any clearer. IMO, we need a big good comment in
> memcg_create_kmem_cache() proving its correctness.

After a bit of thinking on the comment explaining why the race is
impossible I seem to have found another one in these two functions.

Assume two threads schedule kmem_cache creation work items for the same
kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
work items successfully creates it. The other should then fail, but if
it interleaves with memcg_update_cache_size() as follows, it does not:

memcg_create_kmem_cache()                    memcg_update_cache_size()
(called w/o mutexes held)                    (called with slab_mutex held)
-                                            -
mutex_lock(&memcg_cache_mutex)
                                             s->memcg_params=kzalloc(...)
new_cachep=cache_from_memcg_idx(cachep,idx)
// new_cachep==NULL => proceed to creation
                                             // initialize s->memcg_params;
                                             // sets s->memcg_params
                                             //   ->memcg_caches[idx]
new_cachep = kmem_cache_dup(memcg, cachep)
// nothing prevents kmem_cache_dup from
// succeeding so ...
cachep->memcg_params->memcg_caches[idx]=new_cachep
// we've overwritten an existing cache ptr!

slab_mutex won't help here...

Anyway, I'm going to move check and initialization of memcg_caches[idx]
from memcg_create_kmem_cache() to kmem_cache_create_memcg() under the
slab_mutex eliminating every possibility of race there. Will send the
patch soon.

Thanks.


[PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches

2013-12-18 Thread Vladimir Davydov
We update a root cache's memcg_params whenever we need to grow the
memcg_caches array to accommodate all kmem-active memory cgroups.
Currently we free the old version immediately afterwards, which can lead
to a use-after-free, because the memcg_caches array is accessed
locklessly. This patch fixes that by making memcg_params RCU-protected.
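
Condensed, the pattern applied by the hunks below is the standard one
for replacing an RCU-protected object: publish the new array with
rcu_assign_pointer(), defer freeing the old one past a grace period, and
make readers dereference under rcu_read_lock(). This is a summary of the
diff, not additional code:

/* writer, under slab_mutex */
new_params = kzalloc(size, GFP_KERNEL);
/* ... copy cur_params->memcg_caches[] into new_params ... */
rcu_assign_pointer(s->memcg_params, new_params);
if (cur_params)
        kfree_rcu(cur_params, rcu_head);        /* freed only after a grace period */

/* lockless reader */
rcu_read_lock();
params = rcu_dereference(s->memcg_params);
cachep = params->memcg_caches[idx];
rcu_read_unlock();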

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Glauber Costa 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: Andrew Morton 
---
 include/linux/slab.h |5 -
 mm/memcontrol.c  |   15 ---
 mm/slab.h|8 +++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1e2f4fe..f7e5649 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -528,7 +528,10 @@ static __always_inline void *kmalloc_node(size_t size, 
gfp_t flags, int node)
 struct memcg_cache_params {
bool is_root_cache;
union {
-   struct kmem_cache *memcg_caches[0];
+   struct {
+   struct rcu_head rcu_head;
+   struct kmem_cache *memcg_caches[0];
+   };
struct {
struct mem_cgroup *memcg;
struct list_head list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ad8de6a..379fc5f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3142,18 +3142,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
 
if (num_groups > memcg_limited_groups_array_size) {
int i;
+   struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
 
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
 
-   s->memcg_params = kzalloc(size, GFP_KERNEL);
-   if (!s->memcg_params) {
-   s->memcg_params = cur_params;
+   new_params = kzalloc(size, GFP_KERNEL);
+   if (!new_params)
return -ENOMEM;
-   }
 
-   s->memcg_params->is_root_cache = true;
+   new_params->is_root_cache = true;
 
/*
 * There is the chance it will be bigger than
@@ -3167,7 +3166,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
-   s->memcg_params->memcg_caches[i] =
+   new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
 
@@ -3180,7 +3179,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
 * bigger than the others. And all updates will reset this
 * anyway.
 */
-   kfree(cur_params);
+   rcu_assign_pointer(s->memcg_params, new_params);
+   if (cur_params)
+   kfree_rcu(cur_params, rcu_head);
}
return 0;
 }
diff --git a/mm/slab.h b/mm/slab.h
index 1d8b53f..53b81a9 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -164,10 +164,16 @@ static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
struct kmem_cache *cachep;
+   struct memcg_cache_params *params;
 
if (!s->memcg_params)
return NULL;
-   cachep = s->memcg_params->memcg_caches[idx];
+
+   rcu_read_lock();
+   params = rcu_dereference(s->memcg_params);
+   cachep = params->memcg_caches[idx];
+   rcu_read_unlock();
+
smp_read_barrier_depends(); /* see memcg_register_cache() */
return cachep;
 }
-- 
1.7.10.4



[PATCH 5/6] memcg: clear memcg_params after removing cache from memcg_slab_caches list

2013-12-18 Thread Vladimir Davydov
All caches of the same memory cgroup are linked in the memcg_slab_caches
list via kmem_cache::memcg_params::list. This list is traversed when we
read memory.kmem.slabinfo. Since the list actually consists of
memcg_cache_params objects, to convert an element of the list to a
kmem_cache object we use memcg_params_to_cache(), which obtains the
pointer to the cache from the memcg_params::memcg_caches array of the
root cache. On cache destruction, however, this pointer is cleared
before the cache is removed from the list, which can potentially result
in a NULL pointer dereference. Let's fix this by clearing the pointer to
a cache in its parent's memcg_params::memcg_caches array only after the
cache can no longer be reached via the memcg_slab_caches list.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Glauber Costa 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 62b9991..ad8de6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3241,6 +3241,11 @@ void memcg_register_cache(struct kmem_cache *s)
 */
smp_wmb();
 
+   /*
+* Initialize the pointer to this cache in its parent's memcg_params
+* before adding it to the memcg_slab_caches list, otherwise we can
+* fail to convert memcg_params_to_cache() while traversing the list.
+*/
root->memcg_params->memcg_caches[id] = s;
 
mutex_lock(&memcg->slab_caches_mutex);
@@ -3265,15 +3270,20 @@ void memcg_release_cache(struct kmem_cache *s)
goto out;
 
memcg = s->memcg_params->memcg;
-   id  = memcg_cache_id(memcg);
-
+   id = memcg_cache_id(memcg);
root = s->memcg_params->root_cache;
-   root->memcg_params->memcg_caches[id] = NULL;
 
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
 
+   /*
+* Clear the pointer to this cache in its parent's memcg_params only
+* after removing it from the memcg_slab_caches list, otherwise we can
+* fail to convert memcg_params_to_cache() while traversing the list.
+*/
+   root->memcg_params->memcg_caches[id] = NULL;
+
css_put(&memcg->css);
 out:
kfree(s->memcg_params);
-- 
1.7.10.4



[PATCH 4/6] memcg, slab: check and init memcg_caches under slab_mutex

2013-12-18 Thread Vladimir Davydov
The memcg_params::memcg_caches array can be updated concurrently from
memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
of these functions take the slab_mutex during their operation, the
latter checks if memcg's cache has already been allocated w/o taking the
mutex. This can result in a race as described below.

Assume two threads schedule kmem_cache creation work items for the same
kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
work items successfully creates it. The other should then fail, but if
it interleaves with memcg_update_cache_size() as follows, it does not:

  memcg_create_kmem_cache() memcg_update_cache_size()
  (called w/o mutexes held) (called with slab_mutex held)
  - -
  mutex_lock(&memcg_cache_mutex)
s->memcg_params=kzalloc(...)
  new_cachep=cache_from_memcg_idx(cachep,idx)
  // new_cachep==NULL => proceed to creation
s->memcg_params->memcg_caches[i]
=cur_params->memcg_caches[i]
  // kmem_cache_dup takes slab_mutex so we will
  // hang around here until memcg_update_cache_size()
  // finishes, but ...
  new_cachep = kmem_cache_dup(memcg, cachep)
  // nothing will prevent kmem_cache_dup from
  // succeeding so ...
  cachep->memcg_params->memcg_caches[idx]=new_cachep
  // we've overwritten an existing cache ptr!

Let's fix this by moving both the check and the update of
memcg_params::memcg_caches from memcg_create_kmem_cache() to
kmem_cache_create_memcg() to be called under the slab_mutex.
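
The rule being enforced here is generic: the existence check and the
publication of the new pointer must happen under the same lock. A
minimal userspace model of that rule (nothing memcg-specific, purely
illustrative):

#include <pthread.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *table[64];

/* whoever takes the lock first publishes the object; a second caller
 * sees the published pointer and backs off instead of overwriting it */
static void *get_or_create(int idx, void *(*create)(void))
{
        void *obj;

        pthread_mutex_lock(&table_lock);
        obj = table[idx];
        if (!obj) {
                obj = create();
                table[idx] = obj;
        }
        pthread_mutex_unlock(&table_lock);
        return obj;
}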

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Glauber Costa 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |9 ++--
 mm/memcontrol.c|   98 +++-
 mm/slab_common.c   |8 +++-
 3 files changed, 44 insertions(+), 71 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b357ae3..fdd3f30 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
 int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
struct kmem_cache *root_cache);
 void memcg_free_cache_params(struct kmem_cache *s);
-void memcg_release_cache(struct kmem_cache *cachep);
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
+void memcg_register_cache(struct kmem_cache *s);
+void memcg_release_cache(struct kmem_cache *s);
 
 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
 void memcg_update_array_size(int num_groups);
@@ -652,12 +652,11 @@ static inline void memcg_free_cache_params(struct 
kmem_cache *s);
 {
 }
 
-static inline void memcg_release_cache(struct kmem_cache *cachep)
+static inline void memcg_register_cache(struct kmem_cache *s)
 {
 }
 
-static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
-   struct kmem_cache *s)
+static inline void memcg_release_cache(struct kmem_cache *s)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e37fdb5..62b9991 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3059,16 +3059,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup 
*memcg, u64 size)
css_put(&memcg->css);
 }
 
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
-   if (!memcg)
-   return;
-
-   mutex_lock(&memcg->slab_caches_mutex);
-   list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-   mutex_unlock(&memcg->slab_caches_mutex);
-}
-
 /*
  * helper for acessing a memcg's index. It will be used as an index in the
  * child cache array in kmem_cache, and also to derive its name. This function
@@ -3229,6 +3219,35 @@ void memcg_free_cache_params(struct kmem_cache *s)
kfree(s->memcg_params);
 }
 
+void memcg_register_cache(struct kmem_cache *s)
+{
+   struct kmem_cache *root;
+   struct mem_cgroup *memcg;
+   int id;
+
+   if (is_root_cache(s))
+   return;
+
+   memcg = s->memcg_params->memcg;
+   id = memcg_cache_id(memcg);
+   root = s->memcg_params->root_cache;
+
+   css_get(&memcg->css);
+
+   /*
+* Since readers won't lock (see cache_from_memcg_idx()), we need a
+* barrier here to ensure nobody will see the kmem_cache partially
+* initialized.
+*/
+   smp_wmb();
+
+   root->memcg_params->memcg_caches[id] = s;
+
+   mutex_lock(&memcg->slab_caches_mutex);
+   list_add(&s->memcg_params->list, &memcg->memcg_slab_cache

[PATCH 1/6] slab: cleanup kmem_cache_create_memcg()

2013-12-18 Thread Vladimir Davydov
Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Glauber Costa 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: Andrew Morton 
---
 mm/slab_common.c |   66 +++---
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb39..5d6f743 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
get_online_cpus();
mutex_lock(&slab_mutex);
 
-   if (!kmem_cache_sanity_check(memcg, name, size) == 0)
-   goto out_locked;
+   err = kmem_cache_sanity_check(memcg, name, size);
+   if (err)
+   goto out_unlock;
 
/*
 * Some allocators will constraint the set of valid flags to a subset
@@ -189,45 +190,41 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
 
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
-   goto out_locked;
+   goto out_unlock;
 
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
-   if (s) {
-   s->object_size = s->size = size;
-   s->align = calculate_alignment(flags, align, size);
-   s->ctor = ctor;
-
-   if (memcg_register_cache(memcg, s, parent_cache)) {
-   kmem_cache_free(kmem_cache, s);
-   err = -ENOMEM;
-   goto out_locked;
-   }
+   if (!s) {
+   err = -ENOMEM;
+   goto out_unlock;
+   }
 
-   s->name = kstrdup(name, GFP_KERNEL);
-   if (!s->name) {
-   kmem_cache_free(kmem_cache, s);
-   err = -ENOMEM;
-   goto out_locked;
-   }
+   s->object_size = s->size = size;
+   s->align = calculate_alignment(flags, align, size);
+   s->ctor = ctor;
 
-   err = __kmem_cache_create(s, flags);
-   if (!err) {
-   s->refcount = 1;
-   list_add(&s->list, &slab_caches);
-   memcg_cache_list_add(memcg, s);
-   } else {
-   kfree(s->name);
-   kmem_cache_free(kmem_cache, s);
-   }
-   } else
+   s->name = kstrdup(name, GFP_KERNEL);
+   if (!s->name) {
err = -ENOMEM;
+   goto out_free_cache;
+   }
+
+   err = memcg_register_cache(memcg, s, parent_cache);
+   if (err)
+   goto out_free_cache;
 
-out_locked:
+   err = __kmem_cache_create(s, flags);
+   if (err)
+   goto out_free_cache;
+
+   s->refcount = 1;
+   list_add(&s->list, &slab_caches);
+   memcg_cache_list_add(memcg, s);
+
+out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
 
if (err) {
-
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n",
name, err);
@@ -236,11 +233,14 @@ out_locked:
name, err);
dump_stack();
}
-
return NULL;
}
-
return s;
+
+out_free_cache:
+   kfree(s->name);
+   kmem_cache_free(kmem_cache, s);
+   goto out_unlock;
 }
 
 struct kmem_cache *
-- 
1.7.10.4



[PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error

2013-12-18 Thread Vladimir Davydov
On error, kmem_cache_create_memcg() currently leaks the
kmem_cache::memcg_params it has allocated for the new cache; free them
on the error path.

Also, rename memcg_register_cache() to memcg_init_cache_params(),
because it does not actually register the cache anywhere, but simply
initializes kmem_cache::memcg_params.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Glauber Costa 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |   13 +
 mm/memcontrol.c|9 +++--
 mm/slab_common.c   |3 ++-
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3e7a66..b357ae3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,8 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache);
+int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+   struct kmem_cache *root_cache);
+void memcg_free_cache_params(struct kmem_cache *s);
 void memcg_release_cache(struct kmem_cache *cachep);
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
 
@@ -641,12 +642,16 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 }
 
 static inline int
-memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache)
+memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+   struct kmem_cache *root_cache)
 {
return 0;
 }
 
+static inline void memcg_free_cache_params(struct kmem_cache *s)
+{
+}
+
 static inline void memcg_release_cache(struct kmem_cache *cachep)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bf5e894..e6ad6ff 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3195,8 +3195,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
return 0;
 }
 
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache)
+int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+   struct kmem_cache *root_cache)
 {
size_t size;
 
@@ -3224,6 +3224,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, 
struct kmem_cache *s,
return 0;
 }
 
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+   kfree(s->memcg_params);
+}
+
 void memcg_release_cache(struct kmem_cache *s)
 {
struct kmem_cache *root;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5d6f743..62712fe 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -208,7 +208,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
goto out_free_cache;
}
 
-   err = memcg_register_cache(memcg, s, parent_cache);
+   err = memcg_init_cache_params(memcg, s, parent_cache);
if (err)
goto out_free_cache;
 
@@ -238,6 +238,7 @@ out_unlock:
return s;
 
 out_free_cache:
+   memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
goto out_unlock;
-- 
1.7.10.4



[PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches

2013-12-18 Thread Vladimir Davydov
First, in memcg_create_kmem_cache() we should issue the write barrier
after the kmem_cache is initialized, but before storing the pointer to
it in its parent's memcg_params.

Second, we should always issue the read barrier after
cache_from_memcg_idx() to conform with the write barrier.

Third, it's better to use the smp_* versions of the barriers, because we
don't need them on UP systems.
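
The resulting pairing, condensed from the hunks below (writer side in
memcg_register_cache()/memcg_create_kmem_cache(), reader side in
cache_from_memcg_idx(); init_cache() and use() are placeholders):

/* writer: fully initialize the cache before making it visible */
init_cache(new_cachep);
smp_wmb();
cachep->memcg_params->memcg_caches[idx] = new_cachep;

/* reader: order the dependent load against the writer's barrier */
memcg_cachep = s->memcg_params->memcg_caches[idx];
smp_read_barrier_depends();
if (memcg_cachep)
        use(memcg_cachep);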

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Glauber Costa 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   24 ++--
 mm/slab.h   |6 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6ad6ff..e37fdb5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3429,12 +3429,14 @@ static struct kmem_cache 
*memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
atomic_set(&new_cachep->memcg_params->nr_pages , 0);
 
-   cachep->memcg_params->memcg_caches[idx] = new_cachep;
/*
-* the readers won't lock, make sure everybody sees the updated value,
-* so they won't put stuff in the queue again for no reason
+* Since readers won't lock (see cache_from_memcg_idx()), we need a
+* barrier here to ensure nobody will see the kmem_cache partially
+* initialized.
 */
-   wmb();
+   smp_wmb();
+
+   cachep->memcg_params->memcg_caches[idx] = new_cachep;
 out:
mutex_unlock(&memcg_cache_mutex);
return new_cachep;
@@ -3573,7 +3575,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
  gfp_t gfp)
 {
struct mem_cgroup *memcg;
-   int idx;
+   struct kmem_cache *memcg_cachep;
 
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3587,15 +3589,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
 
-   idx = memcg_cache_id(memcg);
-
-   /*
-* barrier to mare sure we're always seeing the up to date value.  The
-* code updating memcg_caches will issue a write barrier to match this.
-*/
-   read_barrier_depends();
-   if (likely(cache_from_memcg_idx(cachep, idx))) {
-   cachep = cache_from_memcg_idx(cachep, idx);
+   memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+   if (likely(memcg_cachep)) {
+   cachep = memcg_cachep;
goto out;
}
 
diff --git a/mm/slab.h b/mm/slab.h
index 0859c42..1d8b53f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,9 +163,13 @@ static inline const char *cache_name(struct kmem_cache *s)
 static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
+   struct kmem_cache *cachep;
+
if (!s->memcg_params)
return NULL;
-   return s->memcg_params->memcg_caches[idx];
+   cachep = s->memcg_params->memcg_caches[idx];
+   smp_read_barrier_depends(); /* see memcg_register_cache() */
+   return cachep;
 }
 
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
-- 
1.7.10.4



Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()

2013-12-18 Thread Vladimir Davydov
On 12/18/2013 08:56 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:52, Vladimir Davydov wrote:
>> Signed-off-by: Vladimir Davydov 
>> Cc: Michal Hocko 
>> Cc: Johannes Weiner 
>> Cc: Glauber Costa 
>> Cc: Christoph Lameter 
>> Cc: Pekka Enberg 
>> Cc: Andrew Morton 
> Dunno, is this really better to be worth the code churn?
>
> It even makes the generated code tiny bit bigger:
> text    data     bss     dec     hex filename
> 4355     171     236    4762    129a mm/slab_common.o.after
> 4342     171     236    4749    128d mm/slab_common.o.before
>
> Or does it make the further changes much more easier? Be explicit in the
> patch description if so.

Hi, Michal

IMO, undoing under labels looks better than inside conditionals, because
we don't have to repeat the same deinitialization code then, like this
(note three calls to kmem_cache_free()):

s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (s) {
s->object_size = s->size = size;
s->align = calculate_alignment(flags, align, size);
s->ctor = ctor;

if (memcg_register_cache(memcg, s, parent_cache)) {
kmem_cache_free(kmem_cache, s);
err = -ENOMEM;
goto out_locked;
}

s->name = kstrdup(name, GFP_KERNEL);
if (!s->name) {
kmem_cache_free(kmem_cache, s);
err = -ENOMEM;
goto out_locked;
}

err = __kmem_cache_create(s, flags);
if (!err) {
s->refcount = 1;
list_add(&s->list, &slab_caches);
memcg_cache_list_add(memcg, s);
} else {
kfree(s->name);
kmem_cache_free(kmem_cache, s);
}
} else
err = -ENOMEM;

The next patch, which fixes the memcg_params leakage on error, would
make it even worse introducing two calls to memcg_free_cache_params()
after kstrdup and __kmem_cache_create.

If you think it isn't worthwhile applying this patch, just let me know,
I don't mind dropping it.

Anyway, I'll improve the comment and resend.

Thanks.

>
>> ---
>>  mm/slab_common.c |   66 
>> +++---
>>  1 file changed, 33 insertions(+), 33 deletions(-)
>>
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 0b7bb39..5d6f743 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
>> char *name, size_t size,
>>  get_online_cpus();
>>  mutex_lock(&slab_mutex);
>>  
>> -if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>> -goto out_locked;
>> +err = kmem_cache_sanity_check(memcg, name, size);
>> +if (err)
>> +goto out_unlock;
>>  
>>  /*
>>   * Some allocators will constraint the set of valid flags to a subset
>> @@ -189,45 +190,41 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, 
>> const char *name, size_t size,
>>  
>>  s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
>>  if (s)
>> -goto out_locked;
>> +goto out_unlock;
>>  
>>  s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>> -if (s) {
>> -s->object_size = s->size = size;
>> -s->align = calculate_alignment(flags, align, size);
>> -s->ctor = ctor;
>> -
>> -if (memcg_register_cache(memcg, s, parent_cache)) {
>> -kmem_cache_free(kmem_cache, s);
>> -err = -ENOMEM;
>> -goto out_locked;
>> -}
>> +if (!s) {
>> +err = -ENOMEM;
>> +goto out_unlock;
>> +}
>>  
>> -s->name = kstrdup(name, GFP_KERNEL);
>> -if (!s->name) {
>> -kmem_cache_free(kmem_cache, s);
>> -err = -ENOMEM;
>> -goto out_locked;
>> -}
>> +s->object_size = s->size = size;
>> +s->align = calculate_alignment(flags, align, size);
>> +s->ctor = ctor;
>>  
>> -err = __kmem_cache_create(s, flags);
>> -if (!err) {
>> -s->refcount = 1;
>> -list_add(&s->list, &slab_caches);
>> -memcg_cache_list_add(memcg, s);
>> -} else {
>> -kfree(s->name);
>> -kmem_cache_free(kmem_cache, s);
>> -}
>> -} else
>> +s->name

Re: [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error

2013-12-18 Thread Vladimir Davydov
On 12/18/2013 09:06 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:53, Vladimir Davydov wrote:
>> Plus, rename memcg_register_cache() to memcg_init_cache_params(),
>> because it actually does not register the cache anywhere, but simply
>> initialize kmem_cache::memcg_params.
> I've almost missed this is a memory leak fix.

Yeah, the comment is poor, sorry about that. Will fix it.

> I do not mind renaming and the name but wouldn't
> memcg_alloc_cache_params suit better?

As you wish. I don't have a strong preference for memcg_init_cache_params.

Thanks.

>
>> Signed-off-by: Vladimir Davydov 
>> Cc: Michal Hocko 
>> Cc: Johannes Weiner 
>> Cc: Glauber Costa 
>> Cc: Christoph Lameter 
>> Cc: Pekka Enberg 
>> Cc: Andrew Morton 
>> ---
>>  include/linux/memcontrol.h |   13 +
>>  mm/memcontrol.c|9 +++--
>>  mm/slab_common.c   |3 ++-
>>  3 files changed, 18 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index b3e7a66..b357ae3 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -497,8 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
>>  void __memcg_kmem_uncharge_pages(struct page *page, int order);
>>  
>>  int memcg_cache_id(struct mem_cgroup *memcg);
>> -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>> - struct kmem_cache *root_cache);
>> +int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>> +struct kmem_cache *root_cache);
>> +void memcg_free_cache_params(struct kmem_cache *s);
>>  void memcg_release_cache(struct kmem_cache *cachep);
>>  void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache 
>> *cachep);
>>  
>> @@ -641,12 +642,16 @@ static inline int memcg_cache_id(struct mem_cgroup 
>> *memcg)
>>  }
>>  
>>  static inline int
>> -memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>> - struct kmem_cache *root_cache)
>> +memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>> +struct kmem_cache *root_cache)
>>  {
>>  return 0;
>>  }
>>  
>> +static inline void memcg_free_cache_params(struct kmem_cache *s);
>> +{
>> +}
>> +
>>  static inline void memcg_release_cache(struct kmem_cache *cachep)
>>  {
>>  }
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index bf5e894..e6ad6ff 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3195,8 +3195,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
>> num_groups)
>>  return 0;
>>  }
>>  
>> -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>> - struct kmem_cache *root_cache)
>> +int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>> +struct kmem_cache *root_cache)
>>  {
>>  size_t size;
>>  
>> @@ -3224,6 +3224,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, 
>> struct kmem_cache *s,
>>  return 0;
>>  }
>>  
>> +void memcg_free_cache_params(struct kmem_cache *s)
>> +{
>> +kfree(s->memcg_params);
>> +}
>> +
>>  void memcg_release_cache(struct kmem_cache *s)
>>  {
>>  struct kmem_cache *root;
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 5d6f743..62712fe 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -208,7 +208,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
>> char *name, size_t size,
>>  goto out_free_cache;
>>  }
>>  
>> -err = memcg_register_cache(memcg, s, parent_cache);
>> +err = memcg_init_cache_params(memcg, s, parent_cache);
>>  if (err)
>>  goto out_free_cache;
>>  
>> @@ -238,6 +238,7 @@ out_unlock:
>>  return s;
>>  
>>  out_free_cache:
>> +memcg_free_cache_params(s);
>>  kfree(s->name);
>>  kmem_cache_free(kmem_cache, s);
>>  goto out_unlock;
>> -- 
>> 1.7.10.4
>>



Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches

2013-12-18 Thread Vladimir Davydov
On 12/18/2013 09:14 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>> First, in memcg_create_kmem_cache() we should issue the write barrier
>> after the kmem_cache is initialized, but before storing the pointer to
>> it in its parent's memcg_params.
>>
>> Second, we should always issue the read barrier after
>> cache_from_memcg_idx() to conform with the write barrier.
>>
>> Third, its better to use smp_* versions of barriers, because we don't
>> need them on UP systems.
> Please be (much) more verbose on Why. Barriers are tricky and should be
> documented accordingly. So if you say that we should issue a barrier
> always be specific why we should do it.

In short, kmem_cache::memcg_params::memcg_caches is an array of
pointers to per-memcg caches. We access it lock-free, so we need memory
barriers around its initialization. Obviously we should place a write
barrier just before we set the pointer, to make sure nobody sees a
partially initialized structure. Besides, there must be a read barrier
between reading the pointer and accessing the structure, to pair with
the write barrier. It's all quite similar to rcu_assign_pointer() and
rcu_dereference(). Currently the barrier usage looks rather strange:

memcg_create_kmem_cache:
initialize kmem
set the pointer in memcg_caches
wmb() // ???

__memcg_kmem_get_cache:
<...>
read_barrier_depends() // ???
cachep = root_cache->memcg_params->memcg_caches[memcg_id]
<...>

Nothing prevents some archs from moving initialization after setting the
pointer, or reading data before reading the pointer to it.
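For illustration, the intended pairing is just this (a sketch of the
pattern, not the literal hunks):

    /* writer side, memcg_create_kmem_cache(), after the cache is set up */
    smp_wmb();    /* order cache initialization before publishing it */
    cachep->memcg_params->memcg_caches[idx] = new_cachep;

    /* reader side, cache_from_memcg_idx() */
    cachep = s->memcg_params->memcg_caches[idx];
    smp_read_barrier_depends();    /* pairs with the smp_wmb() above */
    return cachep;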

Of course, I will include a detailed description in the next version of
this patch.

Thanks.

>> Signed-off-by: Vladimir Davydov 
>> Cc: Michal Hocko 
>> Cc: Johannes Weiner 
>> Cc: Glauber Costa 
>> Cc: Christoph Lameter 
>> Cc: Pekka Enberg 
>> Cc: Andrew Morton 
>> ---
>>  mm/memcontrol.c |   24 ++--
>>  mm/slab.h   |6 +-
>>  2 files changed, 15 insertions(+), 15 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e6ad6ff..e37fdb5 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3429,12 +3429,14 @@ static struct kmem_cache 
>> *memcg_create_kmem_cache(struct mem_cgroup *memcg,
>>  
>>  atomic_set(&new_cachep->memcg_params->nr_pages , 0);
>>  
>> -cachep->memcg_params->memcg_caches[idx] = new_cachep;
>>  /*
>> - * the readers won't lock, make sure everybody sees the updated value,
>> - * so they won't put stuff in the queue again for no reason
>> + * Since readers won't lock (see cache_from_memcg_idx()), we need a
>> + * barrier here to ensure nobody will see the kmem_cache partially
>> + * initialized.
>>   */
>> -wmb();
>> +smp_wmb();
>> +
>> +cachep->memcg_params->memcg_caches[idx] = new_cachep;
>>  out:
>>  mutex_unlock(&memcg_cache_mutex);
>>  return new_cachep;
>> @@ -3573,7 +3575,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
>> kmem_cache *cachep,
>>gfp_t gfp)
>>  {
>>  struct mem_cgroup *memcg;
>> -int idx;
>> +struct kmem_cache *memcg_cachep;
>>  
>>  VM_BUG_ON(!cachep->memcg_params);
>>  VM_BUG_ON(!cachep->memcg_params->is_root_cache);
>> @@ -3587,15 +3589,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
>> kmem_cache *cachep,
>>  if (!memcg_can_account_kmem(memcg))
>>  goto out;
>>  
>> -idx = memcg_cache_id(memcg);
>> -
>> -/*
>> - * barrier to mare sure we're always seeing the up to date value.  The
>> - * code updating memcg_caches will issue a write barrier to match this.
>> - */
>> -read_barrier_depends();
>> -if (likely(cache_from_memcg_idx(cachep, idx))) {
>> -cachep = cache_from_memcg_idx(cachep, idx);
>> +memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
>> +if (likely(memcg_cachep)) {
>> +cachep = memcg_cachep;
>>  goto out;
>>  }
>>  
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 0859c42..1d8b53f 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -163,9 +163,13 @@ static inline const char *cache_name(struct kmem_cache 
>> *s)
>>  static inline struct kmem_cache *
>>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>>  {
>> +struct kmem_cache *cachep;
>> +
>>  if (!s->memcg_params)
>>  return NULL;
>> -return s->memcg_params->memcg_caches[idx];
>> +cachep = s->memcg_params->memcg_caches[idx];
>> +smp_read_barrier_depends(); /* see memcg_register_cache() */
>> +return cachep;
>>  }
>>  
>>  static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
>> -- 
>> 1.7.10.4
>>



Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex

2013-12-18 Thread Vladimir Davydov
On 12/18/2013 09:41 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>> The memcg_params::memcg_caches array can be updated concurrently from
>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>> of these functions take the slab_mutex during their operation, the
>> latter checks if memcg's cache has already been allocated w/o taking the
>> mutex. This can result in a race as described below.
>>
>> Asume two threads schedule kmem_cache creation works for the same
>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>> works successfully creates it. Another work should fail then, but if it
>> interleaves with memcg_update_cache_size() as follows, it does not:
> I am not sure I understand the race. memcg_update_cache_size is called
> when we start accounting a new memcg or a child is created and it
> inherits accounting from the parent. memcg_create_kmem_cache is called
> when a new cache is first allocated from, right?

memcg_update_cache_size() is called when kmem accounting is activated
for a memcg, no matter how.

memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
It's OK to have a bunch of such methods trying to create the same memcg
cache concurrently, but only one of them should succeed.
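Roughly, the shape of the fix is to recheck and publish under the
slab_mutex in kmem_cache_create_memcg() (a sketch of the idea, not the
literal hunk; memcg and parent_cache are the function's arguments):

    /* slab_mutex is already held here */
    if (memcg) {
        /* recheck: another worker may have already created the cache */
        s = cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg));
        if (s)
            goto out_unlock;
    }

    /* ... allocate and initialize the new cache ... */

    if (memcg)
        /* publish, still under slab_mutex, so nobody can race with us */
        parent_cache->memcg_params->memcg_caches[memcg_cache_id(memcg)] = s;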

> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
> it is running from the workqueue context so it should clash with other
> locks.

Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
have always wondered why, because it could simplify the flow paths
significantly (e.g. update_cache_sizes() -> update_all_caches() ->
update_cache_size() - from memcontrol.c to slab_common.c and back again
just to take the mutex).

I don't see any reason preventing us from taking the mutex in
memcontrol.c. This would allow us to move all memcg-related kmem cache
initialization (addition to the memcg slab list, initialization of the
pointer in memcg_caches) to memcg_kmem_cache_create() and remove a bunch
of public functions. I guess I'll do this in my next iteration.

Thanks.

>
>>   memcg_create_kmem_cache()                  memcg_update_cache_size()
>>   (called w/o mutexes held)                   (called with slab_mutex held)
>>   --------------------------                  -----------------------------
>>   mutex_lock(&memcg_cache_mutex)
>>                                               s->memcg_params=kzalloc(...)
>>   new_cachep=cache_from_memcg_idx(cachep,idx)
>>   // new_cachep==NULL => proceed to creation
>>                                               s->memcg_params->memcg_caches[i]=
>>                                                   cur_params->memcg_caches[i]
>>   // kmem_cache_dup takes slab_mutex so we will
>>   // hang around here until memcg_update_cache_size()
>>   // finishes, but ...
>>   new_cachep = kmem_cache_dup(memcg, cachep)
>>   // nothing will prevent kmem_cache_dup from
>>   // succeeding so ...
>>   cachep->memcg_params->memcg_caches[idx]=new_cachep
>>   // we've overwritten an existing cache ptr!
>>
>> Let's fix this by moving both the check and the update of
>> memcg_params::memcg_caches from memcg_create_kmem_cache() to
>> kmem_cache_create_memcg() to be called under the slab_mutex.
>>
>> Signed-off-by: Vladimir Davydov 
>> Cc: Michal Hocko 
>> Cc: Johannes Weiner 
>> Cc: Glauber Costa 
>> Cc: Christoph Lameter 
>> Cc: Pekka Enberg 
>> Cc: Andrew Morton 
>> ---
>>  include/linux/memcontrol.h |9 ++--
>>  mm/memcontrol.c|   98 
>> +++-
>>  mm/slab_common.c   |8 +++-
>>  3 files changed, 44 insertions(+), 71 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index b357ae3..fdd3f30 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
>>  int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>>  struct kmem_cache *root_cache);
>>  void memcg_free_cache_params(struct kmem_cache *s);
>> -void memcg_release_cache(struct kmem_cache *cachep);
>> -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache 
>> *cachep);
>> +void memcg_register_cache(struct kmem_cache *s);
>> +void memcg_release_cache(struct kmem_cache *s);
>>  
>>  int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
>>  void memcg_update_array_size(

Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 12:17 PM, Vasily Averin wrote:
> On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
>> char *name, size_t size,
>>  get_online_cpus();
>>  mutex_lock(&slab_mutex);
>>  
>> -if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>> -goto out_locked;
>> +err = kmem_cache_sanity_check(memcg, name, size);
>> +if (err)
>> +goto out_unlock;
>>  
>>  /*
>>   * Some allocators will constraint the set of valid flags to a subset
> Theoretically in future kmem_cache_sanity_check() can return positive value.
> Probably it's better to check (err < 0) in caller ?

Hmm, why? What information could a positive retval carry here? We have
plenty of places throughout the code where we check for (err), not
(err < 0), simply because it looks clearer, e.g. look at the
__kmem_cache_create() calls. If it returns a positive value one day, we
will have to audit every place where it's called anyway: whoever changes
a function's behavior must check every call site and fix it accordingly.

Thanks.


Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 12:44 PM, Michal Hocko wrote:
> On Thu 19-12-13 10:31:43, Vladimir Davydov wrote:
>> On 12/18/2013 08:56 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:52, Vladimir Davydov wrote:
>>>> Signed-off-by: Vladimir Davydov 
>>>> Cc: Michal Hocko 
>>>> Cc: Johannes Weiner 
>>>> Cc: Glauber Costa 
>>>> Cc: Christoph Lameter 
>>>> Cc: Pekka Enberg 
>>>> Cc: Andrew Morton 
>>> Dunno, is this really better to be worth the code churn?
>>>
>>> It even makes the generated code tiny bit bigger:
>>>    text    data     bss     dec     hex filename
>>>    4355     171     236    4762    129a mm/slab_common.o.after
>>>    4342     171     236    4749    128d mm/slab_common.o.before
>>>
>>> Or does it make the further changes much more easier? Be explicit in the
>>> patch description if so.
>> Hi, Michal
>>
>> IMO, undoing under labels looks better than inside conditionals, because
>> we don't have to repeat the same deinitialization code then, like this
>> (note three calls to kmem_cache_free()):
> Agreed but the resulting code is far from doing nice undo on different
> conditions. You have out_free_cache which frees everything regardless
> whether name or cache registration failed. So it doesn't help with
> readability much IMO.

AFAIK it's common practice not to split kfree() calls across different
labels on failure paths, because kfree(NULL) is a no-op. Since the undo
path only calls kfree(), I introduce a single label. Of course I could
do something like

s->name=...
if (!s->name)
goto out_free_name;
err = __kmem_new_cache(...)
if (err)
goto out_free_name;
<...>
out_free_name:
kfree(s->name);
out_free_cache:
kfree(s);
goto out_unlock;

But I think using only out_free_cache makes the code look clearer.
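With the single label, the same failure paths collapse into roughly
this (sketch):

    s->name = kstrdup(name, GFP_KERNEL);
    if (!s->name) {
        err = -ENOMEM;
        goto out_free_cache;
    }

    err = __kmem_cache_create(s, flags);
    if (err)
        goto out_free_cache;
    <...>
out_free_cache:
    kfree(s->name);    /* kfree(NULL) is a no-op */
    kmem_cache_free(kmem_cache, s);
    goto out_unlock;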

>
>> s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>> if (s) {
>> s->object_size = s->size = size;
>> s->align = calculate_alignment(flags, align, size);
>> s->ctor = ctor;
>>
>> if (memcg_register_cache(memcg, s, parent_cache)) {
>> kmem_cache_free(kmem_cache, s);
>> err = -ENOMEM;
>> goto out_locked;
>> }
>>
>> s->name = kstrdup(name, GFP_KERNEL);
>> if (!s->name) {
>> kmem_cache_free(kmem_cache, s);
>> err = -ENOMEM;
>> goto out_locked;
>> }
>>
>> err = __kmem_cache_create(s, flags);
>> if (!err) {
>> s->refcount = 1;
>> list_add(&s->list, &slab_caches);
>> memcg_cache_list_add(memcg, s);
>> } else {
>> kfree(s->name);
>> kmem_cache_free(kmem_cache, s);
>> }
>> } else
>> err = -ENOMEM;
>>
>> The next patch, which fixes the memcg_params leakage on error, would
>> make it even worse introducing two calls to memcg_free_cache_params()
>> after kstrdup and __kmem_cache_create.
>>
>> If you think it isn't worthwhile applying this patch, just let me know,
>> I don't mind dropping it.
> As I've said if it helps with the later patches then I do not mind but
> on its own it doesn't sound like a huge improvement.
>
> Btw. you do not have to set err = -ENOMEM before goto out_locked. Just
> set before kmem_cache_zalloc. You also do not need to initialize it to 0
> because kmem_cache_sanity_check will set it.

OK, thanks.


Re: [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 12:48 PM, Michal Hocko wrote:
> On Thu 19-12-13 10:32:29, Vladimir Davydov wrote:
>> On 12/18/2013 09:06 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:53, Vladimir Davydov wrote:
>>>> Plus, rename memcg_register_cache() to memcg_init_cache_params(),
>>>> because it actually does not register the cache anywhere, but simply
>>>> initialize kmem_cache::memcg_params.
>>> I've almost missed this is a memory leak fix.
>> Yeah, the comment is poor, sorry about that. Will fix it.
>>
>>> I do not mind renaming and the name but wouldn't
>>> memcg_alloc_cache_params suit better?
>> As you wish. I don't have a strong preference for memcg_init_cache_params.
> I really hate naming... but it seems that alloc is a better fit. _init_
> would expect an already allocated object.
>
> Btw. memcg_free_cache_params is called only once which sounds
> suspicious. The regular destroy path should use it as well?
> [...]

The usual destroy path uses memcg_release_cache(), which does the trick.
Plus, it actually "unregisters" the cache. BTW, I forgot to substitute
kfree(s->memcg_params) with the new memcg_free_cache_params() there.
Although it currently does not break anything, it's better to fix it in
case memcg_free_cache_params() ever has to do more than a plain kfree().
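i.e. just (sketch):

     void memcg_release_cache(struct kmem_cache *s)
     {
        <...>
    -   kfree(s->memcg_params);
    +   memcg_free_cache_params(s);
     }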

And you're right, the naming is not good.

Currently we have:

  on create:
memcg_register_cache()
memcg_cache_list_add()
  on destroy:
memcg_release_cache()

After this patch we would have:

  on create:
memcg_alloc_cache_params()
memcg_register_cache()
  on destroy:
memcg_release_cache()

Still not perfect: "alloc" has no corresponding "free", and "register"
has no corresponding "unregister" - everything is done by "release".

What do you think about splitting memcg_release_cache() into two functions:

memcg_unregister_cache()
memcg_free_cache_params()

?
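That is, roughly (only to illustrate the split; the body of
memcg_unregister_cache() would be everything memcg_release_cache() does
today except freeing memcg_params):

    void memcg_unregister_cache(struct kmem_cache *s)
    {
        /* everything memcg_release_cache() does today,
         * except kfree(s->memcg_params) */
    }

    void memcg_free_cache_params(struct kmem_cache *s)
    {
        kfree(s->memcg_params);
    }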

Thanks.


Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 01:10 PM, Michal Hocko wrote:
> On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
>> On 12/18/2013 09:14 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>>>> First, in memcg_create_kmem_cache() we should issue the write barrier
>>>> after the kmem_cache is initialized, but before storing the pointer to
>>>> it in its parent's memcg_params.
>>>>
>>>> Second, we should always issue the read barrier after
>>>> cache_from_memcg_idx() to conform with the write barrier.
>>>>
>>>> Third, its better to use smp_* versions of barriers, because we don't
>>>> need them on UP systems.
>>> Please be (much) more verbose on Why. Barriers are tricky and should be
>>> documented accordingly. So if you say that we should issue a barrier
>>> always be specific why we should do it.
>> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
>> pointers to per-memcg caches. We access it lock-free so we should use
>> memory barriers during initialization. Obviously we should place a write
>> barrier just before we set the pointer in order to make sure nobody will
>> see a partially initialized structure. Besides there must be a read
>> barrier between reading the pointer and accessing the structure, to
>> conform with the write barrier. It's all that similar to rcu_assign and
>> rcu_deref. Currently the barrier usage looks rather strange:
>>
>> memcg_create_kmem_cache:
>> initialize kmem
>> set the pointer in memcg_caches
>> wmb() // ???
>>
>> __memcg_kmem_get_cache:
>> <...>
>> read_barrier_depends() // ???
>> cachep = root_cache->memcg_params->memcg_caches[memcg_id]
>> <...>
> Why do we need explicit memory barriers when we can use RCU?
> __memcg_kmem_get_cache already dereferences within rcu_read_lock.

Because it's not RCU, IMO. RCU implies freeing the old version after a
grace period, while kmem_caches are freed immediately. We simply want to
be sure the kmem_cache is fully initialized. And we do not require
calling this in an RCU critical section.

> Btw. cache_from_memcg_idx is desperately asking for a comment about
> required locking.

Actually, I placed a reference to the comment there ;-) but no problem,
I'll move it to cache_from_memcg_idx().

Thanks.


Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 01:12 PM, Michal Hocko wrote:
> On Thu 19-12-13 12:00:58, Glauber Costa wrote:
>> On Thu, Dec 19, 2013 at 11:07 AM, Vladimir Davydov
>>  wrote:
>>> On 12/18/2013 09:41 PM, Michal Hocko wrote:
>>>> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>>>>> The memcg_params::memcg_caches array can be updated concurrently from
>>>>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>>>>> of these functions take the slab_mutex during their operation, the
>>>>> latter checks if memcg's cache has already been allocated w/o taking the
>>>>> mutex. This can result in a race as described below.
>>>>>
>>>>> Asume two threads schedule kmem_cache creation works for the same
>>>>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>>>>> works successfully creates it. Another work should fail then, but if it
>>>>> interleaves with memcg_update_cache_size() as follows, it does not:
>>>> I am not sure I understand the race. memcg_update_cache_size is called
>>>> when we start accounting a new memcg or a child is created and it
>>>> inherits accounting from the parent. memcg_create_kmem_cache is called
>>>> when a new cache is first allocated from, right?
>>> memcg_update_cache_size() is called when kmem accounting is activated
>>> for a memcg, no matter how.
>>>
>>> memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
>>> It's OK to have a bunch of such methods trying to create the same memcg
>>> cache concurrently, but only one of them should succeed.
>>>
>>>> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
>>>> it is running from the workqueue context so it should clash with other
>>>> locks.
>>> Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
>>> have always been wondering why, because it could simplify flow paths
>>> significantly (e.g. update_cache_sizes() -> update_all_caches() ->
>>> update_cache_size() - from memcontrol.c to slab_common.c and back again
>>> just to take the mutex).
>>>
>> Because that is a layering violation and exposes implementation
>> details of the slab to
>> the outside world. I agree this would make things a lot simpler, but
>> please check with Christoph
>> if this is acceptable before going forward.
> We do not have to expose the lock directly. We can hide it behind a
> helper function. Relying on the lock silently at many places is worse
> then expose it IMHO.

BTW, the lock is already exposed by mm/slab.h, which is included into
mm/memcontrol.c :-) So we have immediate access to the lock right now.

Thanks.


Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex

2013-12-19 Thread Vladimir Davydov
Hi, Christoph

We have a problem with memcg-vs-slab interactions. Currently we set the
pointer to a new kmem_cache in its parent's memcg_caches array inside
memcg_create_kmem_cache() (mm/memcontrol.c):

memcg_create_kmem_cache():
new_cachep = cache_from_memcg_idx(cachep, idx);
if (new_cachep)
goto out;
new_cachep = kmem_cache_dup(memcg, cachep);
cachep->memcg_params->memcg_caches[idx] = new_cachep;

It seems to be prone to a race as explained in the comment to this
patch. To fix the race, we need to move the assignment of new_cachep to
memcg_caches[idx] to be called under the slab_mutex protection.

There are basically two ways of doing this:

1. Move the assignment to kmem_cache_create_memcg() defined in
mm/slab.c. This is how this patch handles it.
2. Move taking of the slab_mutex, along with some memcg-specific
initialization bits, from kmem_cache_create_memcg() to
memcg_create_kmem_cache().

The second way, although it looks clearer, would break the convention of
not taking the slab_mutex inside mm/memcontrol.c, which Glauber tried to
observe while implementing kmemcg.

So the question is: what do you think about taking the slab_mutex
directly from mm/memcontrol.c w/o using helper functions (confusing call
paths, IMO)?

Thanks.

On 12/19/2013 12:00 PM, Glauber Costa wrote:
> On Thu, Dec 19, 2013 at 11:07 AM, Vladimir Davydov
>  wrote:
>> On 12/18/2013 09:41 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>>>> The memcg_params::memcg_caches array can be updated concurrently from
>>>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>>>> of these functions take the slab_mutex during their operation, the
>>>> latter checks if memcg's cache has already been allocated w/o taking the
>>>> mutex. This can result in a race as described below.
>>>>
>>>> Asume two threads schedule kmem_cache creation works for the same
>>>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>>>> works successfully creates it. Another work should fail then, but if it
>>>> interleaves with memcg_update_cache_size() as follows, it does not:
>>> I am not sure I understand the race. memcg_update_cache_size is called
>>> when we start accounting a new memcg or a child is created and it
>>> inherits accounting from the parent. memcg_create_kmem_cache is called
>>> when a new cache is first allocated from, right?
>> memcg_update_cache_size() is called when kmem accounting is activated
>> for a memcg, no matter how.
>>
>> memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
>> It's OK to have a bunch of such methods trying to create the same memcg
>> cache concurrently, but only one of them should succeed.
>>
>>> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
>>> it is running from the workqueue context so it should clash with other
>>> locks.
>> Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
>> have always been wondering why, because it could simplify flow paths
>> significantly (e.g. update_cache_sizes() -> update_all_caches() ->
>> update_cache_size() - from memcontrol.c to slab_common.c and back again
>> just to take the mutex).
>>
> Because that is a layering violation and exposes implementation
> details of the slab to
> the outside world. I agree this would make things a lot simpler, but
> please check with Christoph
> if this is acceptable before going forward.


Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 01:21 PM, Michal Hocko wrote:
> On Thu 19-12-13 13:16:01, Vladimir Davydov wrote:
>> On 12/19/2013 01:10 PM, Michal Hocko wrote:
>>> On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
>>>> On 12/18/2013 09:14 PM, Michal Hocko wrote:
>>>>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>>>>>> First, in memcg_create_kmem_cache() we should issue the write barrier
>>>>>> after the kmem_cache is initialized, but before storing the pointer to
>>>>>> it in its parent's memcg_params.
>>>>>>
>>>>>> Second, we should always issue the read barrier after
>>>>>> cache_from_memcg_idx() to conform with the write barrier.
>>>>>>
>>>>>> Third, its better to use smp_* versions of barriers, because we don't
>>>>>> need them on UP systems.
>>>>> Please be (much) more verbose on Why. Barriers are tricky and should be
>>>>> documented accordingly. So if you say that we should issue a barrier
>>>>> always be specific why we should do it.
>>>> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
>>>> pointers to per-memcg caches. We access it lock-free so we should use
>>>> memory barriers during initialization. Obviously we should place a write
>>>> barrier just before we set the pointer in order to make sure nobody will
>>>> see a partially initialized structure. Besides there must be a read
>>>> barrier between reading the pointer and accessing the structure, to
>>>> conform with the write barrier. It's all that similar to rcu_assign and
>>>> rcu_deref. Currently the barrier usage looks rather strange:
>>>>
>>>> memcg_create_kmem_cache:
>>>> initialize kmem
>>>> set the pointer in memcg_caches
>>>> wmb() // ???
>>>>
>>>> __memcg_kmem_get_cache:
>>>> <...>
>>>> read_barrier_depends() // ???
>>>> cachep = root_cache->memcg_params->memcg_caches[memcg_id]
>>>> <...>
>>> Why do we need explicit memory barriers when we can use RCU?
>>> __memcg_kmem_get_cache already dereferences within rcu_read_lock.
>> Because it's not RCU, IMO. RCU implies freeing the old version after a
>> grace period, while kmem_caches are freed immediately. We simply want to
>> be sure the kmem_cache is fully initialized. And we do not require
>> calling this in an RCU critical section.
> And you can use rcu_dereference and rcu_assign for that as well.

rcu_dereference() will complain if called outside an RCU critical
section, while cache_from_memcg_idx() is called w/o RCU protection from
some places.

> It hides all the juicy details about memory barriers.

IMO, a memory barrier with a good comment looks better than an
rcu_dereference() without RCU protection :-)

> Besides that nothing prevents us from freeing from rcu callback. Or?

It's overhead we can live without there. The point is that we can only
access a cache while it is active: no allocation can come from a cache
that has already been destroyed - that would be a bug. So there is no
point in introducing RCU protection for kmem_caches there. It would only
confuse things, IMO.

Thanks.


Re: [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 01:28 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:57, Vladimir Davydov wrote:
>> We update root cache's memcg_params whenever we need to grow the
>> memcg_caches array to accommodate all kmem-active memory cgroups.
>> Currently we free the old version immediately then, which can lead to
>> use-after-free, because the memcg_caches array is accessed lock-free.
>> This patch fixes this by making memcg_params RCU-protected.
> yes, I was thinking about something like this when talking about RCU
> usage.

Not exactly (if you mean your replies to this series). We do not protect
kmem_caches, but we do protect the memcg_caches array, which can grow.

>
>> Signed-off-by: Vladimir Davydov 
>> Cc: Michal Hocko 
>> Cc: Johannes Weiner 
>> Cc: Glauber Costa 
>> Cc: Christoph Lameter 
>> Cc: Pekka Enberg 
>> Cc: Andrew Morton 
>> ---
>>  include/linux/slab.h |5 -
>>  mm/memcontrol.c  |   15 ---
>>  mm/slab.h|8 +++-
>>  3 files changed, 19 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index 1e2f4fe..f7e5649 100644
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -528,7 +528,10 @@ static __always_inline void *kmalloc_node(size_t size, 
>> gfp_t flags, int node)
>>  struct memcg_cache_params {
>>  bool is_root_cache;
>>  union {
>> -struct kmem_cache *memcg_caches[0];
>> +struct {
>> +struct rcu_head rcu_head;
>> +struct kmem_cache *memcg_caches[0];
>> +};
>>  struct {
>>  struct mem_cgroup *memcg;
>>  struct list_head list;
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index ad8de6a..379fc5f 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3142,18 +3142,17 @@ int memcg_update_cache_size(struct kmem_cache *s, 
>> int num_groups)
>>  
>>  if (num_groups > memcg_limited_groups_array_size) {
>>  int i;
>> +struct memcg_cache_params *new_params;
>>  ssize_t size = memcg_caches_array_size(num_groups);
>>  
>>  size *= sizeof(void *);
>>  size += offsetof(struct memcg_cache_params, memcg_caches);
>>  
>> -s->memcg_params = kzalloc(size, GFP_KERNEL);
>> -if (!s->memcg_params) {
>> -s->memcg_params = cur_params;
>> +new_params = kzalloc(size, GFP_KERNEL);
>> +if (!new_params)
>>  return -ENOMEM;
>> -}
>>  
>> -s->memcg_params->is_root_cache = true;
>> +new_params->is_root_cache = true;
>>  
>>  /*
>>   * There is the chance it will be bigger than
>> @@ -3167,7 +3166,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
>> num_groups)
>>  for (i = 0; i < memcg_limited_groups_array_size; i++) {
>>  if (!cur_params->memcg_caches[i])
>>  continue;
>> -s->memcg_params->memcg_caches[i] =
>> +new_params->memcg_caches[i] =
>>  cur_params->memcg_caches[i];
>>  }
>>  
>> @@ -3180,7 +3179,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
>> num_groups)
>>   * bigger than the others. And all updates will reset this
>>   * anyway.
>>   */
>> -kfree(cur_params);
>> +rcu_assign_pointer(s->memcg_params, new_params);
>> +if (cur_params)
>> +kfree_rcu(cur_params, rcu_head);
>>  }
>>  return 0;
>>  }
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 1d8b53f..53b81a9 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -164,10 +164,16 @@ static inline struct kmem_cache *
>>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>>  {
>>  struct kmem_cache *cachep;
>> +struct memcg_cache_params *params;
>>  
>>  if (!s->memcg_params)
>>  return NULL;
>> -cachep = s->memcg_params->memcg_caches[idx];
>> +
>> +rcu_read_lock();
>> +params = rcu_dereference(s->memcg_params);
>> +cachep = params->memcg_caches[idx];
>> +rcu_read_unlock();
>> +
> Consumer has to be covered by the

Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 01:26 PM, Vasily Averin wrote:
> On 12/19/2013 12:39 PM, Vladimir Davydov wrote:
>> On 12/19/2013 12:17 PM, Vasily Averin wrote:
>>> On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
>>>> --- a/mm/slab_common.c
>>>> +++ b/mm/slab_common.c
>>>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, 
>>>> const char *name, size_t size,
>>>>get_online_cpus();
>>>>mutex_lock(&slab_mutex);
>>>>  
>>>> -  if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>>>> -  goto out_locked;
>>>> +  err = kmem_cache_sanity_check(memcg, name, size);
>>>> +  if (err)
>>>> +  goto out_unlock;
>>>>  
>>>>/*
>>>> * Some allocators will constraint the set of valid flags to a subset
>>> Theoretically in future kmem_cache_sanity_check() can return positive value.
>>> Probably it's better to check (err < 0) in caller ?
>> Hmm, why? What information could positive retval carry here? We have
>> plenty of places throughout the code where we check for (err), not
>> (err<0), simply because it looks clearer, e.g. look at
>> __kmem_cache_create() calls. If it returns a positive value one day, we
>> will have to parse every place where it's called. Anyway, if someone
>> wants to change a function behavior, he must check every place where
>> this function is called and fix them accordingly.
> I believe expected semantic of function -- return negative in case of error.
> So correct error cheek should be (err < 0).
> (err) check is semantically incorrect, and it can lead to troubles in future.

You are free to use the "correct" check then, but making everyone do so
would be too painful ;-)

linux-tip$ grep -rI '^\s*if (err)' . | wc -l
13631
linux-tip$ grep -rI '^\s*if (err\s*<\s*0)' . | wc -l
5449

Thanks.


Re: [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 01:43 PM, Michal Hocko wrote:
> On Thu 19-12-13 13:36:42, Vladimir Davydov wrote:
>> On 12/19/2013 01:28 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:57, Vladimir Davydov wrote:
> [...]
>>>> diff --git a/mm/slab.h b/mm/slab.h
>>>> index 1d8b53f..53b81a9 100644
>>>> --- a/mm/slab.h
>>>> +++ b/mm/slab.h
>>>> @@ -164,10 +164,16 @@ static inline struct kmem_cache *
>>>>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>>>>  {
>>>>struct kmem_cache *cachep;
>>>> +  struct memcg_cache_params *params;
>>>>  
>>>>if (!s->memcg_params)
>>>>return NULL;
>>>> -  cachep = s->memcg_params->memcg_caches[idx];
>>>> +
>>>> +  rcu_read_lock();
>>>> +  params = rcu_dereference(s->memcg_params);
>>>> +  cachep = params->memcg_caches[idx];
>>>> +  rcu_read_unlock();
>>>> +
>>> Consumer has to be covered by the same rcu section otherwise
>>> memcg_params might be freed right after rcu unlock here.
>> No. We protect only accesses to kmem_cache::memcg_params, which can
>> potentially be relocated for root caches.
> Hmm, ok. So memcg_params might change (a new memcg is accounted) but
> pointers at idx will be same, right?

Yes, that's a classical Read-Copy-Update :-)

>
>> But as soon as we get the
>> pointer to a kmem_cache from this array, we can freely dereference it,
>> because the cache cannot be freed when we use it. This is, because we
>> access a kmem_cache either under the slab_mutex or
>> memcg->slab_caches_mutex, or when we allocate/free from it. While doing
>> the latter, the cache can't go away, it would be a bug. IMO.
> That expects that cache_from_memcg_idx is always called with slab_mutex
> or slab_caches_mutex held, right? Please document it.

Yeah, you're right, this calls for documentation. I'm going to check
this code a bit more and try to write a good comment about it (although
I'm rather poor at writing comments :-( )
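Something along these lines, perhaps (a rough draft of the comment, not
final wording):

    /*
     * cache_from_memcg_idx() returns the cache of the root cache s that
     * corresponds to the memcg with the given id, or NULL.
     *
     * The caller must guarantee the returned cache cannot go away: hold
     * slab_mutex or memcg->slab_caches_mutex, or be in the middle of an
     * allocation from / free to this cache.  s->memcg_params itself is
     * dereferenced under RCU, because the array it points to may be
     * relocated when it grows.
     */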

Thanks.


Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches

2013-12-19 Thread Vladimir Davydov
On 12/19/2013 01:36 PM, Michal Hocko wrote:
> On Thu 19-12-13 13:29:59, Vladimir Davydov wrote:
>> On 12/19/2013 01:21 PM, Michal Hocko wrote:
>>> On Thu 19-12-13 13:16:01, Vladimir Davydov wrote:
>>>> On 12/19/2013 01:10 PM, Michal Hocko wrote:
>>>>> On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
>>>>>> On 12/18/2013 09:14 PM, Michal Hocko wrote:
>>>>>>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>>>>>>>> First, in memcg_create_kmem_cache() we should issue the write barrier
>>>>>>>> after the kmem_cache is initialized, but before storing the pointer to
>>>>>>>> it in its parent's memcg_params.
>>>>>>>>
>>>>>>>> Second, we should always issue the read barrier after
>>>>>>>> cache_from_memcg_idx() to conform with the write barrier.
>>>>>>>>
>>>>>>>> Third, its better to use smp_* versions of barriers, because we don't
>>>>>>>> need them on UP systems.
>>>>>>> Please be (much) more verbose on Why. Barriers are tricky and should be
>>>>>>> documented accordingly. So if you say that we should issue a barrier
>>>>>>> always be specific why we should do it.
>>>>>> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
>>>>>> pointers to per-memcg caches. We access it lock-free so we should use
>>>>>> memory barriers during initialization. Obviously we should place a write
>>>>>> barrier just before we set the pointer in order to make sure nobody will
>>>>>> see a partially initialized structure. Besides there must be a read
>>>>>> barrier between reading the pointer and accessing the structure, to
>>>>>> conform with the write barrier. It's all that similar to rcu_assign and
>>>>>> rcu_deref. Currently the barrier usage looks rather strange:
>>>>>>
>>>>>> memcg_create_kmem_cache:
>>>>>> initialize kmem
>>>>>> set the pointer in memcg_caches
>>>>>> wmb() // ???
>>>>>>
>>>>>> __memcg_kmem_get_cache:
>>>>>> <...>
>>>>>> read_barrier_depends() // ???
>>>>>> cachep = root_cache->memcg_params->memcg_caches[memcg_id]
>>>>>> <...>
>>>>> Why do we need explicit memory barriers when we can use RCU?
>>>>> __memcg_kmem_get_cache already dereferences within rcu_read_lock.
>>>> Because it's not RCU, IMO. RCU implies freeing the old version after a
>>>> grace period, while kmem_caches are freed immediately. We simply want to
>>>> be sure the kmem_cache is fully initialized. And we do not require
>>>> calling this in an RCU critical section.
>>> And you can use rcu_dereference and rcu_assign for that as well.
>> rcu_dereference() will complain if called outside an RCU critical
>> section, while cache_from_memcg_idx() is called w/o RCU protection from
>> some places.
> Does anything prevents us from using RCU from those callers as well?

Yes, take a look at kmem_cache_destroy_memcg_children(), for instance.
We call cancel_work_sync() there on a cache obtained via
cache_from_memcg_idx().

Thanks.


Re: [PATCH v2] slub: Do not assert not having lock in removing freed partial

2014-02-05 Thread Vladimir Davydov
On 02/06/2014 07:21 AM, Steven Rostedt wrote:
> Vladimir reported the following issue:
>
> Commit c65c1877bd68 ("slub: use lockdep_assert_held") requires
> remove_partial() to be called with n->list_lock held, but free_partial()
> called from kmem_cache_close() on cache destruction does not follow this
> rule, leading to a warning:
>
>   WARNING: CPU: 0 PID: 2787 at mm/slub.c:1536 
> __kmem_cache_shutdown+0x1b2/0x1f0()
>   Modules linked in:
>   CPU: 0 PID: 2787 Comm: modprobe Tainted: GW3.14.0-rc1-mm1+ #1
>   Hardware name:
>0600 88003ae1dde8 816d9583 0600
> 88003ae1de28 8107c107 
>880037ab2b00 88007c240d30 ea0001ee5280 ea0001ee52a0
>   Call Trace:
>[] dump_stack+0x51/0x6e
>[] warn_slowpath_common+0x87/0xb0
>[] warn_slowpath_null+0x15/0x20
>[] __kmem_cache_shutdown+0x1b2/0x1f0
>[] kmem_cache_destroy+0x43/0xf0
>[] xfs_destroy_zones+0x103/0x110 [xfs]
>[] exit_xfs_fs+0x38/0x4e4 [xfs]
>[] SyS_delete_module+0x19a/0x1f0
>[] ? retint_swapgs+0x13/0x1b
>[] ? trace_hardirqs_on_caller+0x105/0x1d0
>[] ? trace_hardirqs_on_thunk+0x3a/0x3f
>[] system_call_fastpath+0x16/0x1b
>
>
> His solution was to add a spinlock in order to quiet lockdep. Although
> there would be no contention to adding the lock, that lock also
> requires disabling of interrupts which will have a larger impact on the
> system.
>
> Instead of adding a spinlock to a location where it is not needed for
> lockdep, make a __remove_partial() function that does not test if
> the list_lock is held, as no one should have it due to it being freed.
>
> Also added a __add_partial() function that does not do the lock validation
> either, as it is not needed for the creation of the cache.
>
> Suggested-by: David Rientjes 
> Reported-by: Vladimir Davydov 
> Signed-off-by: Steven Rostedt 
>
> Index: linux-trace.git/mm/slub.c
> ===
> --- linux-trace.git.orig/mm/slub.c
> +++ linux-trace.git/mm/slub.c
> @@ -1520,11 +1520,9 @@ static void discard_slab(struct kmem_cac
>  /*
>   * Management of partially allocated slabs.
>   */
> -static inline void add_partial(struct kmem_cache_node *n,
> - struct page *page, int tail)
> +static inline void
> +__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
>  {
> - lockdep_assert_held(&n->list_lock);
> -
>   n->nr_partial++;
>   if (tail == DEACTIVATE_TO_TAIL)
>   list_add_tail(&page->lru, &n->partial);
> @@ -1532,15 +1530,27 @@ static inline void add_partial(struct km
>   list_add(&page->lru, &n->partial);
>  }
>  
> -static inline void remove_partial(struct kmem_cache_node *n,
> - struct page *page)
> +static inline void add_partial(struct kmem_cache_node *n,
> + struct page *page, int tail)
>  {
>   lockdep_assert_held(&n->list_lock);
> + __add_partial(n, page, tail);
> +}
>  
> +static inline void
> +__remove_partial(struct kmem_cache_node *n, struct page *page)
> +{
>   list_del(&page->lru);
>   n->nr_partial--;
>  }
>  
> +static inline void remove_partial(struct kmem_cache_node *n,
> + struct page *page)
> +{
> + lockdep_assert_held(&n->list_lock);
> + __remove_partial(n, page);
> +}
> +
>  /*
>   * Remove slab from the partial list, freeze it and
>   * return the pointer to the freelist.
> @@ -2906,12 +2916,10 @@ static void early_kmem_cache_node_alloc(
>   inc_slabs_node(kmem_cache_node, node, page->objects);
>  
>   /*
> -  * the lock is for lockdep's sake, not for any actual
> -  * race protection
> +  * No locks need to be taken here as it has just been
> +  * initialized and there is no concurrent access.
>*/
> - spin_lock(&n->list_lock);
> - add_partial(n, page, DEACTIVATE_TO_HEAD);
> - spin_unlock(&n->list_lock);
> + __add_partial(n, page, DEACTIVATE_TO_HEAD);
>  }
>  
>  static void free_kmem_cache_nodes(struct kmem_cache *s)
> @@ -3197,7 +3205,7 @@ static void free_partial(struct kmem_cac
>  
>   list_for_each_entry_safe(page, h, &n->partial, lru) {
>   if (!page->inuse) {
> - remove_partial(n, page);
> + __remove_partial(n, page);
>   discard_slab(s, page);
>   } else {
>   list_slab_objects(s, page,

Looks neat.

FWIW,

Acked-by: Vladimir Davydov 

Thanks.


Re: [PATCH 3/8] memcg, slab: never try to merge memcg caches

2014-02-06 Thread Vladimir Davydov
On 02/06/2014 06:07 PM, Michal Hocko wrote:
> On Tue 04-02-14 19:27:19, Vladimir Davydov wrote:
> [...]
>> What does this patch change? Actually, it introduces no functional
>> changes - it only remove the code trying to find an alias for a memcg
>> cache, because it will fail anyway. So this is rather a cleanup.
> But this also means that two different memcgs might share the same cache
> and so the pages for that cache, no?

No, because in this patch I explicitly forbid merging memcg caches with
this hunk:

@@ -200,9 +200,11 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg,
const char *name, size_t size,
  */
 flags &= CACHE_CREATE_MASK;
 
-s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
-if (s)
-goto out_unlock;
+if (!memcg) {
+s = __kmem_cache_alias(name, size, align, flags, ctor);
+if (s)
+goto out_unlock;
+}

Thanks.

> Actually it would depend on timing
> because a new page would be chaged for the current allocator.
>
> cachep->memcg_params->memcg == memcg would prevent from such a merge
> previously AFAICS, or am I still confused?



Re: [PATCH 3/8] memcg, slab: never try to merge memcg caches

2014-02-06 Thread Vladimir Davydov
On 02/06/2014 07:29 PM, Michal Hocko wrote:
> On Thu 06-02-14 18:15:50, Vladimir Davydov wrote:
>> On 02/06/2014 06:07 PM, Michal Hocko wrote:
>>> On Tue 04-02-14 19:27:19, Vladimir Davydov wrote:
>>> [...]
>>>> What does this patch change? Actually, it introduces no functional
>>>> changes - it only remove the code trying to find an alias for a memcg
>>>> cache, because it will fail anyway. So this is rather a cleanup.
>>> But this also means that two different memcgs might share the same cache
>>> and so the pages for that cache, no?
>> No, because in this patch I explicitly forbid to merge memcg caches by
>> this hunk:
>>
>> @@ -200,9 +200,11 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg,
>> const char *name, size_t size,
>>   */
>>  flags &= CACHE_CREATE_MASK;
>>  
>> -s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
>> -if (s)
>> -goto out_unlock;
>> +if (!memcg) {
>> +s = __kmem_cache_alias(name, size, align, flags, ctor);
>> +if (s)
>> +goto out_unlock;
>> +}
> Ohh, that was the missing part. Thanks and sorry I have missed it.

Never mind.

> Maybe it is worth mentioning in the changelog?

Hmm, changelog? This hunk was there from the very beginning :-/

Anyway, I'm going to expand this patch's comment, because it's too short
and difficult to understand.

Thanks.


[PATCH RFC] slub: do not drop slab_mutex for sysfs_slab_{add,remove}

2014-02-06 Thread Vladimir Davydov
When creating/destroying a kmem cache, we do a lot of work holding the
slab_mutex, yet for some reason we drop it around sysfs_slab_{add,remove}.
Since __kmem_cache_create and __kmem_cache_shutdown are called extremely
rarely, I propose to simplify the locking by calling sysfs_slab_{add,remove}
without dropping the slab_mutex.

I'm interested in this because, when creating a memcg cache, I need the
slab_mutex held until the cache is fully initialized and registered with
the memcg subsystem (i.e. until memcg_cache_register() is called).
Otherwise I get races when several threads try to create a cache for the
same memcg. An alternative fix for my problem would be moving
sysfs_slab_{add,remove} to after the slab_mutex is dropped, but I'd like
to try the shortest path first.

Any objections to this?

Thanks.
---
 mm/slub.c |   15 +--
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 3d3a8a7a0f8c..6f4393892d2d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3229,19 +3229,8 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 {
int rc = kmem_cache_close(s);
 
-   if (!rc) {
-   /*
-* We do the same lock strategy around sysfs_slab_add, see
-* __kmem_cache_create. Because this is pretty much the last
-* operation we do and the lock will be released shortly after
-* that in slab_common.c, we could just move sysfs_slab_remove
-* to a later point in common code. We should do that when we
-* have a common sysfs framework for all allocators.
-*/
-   mutex_unlock(&slab_mutex);
+   if (!rc)
sysfs_slab_remove(s);
-   mutex_lock(&slab_mutex);
-   }
 
return rc;
 }
@@ -3772,9 +3761,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned 
long flags)
return 0;
 
memcg_propagate_slab_attrs(s);
-   mutex_unlock(&slab_mutex);
err = sysfs_slab_add(s);
-   mutex_lock(&slab_mutex);
 
if (err)
kmem_cache_close(s);
-- 
1.7.10.4



Re: [PATCH v2 3/7] memcg, slab: separate memcg vs root cache creation paths

2014-02-06 Thread Vladimir Davydov
On 02/06/2014 08:41 PM, Michal Hocko wrote:
> On Tue 04-02-14 23:19:24, Vladimir Davydov wrote:
>> On 02/04/2014 08:03 PM, Michal Hocko wrote:
>>> On Mon 03-02-14 19:54:38, Vladimir Davydov wrote:
>>>> Memcg-awareness turned kmem_cache_create() into a dirty interweaving of
>>>> memcg-only and except-for-memcg calls. To clean this up, let's create a
>>>> separate function handling memcg caches creation. Although this will
>>>> result in the two functions having several hunks of practically the same
>>>> code, I guess this is the case when readability fully covers the cost of
>>>> code duplication.
>>> I don't know. The code is apparently cleaner because calling a function
>>> with NULL memcg just to go via several if (memcg) branches is ugly as
>>> hell. But having a duplicated function like this calls for a problem
>>> later.
>>>
>>> Would it be possible to split kmem_cache_create into memcg independant
>>> part and do the rest in a single memcg branch?
>> May be, something like the patch attached?
>>
>>>  
>>>> Signed-off-by: Vladimir Davydov 
>>>> ---
>>>>  include/linux/memcontrol.h |   14 ++---
>>>>  include/linux/slab.h   |9 ++-
>>>>  mm/memcontrol.c|   16 ++
>>>>  mm/slab_common.c   |  130 
>>>> ++--
>>>>  4 files changed, 90 insertions(+), 79 deletions(-)
>>>>
>>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>>> index 84e4801fc36c..de79a9617e09 100644
>>>> --- a/include/linux/memcontrol.h
>>>> +++ b/include/linux/memcontrol.h
>>>> @@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
>>>>  
>>>>  char *memcg_create_cache_name(struct mem_cgroup *memcg,
>>>>  struct kmem_cache *root_cache);
>>>> -int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache 
>>>> *s,
>>>> -   struct kmem_cache *root_cache);
>>>> +int memcg_alloc_cache_params(struct kmem_cache *s,
>>>> +  struct mem_cgroup *memcg, struct kmem_cache *root_cache);
>>> Why is the parameters ordering changed? It really doesn't help
>>> review the patch.
>> Oh, this is because seeing something like
>>
>> memcg_alloc_cache_params(NULL, s, NULL);
>>
>> hurts my brain :-) I prefer to have NULLs in the end.
> the function still allocates parameters for the given memcg and cache
> and needs a reference to root cache so the ordering kind of makes sense
> to me.

All right, I'll leave it as is then; anyway, this hunk is absent from
this patch.

>  
>>> Also what does `s' stand for and can we use a more
>>> descriptive name, please?
>> Yes, we can call it `cachep', but it would be too long :-/
>>
>> `s' is the common name for a kmem_cache throughout mm/sl[au]b.c so I
>> guess it fits here. However, this function certainly needs a comment - I
>> guess I'll do it along with swapping the function parameters in a
>> separate patch.
> Yes, it seems that self explaining `s' is spread all over the place.
>
>> From 55f0916c794ad25a8bf45566f6d333bea956e0d4 Mon Sep 17 00:00:00 2001
>> From: Vladimir Davydov 
>> Date: Mon, 3 Feb 2014 19:18:22 +0400
>> Subject: [PATCH] memcg, slab: separate memcg vs root cache creation paths
>>
>> Memcg-awareness turned kmem_cache_create() into a dirty interweaving of
>> memcg-only and except-for-memcg calls. To clean this up, let's create a
>> separate function handling memcg caches creation. Although this will
>> result in the two functions having several hunks of practically the same
>> code, I guess this is the case when readability fully covers the cost of
>> code duplication.
>>
>> Signed-off-by: Vladimir Davydov 
> This looks better. The naming could still be little bit better because
> do_kmem_cache_create suggests that no memcg is involved but it at least
> reduced all the code duplication and nasty if(memcg) parts.
>
> Few minor comments bellow
>
>> ---
>>  include/linux/slab.h |9 ++-
>>  mm/memcontrol.c  |   12 +---
>>  mm/slab_common.c |  174 
>> +++---
>>  3 files changed, 101 insertions(+), 94 deletions(-)
>>
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index 9260abdd67df..e8c95d0bb879 100644
>> --- a/include/linux/

Re: [PATCH RFC] slub: do not drop slab_mutex for sysfs_slab_{add,remove}

2014-02-06 Thread Vladimir Davydov
On 02/06/2014 08:22 PM, Christoph Lameter wrote:
> On Thu, 6 Feb 2014, Vladimir Davydov wrote:
>
>> When creating/destroying a kmem cache, we do a lot of work holding the
>> slab_mutex, but we drop it for sysfs_slab_{add,remove} for some reason.
>> Since __kmem_cache_create and __kmem_cache_shutdown are extremely rare,
>> I propose to simplify locking by calling sysfs_slab_{add,remove} w/o
>> dropping the slab_mutex.
> The problem is that sysfs does nasty things like spawning a process in
> user space that may lead to something wanting to create slabs too. The
> module may then hang waiting on the lock ...

Hmm... IIUC the only function of concern is kobject_uevent() -
everything else called from sysfs_slab_{add,remove} is a mix of kmalloc,
kfree, mutex_lock/unlock - in short, nothing dangerous. There we do
call_usermodehelper(), but we do it with UMH_WAIT_EXEC, which means
"wait for exec only, but not for the process to complete". An exec
shouldn't issue any slab-related stuff AFAIU. At least, I tried to run
the patched kernel with lockdep enabled and got no warnings at all when
getting uevents about adding/removing caches. That's why I started to
doubt whether we really need this lock...

Please correct me if I'm wrong.

> I would be very thankful, if you can get that actually working reliably
> without deadlock issues.

If there is no choice other than moving sysfs_slab_{add,remove} out of
the slab_mutex critical section, I'll have to do it that way. But first
I'd like to make sure it cannot be done with a smaller footprint.

Thanks.


Re: [PATCH v2 3/7] memcg, slab: separate memcg vs root cache creation paths

2014-02-06 Thread Vladimir Davydov
On 02/06/2014 10:17 PM, Michal Hocko wrote:
> On Thu 06-02-14 21:12:51, Vladimir Davydov wrote:
>> On 02/06/2014 08:41 PM, Michal Hocko wrote:
> [...]
>>>> +int kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache 
>>>> *cachep)
>>>>  {
>>>> -  return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, 
>>>> NULL);
>>>> +  struct kmem_cache *s;
>>>> +  int err;
>>>> +
>>>> +  get_online_cpus();
>>>> +  mutex_lock(&slab_mutex);
>>>> +
>>>> +  /*
>>>> +   * Since per-memcg caches are created asynchronously on first
>>>> +   * allocation (see memcg_kmem_get_cache()), several threads can try to
>>>> +   * create the same cache, but only one of them may succeed.
>>>> +   */
>>>> +  err = -EEXIST;
>>> Does it make any sense to report the error here? If we are racing then at
>>> least on part wins and the work is done.
>> Yeah, you're perfectly right. It's better to return 0 here.
> Why not void?

Yeah, better to make it void for now, just to keep it clean. I guess if
one day we need an error code there (for accounting or error reporting),
we'll add it then, but currently there is no point in that.

>
>>> We should probably warn about errors which prevent from accounting but
>>> I do not think there is much more we can do so returning an error code
>>> from this function seems pointless. memcg_create_cache_work_func ignores
>>> the return value anyway.
>> I do not think warnings are appropriate here, because it is not actually
>> an error if we are short on memory and can't do proper memcg accounting
>> due to this. Perhaps, we'd better add fail counters for memcg cache
>> creations and/or accounting to the root cache instead of memcg's one.
>> That would be useful for debugging. I'm not sure though.
> warn on once per memcg would be probably sufficient but it would still
> be great if an admin could see that a memcg is not accounted although it
> is supposed to be. Scanning all the memcgs might be really impractical.
> We do not fail allocations needed for those object in the real life now
> but we shouldn't rely on that.

Hmm, an alert in dmesg first time kmem_cache_create_memcg() fails for a
particular memcg, just to draw attention, plus accounting of total
number of failures for each memcg so that admin could check if it's a
real problem... Sounds reasonable to me. I guess I'll handle it in a
separate patch a bit later.

Thanks.


Re: [PATCH 2/3] mm: vmscan: get rid of DEFAULT_SEEKS and document shrink_slab logic

2014-02-06 Thread Vladimir Davydov
On 02/06/2014 12:52 AM, Andrew Morton wrote:
> On Wed, 5 Feb 2014 11:16:49 +0400 Vladimir Davydov  
> wrote:
>
>>> So why did I originally make DEFAULT_SEEKS=2?  Because I figured that to
>>> recreate (say) an inode would require a seek to the inode data then a
>>> seek back.  Is it legitimate to include the
>>> seek-back-to-what-you-were-doing-before seek in the cost of an inode
>>> reclaim?  I guess so...
>> Hmm, that explains this 2. Since we typically don't need to "seek back"
>> when recreating a cache page, as they are usually read in bunches by
>> readahead, the number of seeks to bring back a user page is 1, while the
>> number of seeks to recreate an average inode is 2, right?
> Sounds right to me.
>
>> Then to scan inodes and user pages so that they would generate
>> approximately the same number of seeks, we should calculate the number
>> of objects to scan as follows:
>>
>> nr_objects_to_scan = nr_pages_scanned / lru_pages *
>> nr_freeable_objects /
>> shrinker->seeks
>>
>> where shrinker->seeks = DEFAULT_SEEKS = 2 for inodes.
> hm, I wonder if we should take the size of the object into account. 
> Should we be maximizing (memory-reclaimed / seeks-to-reestablish-it).

I'm not sure I understand you quite right. You mean that if two slab
caches have obj sizes 1k and 2k and both of them need 2 seeks to
recreate an object, we should scan the 1k (or 2k?) slab cache more
aggressively than the 2k one? Hmm... I don't know. It depends on what we
want to achieve. But this won't balance the seeks, which is our goal for
now, IIUC.

>> But currently we
>> have four times that. I can explain why we should multiply this by 2 -
>> we do not count pages moving from active to inactive lrus in
>> nr_pages_scanned, and 2*nr_pages_scanned can be a good approximation for
>> that - but I have no idea why we multiply it by 4...
> I don't understand this code at all:
>
>   total_scan = nr;
>   delta = (4 * nr_pages_scanned) / shrinker->seeks;
>   delta *= freeable;
>   do_div(delta, lru_pages + 1);
>   total_scan += delta;
>
> If it actually makes any sense, it sorely sorely needs documentation.

To find its roots I had to check out the linux history tree:

commit c3f4656118a78c1c294e0b4d338ac946265a822b
Author: Andrew Morton 
Date:   Mon Dec 29 23:48:44 2003 -0800

    [PATCH] shrink_slab acounts for seeks incorrectly

    wli points out that shrink_slab inverts the sense of shrinker->seeks:
    those caches which require more seeks to reestablish an object are
    shrunk harder.  That's wrong - they should be shrunk less.

    So fix that up, but scaling the result so that the patch is actually
    a no-op at this time, because all caches use DEFAULT_SEEKS (2).

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b8594827bbac..f2da3c9fb346 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,7 +154,7 @@ static int shrink_slab(long scanned, unsigned int
gfp_mask)
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
 
-   delta = scanned * shrinker->seeks;
+   delta = 4 * (scanned / shrinker->seeks);
delta *= (*shrinker->shrinker)(0, gfp_mask);
do_div(delta, pages + 1);
shrinker->nr += delta;


So the idea seemed to be fixing a bug without introducing any functional
changes. Since then we have been living with this "4", which makes no
sense (?). Nobody complained though.
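
For the record, here is a quick user-space sketch (plain C; the values
and the delta_old/delta_new helpers are made up purely for illustration)
showing that the two formulas really do coincide for DEFAULT_SEEKS == 2,
which is probably why the factor never got revisited:

#include <stdio.h>

#define DEFAULT_SEEKS 2

/* before commit c3f4656: caches needing more seeks were scanned harder */
static unsigned long long delta_old(unsigned long scanned, int seeks,
                                    unsigned long freeable,
                                    unsigned long lru_pages)
{
        return (unsigned long long)scanned * seeks * freeable /
               (lru_pages + 1);
}

/* after commit c3f4656: sense of ->seeks inverted, scaled by 4 so that
 * the result stays (modulo integer truncation) unchanged for
 * DEFAULT_SEEKS */
static unsigned long long delta_new(unsigned long scanned, int seeks,
                                    unsigned long freeable,
                                    unsigned long lru_pages)
{
        return 4ULL * (scanned / seeks) * freeable / (lru_pages + 1);
}

int main(void)
{
        unsigned long scanned = 1024, freeable = 10000, lru_pages = 100000;

        /* both print 204 for seeks == 2; they diverge for any other value */
        printf("old: %llu new: %llu\n",
               delta_old(scanned, DEFAULT_SEEKS, freeable, lru_pages),
               delta_new(scanned, DEFAULT_SEEKS, freeable, lru_pages));
        return 0;
}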

Thanks.

> David, you touched it last.  Any hints?



[PATCH 1/2] kobject: don't block for each kobject_uevent

2014-02-09 Thread Vladimir Davydov
Currently kobject_uevent has somewhat unpredictable semantics. The point
is, since it may call a usermode helper and wait for it to execute
(UMH_WAIT_EXEC), it is impossible to say for sure what lock dependencies
it will introduce for the caller - strictly speaking it depends on what
fs the binary is located on and the set of locks fork may take. There
are quite a few users of kobject_uevent that do not take this into
account and call it with various mutexes held, e.g. rtnl_mutex or
net_mutex, which might potentially lead to a deadlock.

Since there is actually no reason to wait for the usermode helper to
execute there, let's make kobject_uevent start the helper asynchronously
with the aid of the UMH_NO_WAIT flag.

Personally, I'm interested in this, because I really want kobject_uevent
to be called under the slab_mutex in the slub implementation as it used
to be some time ago, because it greatly simplifies synchronization and
automatically fixes a kmemcg-related race. However, there was a deadlock
detected on an attempt to call kobject_uevent under the slab_mutex (see
https://lkml.org/lkml/2012/1/14/45), which was reported to be fixed by
releasing the slab_mutex for kobject_uevent. Unfortunately, there was no
information about who exactly blocked on the slab_mutex causing the
usermode helper to stall, neither have I managed to find this out or
reproduce the issue.

BTW, this is not the first attempt to make kobject_uevent use
UMH_NO_WAIT. The previous one was made by commit f520360d93c, but it was
wrong (it passed arguments allocated on the stack to the async thread),
so it was reverted (commit 05f54c13cd0c). That attempt was aimed at
speeding up the boot process, though.

Signed-off-by: Vladimir Davydov 
---
 include/linux/kobject.h |1 +
 lib/kobject_uevent.c|   42 --
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 926afb6f6b5f..f896a33e8341 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -119,6 +119,7 @@ struct kobj_type {
 };
 
 struct kobj_uevent_env {
+   char *argv[3];
char *envp[UEVENT_NUM_ENVP];
int envp_idx;
char buf[UEVENT_BUFFER_SIZE];
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 5f72767ddd9b..21c39b32e2aa 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -124,6 +124,30 @@ static int kobj_usermode_filter(struct kobject *kobj)
return 0;
 }
 
+static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem)
+{
+   int len;
+
+   len = strlcpy(&env->buf[env->buflen], subsystem,
+ sizeof(env->buf) - env->buflen);
+   if (len >= (sizeof(env->buf) - env->buflen)) {
+   WARN(1, KERN_ERR "init_uevent_argv: buffer size too small\n");
+   return -ENOMEM;
+   }
+
+   env->argv[0] = uevent_helper;
+   env->argv[1] = &env->buf[env->buflen];
+   env->argv[2] = NULL;
+
+   env->buflen += len + 1;
+   return 0;
+}
+
+static void cleanup_uevent_env(struct subprocess_info *info)
+{
+   kfree(info->data);
+}
+
 /**
  * kobject_uevent_env - send an uevent with environmental data
  *
@@ -301,11 +325,8 @@ int kobject_uevent_env(struct kobject *kobj, enum 
kobject_action action,
 
/* call uevent_helper, usually only enabled during early boot */
if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
-   char *argv [3];
+   struct subprocess_info *info;
 
-   argv [0] = uevent_helper;
-   argv [1] = (char *)subsystem;
-   argv [2] = NULL;
retval = add_uevent_var(env, "HOME=/");
if (retval)
goto exit;
@@ -313,9 +334,18 @@ int kobject_uevent_env(struct kobject *kobj, enum 
kobject_action action,
"PATH=/sbin:/bin:/usr/sbin:/usr/bin");
if (retval)
goto exit;
+   retval = init_uevent_argv(env, subsystem);
+   if (retval)
+   goto exit;
 
-   retval = call_usermodehelper(argv[0], argv,
-env->envp, UMH_WAIT_EXEC);
+   retval = -ENOMEM;
+   info = call_usermodehelper_setup(env->argv[0], env->argv,
+env->envp, GFP_KERNEL,
+NULL, cleanup_uevent_env, env);
+   if (info)
+   retval = call_usermodehelper_exec(info, UMH_NO_WAIT);
+   if (!retval)
+   env = NULL; /* will be freed by cleanup_uevent_env */
}
 
 exit:
-- 
1.7.10.4


[PATCH 2/2] slub: do not drop slab_mutex for sysfs_slab_add

2014-02-09 Thread Vladimir Davydov
We release the slab_mutex while calling sysfs_slab_add from
__kmem_cache_create since commit 66c4c35c6bc5, because kobject_uevent
called by sysfs_slab_add might block waiting for the usermode helper to
exec, which would result in a deadlock if we took the slab_mutex while
executing it.

However, apart from complicating synchronization rules, releasing the
slab_mutex on kmem cache creation can result in a kmemcg-related race.
The point is that we check if the memcg cache exists before going to
__kmem_cache_create, but register the new cache in memcg subsys after
it. Since we can drop the mutex there, several threads can see that the
memcg cache does not exist and proceed to creating it, which is wrong.

Fortunately, recently kobject_uevent was patched to call the usermode
helper with the UMH_NO_WAIT flag, making the deadlock impossible.
Therefore there is no point in releasing the slab_mutex while calling
sysfs_slab_add, so let's simplify kmem_cache_create synchronization and
fix the kmemcg-race mentioned above by holding the slab_mutex during the
whole cache creation path.
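
To illustrate the race being closed (this is not the kernel code, just a
toy user-space model with made-up names like cache_registered), two
creators that drop the lock between the existence check and the
registration can both "create" the cache:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;
static int cache_registered;    /* stands for memcg_caches[idx] != NULL */
static int caches_created;      /* how many threads "created" the cache */

static void *creator(void *arg)
{
        pthread_mutex_lock(&slab_mutex);
        int exists = cache_registered;          /* the existence check */
        pthread_mutex_unlock(&slab_mutex);      /* dropped, as around
                                                   sysfs_slab_add today */
        if (!exists) {
                usleep(1000);                   /* widen the race window */
                pthread_mutex_lock(&slab_mutex);
                caches_created++;               /* both threads get here */
                cache_registered = 1;           /* the registration */
                pthread_mutex_unlock(&slab_mutex);
        }
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, creator, NULL);
        pthread_create(&t2, NULL, creator, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        printf("caches created: %d (expected 1)\n", caches_created);
        return 0;
}

With the mutex held across the whole path, the second creator would see
the cache already registered and back off, which is exactly what this
patch restores.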

Signed-off-by: Vladimir Davydov 
---
 mm/slub.c |8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 3d3a8a7a0f8c..0625fed32ce9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3231,8 +3231,9 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 
if (!rc) {
/*
-* We do the same lock strategy around sysfs_slab_add, see
-* __kmem_cache_create. Because this is pretty much the last
+* Since slab_attr_store may take the slab_mutex, we should
+* release the lock while removing the sysfs entry in order to
+* avoid a deadlock. Because this is pretty much the last
 * operation we do and the lock will be released shortly after
 * that in slab_common.c, we could just move sysfs_slab_remove
 * to a later point in common code. We should do that when we
@@ -3772,10 +3773,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned 
long flags)
return 0;
 
memcg_propagate_slab_attrs(s);
-   mutex_unlock(&slab_mutex);
err = sysfs_slab_add(s);
-   mutex_lock(&slab_mutex);
-
if (err)
kmem_cache_close(s);
 
-- 
1.7.10.4



Re: [PATCH v12 09/18] vmscan: shrink slab on memcg pressure

2013-12-04 Thread Vladimir Davydov
On 12/05/2013 09:01 AM, Dave Chinner wrote:
> On Wed, Dec 04, 2013 at 10:31:32AM +0400, Vladimir Davydov wrote:
>> On 12/04/2013 08:51 AM, Dave Chinner wrote:
>>> On Tue, Dec 03, 2013 at 04:15:57PM +0400, Vladimir Davydov wrote:
>>>> On 12/03/2013 02:48 PM, Dave Chinner wrote:
>>>>>> @@ -236,11 +236,17 @@ shrink_slab_node(struct shrink_control *shrinkctl, 
>>>>>> struct shrinker *shrinker,
>>>>>>  return 0;
>>>>>>  
>>>>>>  /*
>>>>>> - * copy the current shrinker scan count into a local variable
>>>>>> - * and zero it so that other concurrent shrinker invocations
>>>>>> - * don't also do this scanning work.
>>>>>> + * Do not touch global counter of deferred objects on memcg 
>>>>>> pressure to
>>>>>> + * avoid isolation issues. Ideally the counter should be 
>>>>>> per-memcg.
>>>>>>   */
>>>>>> -nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
>>>>>> +if (!shrinkctl->target_mem_cgroup) {
>>>>>> +/*
>>>>>> + * copy the current shrinker scan count into a local 
>>>>>> variable
>>>>>> + * and zero it so that other concurrent shrinker 
>>>>>> invocations
>>>>>> + * don't also do this scanning work.
>>>>>> + */
>>>>>> +nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
>>>>>> +}
>>>>> That's ugly. Effectively it means that memcg reclaim is going to be
>>>>> completely ineffective when large numbers of allocations and hence
>>>>> reclaim attempts are done under GFP_NOFS context.
>>>>>
>>>>> The only thing that keeps filesystem caches in balance when there is
>>>>> lots of filesystem work going on (i.e. lots of GFP_NOFS allocations)
>>>>> is the deferal of reclaim work to a context that can do something
>>>>> about it.
>>>> Imagine the situation: a memcg issues a GFP_NOFS allocation and goes to
>>>> shrink_slab() where it defers them to the global counter; then another
>>>> memcg issues a GFP_KERNEL allocation, also goes to shrink_slab() where
>>>> it sees a huge number of deferred objects and starts shrinking them,
>>>> which is not good IMHO.
>>> That's exactly what the deferred mechanism is for - we know we have
>>> to do the work, but we can't do it right now so let someone else do
>>> it who can.
>>>
>>> In most cases, deferral is handled by kswapd, because when a
>>> filesystem workload is causing memory pressure then most allocations
>>> are done in GFP_NOFS conditions. Hence the only memory reclaim that
>>> can make progress here is kswapd.
>>>
>>> Right now, you aren't deferring any of this memory pressure to some
>>> other agent, so it just does not get done. That's a massive problem
>>> - it's a design flaw - and instead I see lots of crazy hacks being
>>> added to do stuff that should simply be deferred to kswapd like is
>>> done for global memory pressure.
>>>
>>> Hell, kswapd shoul dbe allowed to walk memcg LRU lists and trim
>>> them, just like it does for the global lists. We only need a single
>>> "deferred work" counter per node for that - just let kswapd
>>> proportion the deferred work over the per-node LRU and the
>>> memcgs
>> Seems I misunderstand :-(
>>
>> Let me try. You mean we have the only nr_deferred counter per-node, and
>> kswapd scans
>>
>> nr_deferred*memcg_kmem_size/total_kmem_size
>>
>> objects in each memcg, right?
>>
>> Then if there were a lot of objects deferred on memcg (not global)
>> pressure due to a memcg issuing a lot of GFP_NOFS allocations, kswapd
>> will reclaim objects from all, even unlimited, memcgs. This looks like
>> an isolation issue :-/
> Which, when you are running out of memory, is a much less of an
> issue than not being able to make progress reclaiming memory.
>
> Besides, the "isolation" argument runs both ways. e.g. when there
> isn't memory available, it's entirely possible it's because there is
> actually no free memory, not because we've hit a memcg l

[PATCH] cgroup: fix bug on cgroup_create() fail path

2013-12-05 Thread Vladimir Davydov
If cgroup_create() fails to online_css() we will get a bug:

BUG: unable to handle kernel NULL pointer dereference at 0008
IP: [] cgroup_destroy_locked+0x118/0x2f0
PGD a780a067 PUD aadbe067 PMD 0
Oops:  [#1] SMP
Modules linked in:
CPU: 6 PID: 7360 Comm: mkdir Not tainted 3.13.0-rc2+ #69
Hardware name:
task: 8800b9dbec00 ti: 8800a781a000 task.ti: 8800a781a000
RIP: 0010:[]  [] 
cgroup_destroy_locked+0x118/0x2f0
RSP: 0018:8800a781bd98  EFLAGS: 00010282
RAX: 880586903878 RBX: 880586903800 RCX: 880586903820
RDX: 880586903860 RSI: 8800a781bdb0 RDI: 880586903820
RBP: 8800a781bde8 R08: 88060e0b8048 R09: 811d7bc1
R10: 008c R11: 0001 R12: 8800a72286c0
R13:  R14: 81cf7a40 R15: 0001
FS:  7f60ecda57a0() GS:8806272c() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 0008 CR3: a7a03000 CR4: 07e0
Stack:
 880586903860 880586903910 8800a72286c0 880586903820
 81cf7a40 880586903800 88060e0b8018 81cf7a40
 8800b9dbec00 8800b9dbf098 8800a781bec8 810ef5bf
Call Trace:
 [] cgroup_mkdir+0x55f/0x5f0
 [] vfs_mkdir+0xee/0x140
 [] SyS_mkdirat+0x6e/0xf0
 [] SyS_mkdir+0x19/0x20
 [] system_call_fastpath+0x16/0x1b

The point is that cgroup_destroy_locked(), which is called on the fail
path, assumes all css's have already been assigned to the cgroup, which
is not true, and calls kill_css() to destroy them.

The patch makes online_css() proceed to assigning the css to the cgroup
even if the subsys-specific css_online method fails - it only skips
setting the CSS_ONLINE flag then. Correspondingly, offline_css() skips
only the subsys-specific css_offline method if CSS_ONLINE is not set.
Besides, the patch makes cgroup_create() call online_css() for all css's
before going to cgroup_destroy_locked(). That is not optimal, but it is
only a fail path.

Signed-off-by: Vladimir Davydov 
Cc: Tejun Heo 
Cc: Li Zefan 
---
 kernel/cgroup.c |   28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8b729c2..1846923 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4296,11 +4296,10 @@ static int online_css(struct cgroup_subsys_state *css)
 
if (ss->css_online)
ret = ss->css_online(css);
-   if (!ret) {
+   if (!ret)
css->flags |= CSS_ONLINE;
-   css->cgroup->nr_css++;
-   rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
-   }
+   css->cgroup->nr_css++;
+   rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
return ret;
 }
 
@@ -4311,10 +4310,7 @@ static void offline_css(struct cgroup_subsys_state *css)
 
lockdep_assert_held(&cgroup_mutex);
 
-   if (!(css->flags & CSS_ONLINE))
-   return;
-
-   if (ss->css_offline)
+   if ((css->flags & CSS_ONLINE) && ss->css_offline)
ss->css_offline(css);
 
css->flags &= ~CSS_ONLINE;
@@ -4437,13 +4433,20 @@ static long cgroup_create(struct cgroup *parent, struct 
dentry *dentry,
/* hold a ref to the parent's dentry */
dget(parent->dentry);
 
+   err = 0;
+
/* creation succeeded, notify subsystems */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
+   int ret;
 
-   err = online_css(css);
-   if (err)
-   goto err_destroy;
+   /* Continue assigning css's to this cgroup on failure so that
+* all css's will be killed by cgroup_destroy_locked(). */
+   ret = online_css(css);
+   if (ret) {
+   err = ret;
+   continue;
+   }
 
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
@@ -4455,6 +4458,9 @@ static long cgroup_create(struct cgroup *parent, struct 
dentry *dentry,
}
}
 
+   if (err)
+   goto err_destroy;
+
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
-- 
1.7.10.4



Re: [PATCH cgroup/for-3.13-fixes] cgroup: fix oops in cgroup init failure path

2013-12-05 Thread Vladimir Davydov
On 12/06/2013 01:18 AM, Tejun Heo wrote:
> Hello, Vladimir.
>
> Thanks a lot for the report and fix; however, I really wanna make sure
> that only online css's become visible, so I wrote up a different fix.
> Can you please test this one?

Hi, Tejun

This patch fixes this bug, but I have a couple of questions regarding it.

First, cgroup_load_subsys() also calls css_online(), and if it fails, it
calls cgroup_unload_subsys() to roll back. The latter function executes
the following command:

offline_css(cgroup_css(cgroup_dummy_top, ss));

But since we failed to online the css, cgroup_css() will return NULL,
resulting in another oops.

Second, it's not clear to me why we need the CSS_ONLINE flag at all if
we never assign css's that we fail to online to a cgroup. AFAIU we will
never see such css's, because in all places we call offline_css(),
namely cgroup_destroy_locked() (via kill_css()) and
cgroup_unload_subsys(), we use cgroup_css() which will return NULL for them.

Third, please see comments inline.

Thanks.

> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4399,13 +4399,13 @@ static long cgroup_create(struct cgroup
>   css = ss->css_alloc(cgroup_css(parent, ss));
>   if (IS_ERR(css)) {
>   err = PTR_ERR(css);
> - goto err_free_all;
> + goto err_deactivate;
>   }
>   css_ar[ss->subsys_id] = css;
>  
>   err = percpu_ref_init(&css->refcnt, css_release);
>   if (err)
> - goto err_free_all;
> + goto err_deactivate;
>  
>   init_css(css, ss, cgrp);
>   }
> @@ -4417,7 +4417,7 @@ static long cgroup_create(struct cgroup
>*/
>   err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
>   if (err < 0)
> - goto err_free_all;
> + goto err_deactivate;
>   lockdep_assert_held(&dentry->d_inode->i_mutex);
>  
>   cgrp->serial_nr = cgroup_serial_nr_next++;
> @@ -4445,6 +4445,9 @@ static long cgroup_create(struct cgroup
>   if (err)
>   goto err_destroy;

Before we get here, we call

/* each css holds a ref to the cgroup's dentry and the parent css */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];

dget(dentry);
css_get(css->parent);
}

If we fail to online a css, we will only call

ss->css_free(css);

on it, skipping css_put() on the parent.

css_put() is called on parent in css_release() on normal destroy path.

>  
> + /* @css successfully attached, now owned by @cgrp */
> + css_ar[ss->subsys_id] = NULL;
> +
>   if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
>   parent->parent) {
>   pr_warning("cgroup: %s (%d) created nested cgroup for 
> controller \"%s\" which has incomplete hierarchy support. Nested cgroups may 
> change behavior in the future.\n",
> @@ -4470,15 +4473,7 @@ static long cgroup_create(struct cgroup
>  
>   return 0;
>  
> -err_free_all:
> - for_each_root_subsys(root, ss) {
> - struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
> -
> - if (css) {
> - percpu_ref_cancel_init(&css->refcnt);
> - ss->css_free(css);
> - }
> - }
> +err_deactivate:
>   mutex_unlock(&cgroup_mutex);
>   /* Release the reference count that we took on the superblock */
>   deactivate_super(sb);
> @@ -4488,12 +4483,21 @@ err_free_name:
>   kfree(rcu_dereference_raw(cgrp->name));
>  err_free_cgrp:
>   kfree(cgrp);
> - return err;
> + goto out_free_css_ar;
>  
>  err_destroy:
>   cgroup_destroy_locked(cgrp);
>   mutex_unlock(&cgroup_mutex);
>   mutex_unlock(&dentry->d_inode->i_mutex);
> +out_free_css_ar:
> + for_each_root_subsys(root, ss) {
> + struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
> +
> + if (css) {
> + percpu_ref_cancel_init(&css->refcnt);
> + ss->css_free(css);
> + }
> + }
>   return err;
>  }
>  
> @@ -4650,10 +4654,14 @@ static int cgroup_destroy_locked(struct
>   /*
>* Initiate massacre of all css's.  cgroup_destroy_css_killed()
>* will be invoked to perform the rest of destruction once the
> -  * percpu refs of all css's are confirmed to be killed.
> +  * percpu refs of all css's are confirmed to be killed.  Not all
> +  * css's may be present if @cgrp failed init half-way.
>*/
> - for_each_root_subsys(cgrp->root, ss)
> - kill_css(cgroup_css(cgrp, ss));
> + for_each_root_subsys(cgrp->root, ss) {
> + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
> + if (css)
> + kill_css(cgroup_css(cgrp, ss));
> + }
>  
>   /*
>* Mark @cgrp dead.  This pr

Re: [PATCH cgroup/for-3.13-fixes] cgroup: fix oops in cgroup init failure path

2013-12-06 Thread Vladimir Davydov
On 12/06/2013 08:13 PM, Tejun Heo wrote:
> Hello, Vladimir.
>
> On Fri, Dec 06, 2013 at 11:02:07AM +0400, Vladimir Davydov wrote:
>> This patch fixes this bug, but I have a couple of questions regarding it.
>>
>> First, cgroup_load_subsys() also calls css_online(), and if it fails, it
>> calls cgroup_unload_subsys() to rollback. The latter function executes
>> the following command:
>>
>> offline_css(cgroup_css(cgroup_dummy_top, ss));
>>
>> But since we failed to online_css(), cgroup_css() will return NULL
>> resulting in another oops.
> I don't think the root css onlining fails for any existing controllers
> but yeah that looks wrong.  Can you please send a patch?

Sure, I will send it soon.

Thanks.


Re: [PATCH cgroup/for-3.13-fixes] cgroup: fix oops in cgroup init failure path

2013-12-06 Thread Vladimir Davydov
On 12/06/2013 08:25 PM, Tejun Heo wrote:
> On Fri, Dec 06, 2013 at 11:13:12AM -0500, Tejun Heo wrote:
>>> Second, it's not clear to me why we need the CSS_ONLINE flag at all if
>>> we never assign css's that we fail to online to a cgroup. AFAIU we will
>>> never see such css's, because in all places we call offline_css(),
>>> namely cgroup_destroy_locked() (via kill_css()) and
>>> cgroup_unload_subsys(), we use cgroup_css() which will return NULL for them.
>> The whole thing is in flux and will look very different in near
>> future.  I actually had patches queued which deal with the issue you
>> spotted but they are being blocked on other changes ATM.  So, yeah,
>> there are some spurious stuff now.
> LOL, I found the patch.  It was posted and acked I just forgot to
> apply the whole series.  I'm a moron.
>
>   http://permalink.gmane.org/gmane.linux.kernel.containers/26804
>
> This should do it, right?  I'll update the patch description and
> repost the series.

If combined with the patch you've sent recently, this should do the trick.

Thanks.


[PATCH] cgroup: fix fail path in cgroup_load_subsys()

2013-12-06 Thread Vladimir Davydov
We should not call cgroup_unload_subsys() if online_css() fails, because
online_css() does not assign the css to the cgroup on failure, while
offline_css(), called from cgroup_unload_subsys(), expects it to be
assigned. So let's do everything needed to roll back inline, without
involving cgroup_unload_subsys().

Signed-off-by: Vladimir Davydov 
Cc: Tejun Heo 
Cc: Li Zefan 
---
 kernel/cgroup.c |   23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8b729c2..3cd7247 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4861,10 +4861,8 @@ int __init_or_module cgroup_load_subsys(struct 
cgroup_subsys *ss)
 */
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
if (IS_ERR(css)) {
-   /* failure case - need to deassign the cgroup_subsys[] slot. */
-   cgroup_subsys[ss->subsys_id] = NULL;
-   mutex_unlock(&cgroup_mutex);
-   return PTR_ERR(css);
+   ret = PTR_ERR(css);
+   goto out_err;
}
 
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
@@ -4873,6 +4871,10 @@ int __init_or_module cgroup_load_subsys(struct 
cgroup_subsys *ss)
/* our new subsystem will be attached to the dummy hierarchy. */
init_css(css, ss, cgroup_dummy_top);
 
+   ret = online_css(css);
+   if (ret)
+   goto free_css;
+
/*
 * Now we need to entangle the css into the existing css_sets. unlike
 * in cgroup_init_subsys, there are now multiple css_sets, so each one
@@ -4896,18 +4898,17 @@ int __init_or_module cgroup_load_subsys(struct 
cgroup_subsys *ss)
}
write_unlock(&css_set_lock);
 
-   ret = online_css(css);
-   if (ret)
-   goto err_unload;
-
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
 
-err_unload:
+free_css:
+   list_del(&ss->sibling);
+   ss->css_free(css);
+out_err:
+   /* failure case - need to deassign the cgroup_subsys[] slot. */
+   cgroup_subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
-   /* @ss can't be mounted here as try_module_get() would fail */
-   cgroup_unload_subsys(ss);
return ret;
 }
 EXPORT_SYMBOL_GPL(cgroup_load_subsys);
-- 
1.7.10.4



[PATCH RESEND 00/11] kmemcg-fixes

2014-01-06 Thread Vladimir Davydov
Hi,

This patch-set fixes several bugs here and there in the implementation
of kmem accounting for memory cgroups and hopefully makes the code look
a bit clearer.

Links to discussion threads that led to this patch-set:
http://www.spinics.net/lists/cgroups/msg09512.html
http://www.spinics.net/lists/cgroups/msg09695.html
http://www.spinics.net/lists/cgroups/msg09796.html

Any comments are highly appreciated.

Thanks.

Vladimir Davydov (11):
  slab: cleanup kmem_cache_create_memcg() error handling
  memcg, slab: kmem_cache_create_memcg(): fix memleak on fail path
  memcg, slab: cleanup memcg cache initialization/destruction
  memcg, slab: fix barrier usage when accessing memcg_caches
  memcg: fix possible NULL deref while traversing memcg_slab_caches
list
  memcg, slab: fix races in per-memcg cache creation/destruction
  memcg: get rid of kmem_cache_dup
  slab: do not panic if we fail to create memcg cache
  memcg, slab: RCU protect memcg_params for root caches
  memcg: remove KMEM_ACCOUNTED_ACTIVATED flag
  memcg: rework memcg_update_kmem_limit synchronization

 include/linux/memcontrol.h |   23 +--
 include/linux/slab.h   |9 +-
 mm/memcontrol.c|  405 +---
 mm/slab.h  |   26 ++-
 mm/slab_common.c   |   90 ++
 5 files changed, 292 insertions(+), 261 deletions(-)

-- 
1.7.10.4



[PATCH RESEND 02/11] memcg, slab: kmem_cache_create_memcg(): fix memleak on fail path

2014-01-06 Thread Vladimir Davydov
We do not free the cache's memcg_params if __kmem_cache_create fails.
Fix this.

Plus, rename memcg_register_cache() to memcg_alloc_cache_params(),
because it actually does not register the cache anywhere, but simply
initializes kmem_cache::memcg_params.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |   14 +-
 mm/memcontrol.c|   11 ---
 mm/slab_common.c   |3 ++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3e7a66..5e6541f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,8 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache);
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+struct kmem_cache *root_cache);
+void memcg_free_cache_params(struct kmem_cache *s);
 void memcg_release_cache(struct kmem_cache *cachep);
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
 
@@ -640,13 +641,16 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return -1;
 }
 
-static inline int
-memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache)
+static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
+   struct kmem_cache *s, struct kmem_cache *root_cache)
 {
return 0;
 }
 
+static inline void memcg_free_cache_params(struct kmem_cache *s)
+{
+}
+
 static inline void memcg_release_cache(struct kmem_cache *cachep)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bf5e894..8c47910 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3195,8 +3195,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
return 0;
 }
 
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache)
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+struct kmem_cache *root_cache)
 {
size_t size;
 
@@ -3224,6 +3224,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, 
struct kmem_cache *s,
return 0;
 }
 
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+   kfree(s->memcg_params);
+}
+
 void memcg_release_cache(struct kmem_cache *s)
 {
struct kmem_cache *root;
@@ -3252,7 +3257,7 @@ void memcg_release_cache(struct kmem_cache *s)
 
css_put(&memcg->css);
 out:
-   kfree(s->memcg_params);
+   memcg_free_cache_params(s);
 }
 
 /*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f70df3e..70f9e24 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -205,7 +205,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
if (!s->name)
goto out_free_cache;
 
-   err = memcg_register_cache(memcg, s, parent_cache);
+   err = memcg_alloc_cache_params(memcg, s, parent_cache);
if (err)
goto out_free_cache;
 
@@ -235,6 +235,7 @@ out_unlock:
return s;
 
 out_free_cache:
+   memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
goto out_unlock;
-- 
1.7.10.4



[PATCH RESEND 03/11] memcg, slab: cleanup memcg cache initialization/destruction

2014-01-06 Thread Vladimir Davydov
Currently, we have rather a messy function set relating to per-memcg
kmem cache initialization/destruction.

Per-memcg caches are created in memcg_create_kmem_cache(). This function
calls kmem_cache_create_memcg() to allocate and initialize a kmem cache
and then "registers" the new cache in the memcg_params::memcg_caches
array of the parent cache.

During its work-flow, kmem_cache_create_memcg() executes the following
memcg-related functions:

 - memcg_alloc_cache_params(), to initialize memcg_params of the newly
   created cache;
 - memcg_cache_list_add(), to add the new cache to the memcg_slab_caches
   list.

On the other hand, kmem_cache_destroy() called on a cache destruction
only calls memcg_release_cache(), which does all the work: it cleans the
reference to the cache in its parent's memcg_params::memcg_caches,
removes the cache from the memcg_slab_caches list, and frees
memcg_params.

Such an inconsistency between the destruction and initialization paths
makes the code difficult to read, so let's clean this up a bit.

This patch moves all the code relating to registration of per-memcg
caches (adding to memcg list, setting the pointer to a cache from its
parent) to the newly created memcg_register_cache() and
memcg_unregister_cache() functions making the initialization and
destruction paths look symmetrical.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |9 +++
 mm/memcontrol.c|   64 +---
 mm/slab_common.c   |5 ++--
 3 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5e6541f..6202406 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 struct kmem_cache *root_cache);
 void memcg_free_cache_params(struct kmem_cache *s);
-void memcg_release_cache(struct kmem_cache *cachep);
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
+void memcg_register_cache(struct kmem_cache *s);
+void memcg_unregister_cache(struct kmem_cache *s);
 
 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
 void memcg_update_array_size(int num_groups);
@@ -651,12 +651,11 @@ static inline void memcg_free_cache_params(struct 
kmem_cache *s);
 {
 }
 
-static inline void memcg_release_cache(struct kmem_cache *cachep)
+static inline void memcg_register_cache(struct kmem_cache *s)
 {
 }
 
-static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
-   struct kmem_cache *s)
+static inline void memcg_unregister_cache(struct kmem_cache *s)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c47910..f8eb994 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3059,16 +3059,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup 
*memcg, u64 size)
css_put(&memcg->css);
 }
 
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
-   if (!memcg)
-   return;
-
-   mutex_lock(&memcg->slab_caches_mutex);
-   list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-   mutex_unlock(&memcg->slab_caches_mutex);
-}
-
 /*
  * helper for acessing a memcg's index. It will be used as an index in the
  * child cache array in kmem_cache, and also to derive its name. This function
@@ -3229,21 +3219,41 @@ void memcg_free_cache_params(struct kmem_cache *s)
kfree(s->memcg_params);
 }
 
-void memcg_release_cache(struct kmem_cache *s)
+void memcg_register_cache(struct kmem_cache *s)
 {
struct kmem_cache *root;
struct mem_cgroup *memcg;
int id;
 
+   if (is_root_cache(s))
+   return;
+
+   root = s->memcg_params->root_cache;
+   memcg = s->memcg_params->memcg;
+   id = memcg_cache_id(memcg);
+
+   css_get(&memcg->css);
+
+   mutex_lock(&memcg->slab_caches_mutex);
+   list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+   mutex_unlock(&memcg->slab_caches_mutex);
+
+   root->memcg_params->memcg_caches[id] = s;
/*
-* This happens, for instance, when a root cache goes away before we
-* add any memcg.
+* the readers won't lock, make sure everybody sees the updated value,
+* so they won't put stuff in the queue again for no reason
 */
-   if (!s->memcg_params)
-   return;
+   wmb();
+}
 
-   if (s->memcg_params->is_root_cache)
-   goto out;
+void memcg_unregister_ca

[PATCH RESEND 09/11] memcg, slab: RCU protect memcg_params for root caches

2014-01-06 Thread Vladimir Davydov
We relocate the root cache's memcg_params whenever we need to grow the
memcg_caches array to accommodate all kmem-active memory cgroups.
Currently on relocation we free the old version immediately, which can
lead to use-after-free, because the memcg_caches array is accessed
lock-free (see cache_from_memcg_idx()). This patch fixes this by making
memcg_params RCU-protected for root caches.
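
Schematically, this is the usual RCU publish/read pattern. The sketch
below is illustrative only (the helper names relocate_memcg_params and
lookup_memcg_cache are made up, and the real reader in
cache_from_memcg_idx() additionally relies on the locking rules
documented in mm/slab.h):

/* writer, serialized by slab_mutex: publish the new array and free the
 * old one only after a grace period */
static int relocate_memcg_params(struct kmem_cache *s, size_t new_size)
{
        struct memcg_cache_params *old = s->memcg_params;
        struct memcg_cache_params *new = kzalloc(new_size, GFP_KERNEL);

        if (!new)
                return -ENOMEM;
        new->is_root_cache = true;
        /* ... copy old->memcg_caches[] into new->memcg_caches[] ... */
        rcu_assign_pointer(s->memcg_params, new);
        if (old)
                kfree_rcu(old, rcu_head);
        return 0;
}

/* lock-free reader: the array it dereferences cannot be freed while the
 * RCU read-side critical section is running */
static struct kmem_cache *lookup_memcg_cache(struct kmem_cache *s, int idx)
{
        struct memcg_cache_params *params;
        struct kmem_cache *cachep = NULL;

        rcu_read_lock();
        params = rcu_dereference(s->memcg_params);
        if (params)
                cachep = params->memcg_caches[idx];
        rcu_read_unlock();
        return cachep;
}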

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 include/linux/slab.h |9 +++--
 mm/memcontrol.c  |   15 ---
 mm/slab.h|   16 +++-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1e2f4fe..a060142 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -513,7 +513,9 @@ static __always_inline void *kmalloc_node(size_t size, 
gfp_t flags, int node)
  *
  * Both the root cache and the child caches will have it. For the root cache,
  * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system.
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
  *
  * Child caches will hold extra metadata needed for its operation. Fields are:
  *
@@ -528,7 +530,10 @@ static __always_inline void *kmalloc_node(size_t size, 
gfp_t flags, int node)
 struct memcg_cache_params {
bool is_root_cache;
union {
-   struct kmem_cache *memcg_caches[0];
+   struct {
+   struct rcu_head rcu_head;
+   struct kmem_cache *memcg_caches[0];
+   };
struct {
struct mem_cgroup *memcg;
struct list_head list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ce25f77..a7521c3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3142,18 +3142,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
 
if (num_groups > memcg_limited_groups_array_size) {
int i;
+   struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
 
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
 
-   s->memcg_params = kzalloc(size, GFP_KERNEL);
-   if (!s->memcg_params) {
-   s->memcg_params = cur_params;
+   new_params = kzalloc(size, GFP_KERNEL);
+   if (!new_params)
return -ENOMEM;
-   }
 
-   s->memcg_params->is_root_cache = true;
+   new_params->is_root_cache = true;
 
/*
 * There is the chance it will be bigger than
@@ -3167,7 +3166,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
-   s->memcg_params->memcg_caches[i] =
+   new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
 
@@ -3180,7 +3179,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
 * bigger than the others. And all updates will reset this
 * anyway.
 */
-   kfree(cur_params);
+   rcu_assign_pointer(s->memcg_params, new_params);
+   if (cur_params)
+   kfree_rcu(cur_params, rcu_head);
}
return 0;
 }
diff --git a/mm/slab.h b/mm/slab.h
index 72d1f9d..8184a7c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,14 +160,28 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
 }
 
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said the caller must assure the memcg's cache won't go away. Since once
+ * created a memcg's cache is destroyed only along with the root cache, it is
+ * true if we are going to allocate from the cache or hold a reference to the
+ * root cache by other means. Otherwise, we should hold either the slab_mutex
+ * or the memcg's slab_caches_mutex while calling this function and accessing
+ * the returned value.
+ */
 static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
struct kmem_cache *cachep;
+   struct memcg_cache_params *params;
 
if (!s->memcg_params)
return NULL;
-   cachep = s-&g

[PATCH RESEND 04/11] memcg, slab: fix barrier usage when accessing memcg_caches

2014-01-06 Thread Vladimir Davydov
Each root kmem_cache has pointers to per-memcg caches stored in its
memcg_params::memcg_caches array. Whenever we want to allocate a slab
for a memcg, we access this array to get per-memcg cache to allocate
from (see memcg_kmem_get_cache()). The access must be lock-free for
performance reasons, so we should use barriers to assert the kmem_cache
is up-to-date.

First, we should place a write barrier immediately before setting the
pointer to it in the memcg_caches array in order to make sure nobody
will see a partially initialized object. Second, we should issue a read
barrier before dereferencing the pointer to conform to the write
barrier.

However, currently the barrier usage looks rather strange. We have a
write barrier *after* setting the pointer and a read barrier *before*
reading the pointer, which is incorrect. This patch fixes this.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   24 ++--
 mm/slab.h   |   12 +++-
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f8eb994..999e7d4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3238,12 +3238,14 @@ void memcg_register_cache(struct kmem_cache *s)
list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
mutex_unlock(&memcg->slab_caches_mutex);
 
-   root->memcg_params->memcg_caches[id] = s;
/*
-* the readers won't lock, make sure everybody sees the updated value,
-* so they won't put stuff in the queue again for no reason
+* Since readers won't lock (see cache_from_memcg_idx()), we need a
+* barrier here to ensure nobody will see the kmem_cache partially
+* initialized.
 */
-   wmb();
+   smp_wmb();
+
+   root->memcg_params->memcg_caches[id] = s;
 }
 
 void memcg_unregister_cache(struct kmem_cache *s)
@@ -3569,7 +3571,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
  gfp_t gfp)
 {
struct mem_cgroup *memcg;
-   int idx;
+   struct kmem_cache *memcg_cachep;
 
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3583,15 +3585,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
 
-   idx = memcg_cache_id(memcg);
-
-   /*
-* barrier to mare sure we're always seeing the up to date value.  The
-* code updating memcg_caches will issue a write barrier to match this.
-*/
-   read_barrier_depends();
-   if (likely(cache_from_memcg_idx(cachep, idx))) {
-   cachep = cache_from_memcg_idx(cachep, idx);
+   memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+   if (likely(memcg_cachep)) {
+   cachep = memcg_cachep;
goto out;
}
 
diff --git a/mm/slab.h b/mm/slab.h
index 0859c42..72d1f9d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,9 +163,19 @@ static inline const char *cache_name(struct kmem_cache *s)
 static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
+   struct kmem_cache *cachep;
+
if (!s->memcg_params)
return NULL;
-   return s->memcg_params->memcg_caches[idx];
+   cachep = s->memcg_params->memcg_caches[idx];
+
+   /*
+* Make sure we will access the up-to-date value. The code updating
+* memcg_caches issues a write barrier to match this (see
+* memcg_register_cache()).
+*/
+   smp_read_barrier_depends();
+   return cachep;
 }
 
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
-- 
1.7.10.4



[PATCH RESEND 06/11] memcg, slab: fix races in per-memcg cache creation/destruction

2014-01-06 Thread Vladimir Davydov
We obtain a per-memcg cache from a root kmem_cache by dereferencing an
entry of the root cache's memcg_params::memcg_caches array. If we find
no cache for a memcg there on allocation, we initiate the memcg cache
creation (see memcg_kmem_get_cache()). The cache creation proceeds
asynchronously in memcg_create_kmem_cache() in order to avoid lock
clashes, so there can be several threads trying to create the same
kmem_cache concurrently, but only one of them may succeed. However, due
to a race in the code, it is not always true. The point is that the
memcg_caches array can be relocated when we activate kmem accounting for
a memcg (see memcg_update_all_caches(), memcg_update_cache_size()). If
memcg_update_cache_size() and memcg_create_kmem_cache() proceed
concurrently as described below, we can leak a kmem_cache.

Assume two threads schedule creation of the same kmem_cache. One of them
successfully creates it. Another one should fail then, but if
memcg_create_kmem_cache() interleaves with memcg_update_cache_size() as
follows, it won't:

  memcg_create_kmem_cache()                    memcg_update_cache_size()
  (called w/o mutexes held)                    (called with slab_mutex,
                                                set_limit_mutex held)
  -------------                                -------------

  mutex_lock(&memcg_cache_mutex)

                                               s->memcg_params=kzalloc(...)

  new_cachep=cache_from_memcg_idx(cachep,idx)
  // new_cachep==NULL => proceed to creation

                                               s->memcg_params->memcg_caches[i]
                                                   =cur_params->memcg_caches[i]

  // kmem_cache_create_memcg takes slab_mutex
  // so we will hang around until
  // memcg_update_cache_size finishes, but
  // nothing will prevent it from succeeding so
  // memcg_caches[idx] will be overwritten in
  // memcg_register_cache!

  new_cachep = kmem_cache_create_memcg(...)
  mutex_unlock(&memcg_cache_mutex)

Let's fix this by moving the check for the existence of the memcg cache
into kmem_cache_create_memcg(), where it runs under the slab_mutex, and
making the function return NULL if the cache already exists.

A similar race is possible when destroying a memcg cache (see
kmem_cache_destroy()). Since memcg_unregister_cache(), which clears the
pointer in the memcg_caches array, is called w/o protection, we can race
with memcg_update_cache_size() and omit clearing the pointer. Therefore
memcg_unregister_cache() should be moved before we release the
slab_mutex.
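
The shape of the slab_common.c side of the fix is roughly the following
(a sketch only - the allocation path and error handling of the real
kmem_cache_create_memcg() are elided):

struct kmem_cache *
kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name,
                        size_t size, size_t align, unsigned long flags,
                        void (*ctor)(void *), struct kmem_cache *parent_cache)
{
        struct kmem_cache *s = NULL;

        get_online_cpus();
        mutex_lock(&slab_mutex);

        /*
         * The existence check is now serialized against
         * memcg_update_cache_size() and memcg_register_cache(), which
         * also run under slab_mutex, so a concurrent creator can no
         * longer slip in between the check and the registration.
         */
        if (memcg && cache_from_memcg_idx(parent_cache,
                                          memcg_cache_id(memcg)))
                goto out_unlock;        /* lost the race - return NULL */

        /* ... allocate the cache, __kmem_cache_create(), register ... */

out_unlock:
        mutex_unlock(&slab_mutex);
        put_online_cpus();
        return s;
}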

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/memcontrol.c  |   23 ++-
 mm/slab_common.c |   14 +-
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d918626..56fc410 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3228,6 +3228,12 @@ void memcg_register_cache(struct kmem_cache *s)
if (is_root_cache(s))
return;
 
+   /*
+* Holding the slab_mutex assures nobody will touch the memcg_caches
+* array while we are modifying it.
+*/
+   lockdep_assert_held(&slab_mutex);
+
root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
id = memcg_cache_id(memcg);
@@ -3247,6 +3253,7 @@ void memcg_register_cache(struct kmem_cache *s)
 * before adding it to the memcg_slab_caches list, otherwise we can
 * fail to convert memcg_params_to_cache() while traversing the list.
 */
+   VM_BUG_ON(root->memcg_params->memcg_caches[id]);
root->memcg_params->memcg_caches[id] = s;
 
mutex_lock(&memcg->slab_caches_mutex);
@@ -3263,6 +3270,12 @@ void memcg_unregister_cache(struct kmem_cache *s)
if (is_root_cache(s))
return;
 
+   /*
+* Holding the slab_mutex assures nobody will touch the memcg_caches
+* array while we are modifying it.
+*/
+   lockdep_assert_held(&slab_mutex);
+
root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
id = memcg_cache_id(memcg);
@@ -3276,6 +3289,7 @@ void memcg_unregister_cache(struct kmem_cache *s)
 * after removing it from the memcg_slab_caches list, otherwise we can
 * fail to convert memcg_params_to_cache() while traversing the list.
 */
+   VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
root->memcg_params->memcg_caches[id] = NULL;
 
css_put(&memcg->css);
@@ -3428,22 +3442,13 @@ static struct kmem_cache 
*memcg_create_kmem_cache(struct mem_cgroup *memcg,
  struct kmem_cache *cachep)
 {
struct kmem_cache *new_cachep;
-   int idx;
 
BUG_ON(!memcg_can_account_kmem(

[PATCH RESEND 08/11] slab: do not panic if we fail to create memcg cache

2014-01-06 Thread Vladimir Davydov
There is no point in flooding logs with warnings or especially crashing
the system if we fail to create a cache for a memcg. In this case we
will be accounting the memcg allocation to the root cgroup until we
succeed in creating its own cache, but it isn't that critical.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/slab_common.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index f34707e..8e40321 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -233,7 +233,14 @@ out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
 
-   if (err) {
+   /*
+* There is no point in flooding logs with warnings or especially
+* crashing the system if we fail to create a cache for a memcg. In
+* this case we will be accounting the memcg allocation to the root
+* cgroup until we succeed to create its own cache, but it isn't that
+* critical.
+*/
+   if (err && !memcg) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n",
name, err);
-- 
1.7.10.4



[PATCH RESEND 05/11] memcg: fix possible NULL deref while traversing memcg_slab_caches list

2014-01-06 Thread Vladimir Davydov
All caches of the same memory cgroup are linked in the memcg_slab_caches
list via kmem_cache::memcg_params::list. This list is traversed, for
example, when we read memory.kmem.slabinfo. Since the list actually
consists of memcg_cache_params objects, we have to convert an element of
the list to a kmem_cache object using memcg_params_to_cache(), which
obtains the pointer to the cache from the memcg_params::memcg_caches
array of the corresponding root cache. Hence the pointer to a kmem_cache
in its parent's memcg_params must be initialized before the cache is
added to the list, and cleared only after it has been unlinked. Currently
the order is the opposite, which can result in a NULL pointer dereference
while traversing the memcg_slab_caches list. This patch restores the
correct order.
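
For reference, the conversion boils down to roughly the following
(simplified sketch of memcg_params_to_cache(), not the exact code):

	static struct kmem_cache *
	memcg_params_to_cache(struct memcg_cache_params *p)
	{
		struct kmem_cache *root = p->root_cache;

		/*
		 * If the entry is already on the memcg_slab_caches list but
		 * the memcg_caches[] slot has not been set yet (or has been
		 * cleared too early), this returns NULL, which the caller
		 * then dereferences.
		 */
		return root->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
	}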

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 999e7d4..d918626 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3234,9 +3234,6 @@ void memcg_register_cache(struct kmem_cache *s)
 
css_get(&memcg->css);
 
-   mutex_lock(&memcg->slab_caches_mutex);
-   list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
-   mutex_unlock(&memcg->slab_caches_mutex);
 
/*
 * Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -3245,7 +3242,16 @@ void memcg_register_cache(struct kmem_cache *s)
 */
smp_wmb();
 
+   /*
+* Initialize the pointer to this cache in its parent's memcg_params
+* before adding it to the memcg_slab_caches list, otherwise we can
+* fail to convert memcg_params_to_cache() while traversing the list.
+*/
root->memcg_params->memcg_caches[id] = s;
+
+   mutex_lock(&memcg->slab_caches_mutex);
+   list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+   mutex_unlock(&memcg->slab_caches_mutex);
 }
 
 void memcg_unregister_cache(struct kmem_cache *s)
@@ -3257,16 +3263,21 @@ void memcg_unregister_cache(struct kmem_cache *s)
if (is_root_cache(s))
return;
 
-   memcg = s->memcg_params->memcg;
-   id  = memcg_cache_id(memcg);
-
root = s->memcg_params->root_cache;
-   root->memcg_params->memcg_caches[id] = NULL;
+   memcg = s->memcg_params->memcg;
+   id = memcg_cache_id(memcg);
 
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
 
+   /*
+* Clear the pointer to this cache in its parent's memcg_params only
+* after removing it from the memcg_slab_caches list, otherwise we can
+* fail to convert memcg_params_to_cache() while traversing the list.
+*/
+   root->memcg_params->memcg_caches[id] = NULL;
+
css_put(&memcg->css);
 }
 
-- 
1.7.10.4



[PATCH RESEND 01/11] slab: cleanup kmem_cache_create_memcg() error handling

2014-01-06 Thread Vladimir Davydov
Currently kmem_cache_create_memcg() backs off on failure inside
conditionals, without using gotos. This results in duplicated rollback
code, which makes the function look cumbersome even though on error we
should only free the allocated cache. Since in the next patch I am going
to add yet another rollback function call on the error path there, let's
employ labels instead of conditionals for undoing any changes on failure
to keep things clean.

Signed-off-by: Vladimir Davydov 
Reviewed-by: Pekka Enberg 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/slab_common.c |   65 ++
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb39..f70df3e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -171,13 +171,14 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
struct kmem_cache *parent_cache)
 {
struct kmem_cache *s = NULL;
-   int err = 0;
+   int err;
 
get_online_cpus();
mutex_lock(&slab_mutex);
 
-   if (!kmem_cache_sanity_check(memcg, name, size) == 0)
-   goto out_locked;
+   err = kmem_cache_sanity_check(memcg, name, size);
+   if (err)
+   goto out_unlock;
 
/*
 * Some allocators will constraint the set of valid flags to a subset
@@ -189,45 +190,38 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
 
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
-   goto out_locked;
+   goto out_unlock;
 
+   err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
-   if (s) {
-   s->object_size = s->size = size;
-   s->align = calculate_alignment(flags, align, size);
-   s->ctor = ctor;
+   if (!s)
+   goto out_unlock;
 
-   if (memcg_register_cache(memcg, s, parent_cache)) {
-   kmem_cache_free(kmem_cache, s);
-   err = -ENOMEM;
-   goto out_locked;
-   }
+   s->object_size = s->size = size;
+   s->align = calculate_alignment(flags, align, size);
+   s->ctor = ctor;
 
-   s->name = kstrdup(name, GFP_KERNEL);
-   if (!s->name) {
-   kmem_cache_free(kmem_cache, s);
-   err = -ENOMEM;
-   goto out_locked;
-   }
+   s->name = kstrdup(name, GFP_KERNEL);
+   if (!s->name)
+   goto out_free_cache;
 
-   err = __kmem_cache_create(s, flags);
-   if (!err) {
-   s->refcount = 1;
-   list_add(&s->list, &slab_caches);
-   memcg_cache_list_add(memcg, s);
-   } else {
-   kfree(s->name);
-   kmem_cache_free(kmem_cache, s);
-   }
-   } else
-   err = -ENOMEM;
+   err = memcg_register_cache(memcg, s, parent_cache);
+   if (err)
+   goto out_free_cache;
+
+   err = __kmem_cache_create(s, flags);
+   if (err)
+   goto out_free_cache;
+
+   s->refcount = 1;
+   list_add(&s->list, &slab_caches);
+   memcg_cache_list_add(memcg, s);
 
-out_locked:
+out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
 
if (err) {
-
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n",
name, err);
@@ -236,11 +230,14 @@ out_locked:
name, err);
dump_stack();
}
-
return NULL;
}
-
return s;
+
+out_free_cache:
+   kfree(s->name);
+   kmem_cache_free(kmem_cache, s);
+   goto out_unlock;
 }
 
 struct kmem_cache *
-- 
1.7.10.4



[PATCH RESEND 10/11] memcg: remove KMEM_ACCOUNTED_ACTIVATED flag

2014-01-06 Thread Vladimir Davydov
Currently we have two state bits in mem_cgroup::kmem_account_flags
regarding kmem accounting activation, ACTIVATED and ACTIVE. We start
kmem accounting only if both flags are set (memcg_can_account_kmem()),
plus throughout the code there are several places where we check only
the ACTIVE flag, but we never check the ACTIVATED flag alone. These
flags are both set from memcg_update_kmem_limit() under the
set_limit_mutex, the ACTIVE flag always being set after ACTIVATED, and
they never get cleared. Hence checking whether both flags are set is
equivalent to checking only the ACTIVE flag, and since the ACTIVATED flag
is never checked on its own, we can safely remove it and nothing will
change.

Let's try to understand what was the reason for introducing these flags.
The purpose of the ACTIVE flag is clear - it states that kmem should be
accounted to the cgroup. The only requirement is that it be set after we
have fully initialized kmem accounting bits for the cgroup and patched
all static branches relating to kmem accounting. Since we always check
whether the static branch is enabled before actually considering whether
we should account (otherwise we wouldn't benefit from static branching),
this guarantees that we won't skip a commit or uncharge after a charge
due to an unpatched static branch.

Now let's move on to the ACTIVATED bit. As I proved in the beginning of
this message, it is absolutely useless, and removing it will change
nothing. So what was the reason for introducing it?

The ACTIVATED flag was introduced by commit a8964b9b ("memcg: use static
branches when code not in use") in order to guarantee that
static_key_slow_inc(&memcg_kmem_enabled_key) would be called only once
for each memory cgroup when its kmem accounting was activated. The point
was that at that time the memcg_update_kmem_limit() function's work-flow
looked like this:

bool must_inc_static_branch = false;

cgroup_lock();
mutex_lock(&set_limit_mutex);
if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
	/* The kmem limit is set for the first time */
	ret = res_counter_set_limit(&memcg->kmem, val);

	memcg_kmem_set_activated(memcg);
	must_inc_static_branch = true;
} else
	ret = res_counter_set_limit(&memcg->kmem, val);
mutex_unlock(&set_limit_mutex);
cgroup_unlock();

if (must_inc_static_branch) {
	/* We can't do this under cgroup_lock */
	static_key_slow_inc(&memcg_kmem_enabled_key);
	memcg_kmem_set_active(memcg);
}

So without the ACTIVATED flag we could race with other threads trying to
set the limit and increment the static branching ref-counter more than
once. Today the whole memcg_update_kmem_limit() function is called under
the set_limit_mutex, so this race is impossible.
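
Schematically, the current flow condenses to something like this (a rough
sketch, not the actual code), where a single ACTIVE flag is clearly
enough:

	mutex_lock(&set_limit_mutex);
	if (!memcg_kmem_is_active(memcg) && val != RESOURCE_MAX) {
		/* the kmem limit is set for the first time */
		static_key_slow_inc(&memcg_kmem_enabled_key);
		memcg_kmem_set_active(memcg);
	}
	ret = res_counter_set_limit(&memcg->kmem, val);
	mutex_unlock(&set_limit_mutex);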

Now that we understand why the ACTIVATED bit was introduced, why we no
longer need it, and that removing it will change nothing anyway, let's
get rid of it.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   28 ++--
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a7521c3..a5a1ae1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -343,15 +343,10 @@ static size_t memcg_size(void)
 
 /* internal only representation about the status of kmem accounting. */
 enum {
-   KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
-   KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+   KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
 };
 
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
-   ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
 #ifdef CONFIG_MEMCG_KMEM
 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
 {
@@ -363,16 +358,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
 
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
-   set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
-   clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
/*
@@ -2959,7 +2944,7 @@ static DEFINE_MUTEX(set_limit_mutex);
 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
 {
return !mem_cgroup_disabled() && !mem_cg

[PATCH RESEND 11/11] memcg: rework memcg_update_kmem_limit synchronization

2014-01-06 Thread Vladimir Davydov
Currently we take both the memcg_create_mutex and the set_limit_mutex
when we enable kmem accounting for a memory cgroup, which makes kmem
activation events serialize with both memcg creations and other memcg
limit updates (memory.limit, memory.memsw.limit). However, there is no
point in such strict synchronization rules there.

First, the set_limit_mutex was introduced to keep the memory.limit and
memory.memsw.limit values in sync. Since memory.kmem.limit can be set
independently of them, it is better to introduce a separate mutex to
synchronize against concurrent kmem limit updates.

Second, we take the memcg_create_mutex in order to make sure all children
of this memcg will be kmem-active as well. To achieve that, however, it
is enough to hold this mutex only while checking memcg_has_children().
This guarantees that if a child is added after we have checked that the
memcg has no children, the newly added cgroup will see its parent
kmem-active (provided the activation succeeded) and activate kmem
accounting for itself.

This patch simplifies the locking rules of memcg_update_kmem_limit()
according to these considerations.
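
The resulting locking scheme looks roughly like this (simplified sketch
of the reworked memcg_update_kmem_limit(), not the literal code):

	static DEFINE_MUTEX(activate_kmem_mutex);

	static int memcg_update_kmem_limit(struct mem_cgroup *memcg, u64 val)
	{
		int ret;

		mutex_lock(&activate_kmem_mutex);
		if (!memcg_kmem_is_active(memcg))
			/*
			 * __memcg_activate_kmem() takes the memcg_create_mutex
			 * only around the memcg_has_children() check.
			 */
			ret = __memcg_activate_kmem(memcg, val);
		else
			ret = res_counter_set_limit(&memcg->kmem, val);
		mutex_unlock(&activate_kmem_mutex);

		return ret;
	}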

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |  198 +--
 1 file changed, 106 insertions(+), 92 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a5a1ae1..696707c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2941,6 +2941,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup 
*memcg,
 static DEFINE_MUTEX(set_limit_mutex);
 
 #ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
 {
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
@@ -3054,34 +3056,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
 }
 
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
-   int num, ret;
-
-   num = ida_simple_get(&kmem_limited_groups,
-   0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
-   if (num < 0)
-   return num;
-
-   ret = memcg_update_all_caches(num+1);
-   if (ret) {
-   ida_simple_remove(&kmem_limited_groups, num);
-   return ret;
-   }
-
-   memcg->kmemcg_id = num;
-   INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-   mutex_init(&memcg->slab_caches_mutex);
-   return 0;
-}
-
 static size_t memcg_caches_array_size(int num_groups)
 {
ssize_t size;
@@ -3424,9 +3398,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache 
*s)
 *
 * Still, we don't want anyone else freeing memcg_caches under our
 * noses, which can happen if a new memcg comes to life. As usual,
-* we'll take the set_limit_mutex to protect ourselves against this.
+* we'll take the activate_kmem_mutex to protect ourselves against
+* this.
 */
-   mutex_lock(&set_limit_mutex);
+   mutex_lock(&activate_kmem_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
@@ -3449,7 +3424,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache 
*s)
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
}
-   mutex_unlock(&set_limit_mutex);
+   mutex_unlock(&activate_kmem_mutex);
 }
 
 struct create_work {
@@ -5116,11 +5091,23 @@ static ssize_t mem_cgroup_read(struct 
cgroup_subsys_state *css,
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
-   int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-   struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+unsigned long long limit)
+{
+   int err = 0;
+   int memcg_id;
+
+   if (memcg_kmem_is_active(memcg))
+   return 0;
+
+   /*
+* We are going to allocate memory for data shared by all memory
+* cgroups so let's stop accounting here.
+*/
+   memcg_stop_kmem_account();
+
/*
 * For simplicity, we won't allow this to be disabled.  It also can't
 * be changed if the cgroup has children already, or if tasks had
@@ -5134,72 +5121,101 @@ stati

[PATCH RESEND 07/11] memcg: get rid of kmem_cache_dup

2014-01-06 Thread Vladimir Davydov
kmem_cache_dup() is only called from memcg_create_kmem_cache(). The
latter, in fact, does nothing besides this, so let's fold
kmem_cache_dup() into memcg_create_kmem_cache().

This patch also makes the memcg_cache_mutex private to
memcg_create_kmem_cache(), because it is not used anywhere else.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   39 ---
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 56fc410..ce25f77 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3391,27 +3391,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
 }
 
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
-struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
 {
struct kmem_cache *new;
static char *tmp_name = NULL;
+   static DEFINE_MUTEX(mutex); /* protects tmp_name */
 
-   lockdep_assert_held(&memcg_cache_mutex);
+   BUG_ON(!memcg_can_account_kmem(memcg));
 
+   mutex_lock(&mutex);
/*
 * kmem_cache_create_memcg duplicates the given name and
 * cgroup_name for this name requires RCU context.
@@ -3434,25 +3423,13 @@ static struct kmem_cache *kmem_cache_dup(struct 
mem_cgroup *memcg,
 
if (new)
new->allocflags |= __GFP_KMEMCG;
+   else
+   new = s;
 
+   mutex_unlock(&mutex);
return new;
 }
 
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
-   struct kmem_cache *new_cachep;
-
-   BUG_ON(!memcg_can_account_kmem(memcg));
-
-   mutex_lock(&memcg_cache_mutex);
-   new_cachep = kmem_cache_dup(memcg, cachep);
-   if (new_cachep == NULL)
-   new_cachep = cachep;
-   mutex_unlock(&memcg_cache_mutex);
-   return new_cachep;
-}
-
 void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
 {
struct kmem_cache *c;
-- 
1.7.10.4



[PATCH RESEND] fs: fix WARN on alloc_super() fail path

2014-01-06 Thread Vladimir Davydov
On the fail path alloc_super() calls destroy_super(), which issues a
warning if the sb's s_mounts list is not empty, in particular if it has
not been initialized. Hence s_mounts must be initialized in alloc_super()
before any possible failure, but currently it is initialized close to the
end of the function, leading to a useless warning dumped to the log if
either percpu_counter_init() or list_lru_init() fails. Let's fix this.

Signed-off-by: Vladimir Davydov 
Cc: Al Viro 
---
 fs/super.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/super.c b/fs/super.c
index e5f6c2c..cecd780 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct 
file_system_type *type, int flags)
if (!s)
return NULL;
 
+   INIT_LIST_HEAD(&s->s_mounts);
+
if (security_sb_alloc(s))
goto fail;
 
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct 
file_system_type *type, int flags)
if (list_lru_init(&s->s_inode_lru))
goto fail;
 
-   INIT_LIST_HEAD(&s->s_mounts);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
/*
-- 
1.7.10.4



Re: [PATCH v14 16/18] vmpressure: in-kernel notifications

2013-12-20 Thread Vladimir Davydov
On 12/20/2013 06:31 PM, Glauber Costa wrote:
>> I have the exact problem described above for a project I'm working on
>> and this solution seems to solve it well.
>>
>> However, I had a few issues while trying to use this interface. I'll
>> comment on them below, but please take this more as advice seeking
>> than patch review.
>>
>>> This patch extends that to also support in-kernel users. Events that
>>> should be generated for in-kernel consumption will be marked as such,
>>> and for those, we will call a registered function instead of triggering
>>> an eventfd notification.
>>>
>>> Please note that due to my lack of understanding of each shrinker user,
>>> I will stay away from converting the actual users, you are all welcome
>>> to do so.
>>>
>>> Signed-off-by: Glauber Costa 
>>> Signed-off-by: Vladimir Davydov 
>>> Acked-by: Anton Vorontsov 
>>> Acked-by: Pekka Enberg 
>>> Reviewed-by: Greg Thelen 
>>> Cc: Dave Chinner 
>>> Cc: John Stultz 
>>> Cc: Andrew Morton 
>>> Cc: Joonsoo Kim 
>>> Cc: Michal Hocko 
>>> Cc: Kamezawa Hiroyuki 
>>> Cc: Johannes Weiner 
>>> ---
>>>  include/linux/vmpressure.h |5 +
>>>  mm/vmpressure.c|   53 
>>> +---
>>>  2 files changed, 55 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
>>> index 3f3788d..9102e53 100644
>>> --- a/include/linux/vmpressure.h
>>> +++ b/include/linux/vmpressure.h
>>> @@ -19,6 +19,9 @@ struct vmpressure {
>>>   /* Have to grab the lock on events traversal or modifications. */
>>>   struct mutex events_lock;
>>>
>>> + /* False if only kernel users want to be notified, true otherwise. */
>>> + bool notify_userspace;
>>> +
>>>   struct work_struct work;
>>>  };
>>>
>>> @@ -38,6 +41,8 @@ extern int vmpressure_register_event(struct 
>>> cgroup_subsys_state *css,
>>>struct cftype *cft,
>>>struct eventfd_ctx *eventfd,
>>>const char *args);
>>> +extern int vmpressure_register_kernel_event(struct cgroup_subsys_state 
>>> *css,
>>> + void (*fn)(void));
>>>  extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
>>>   struct cftype *cft,
>>>   struct eventfd_ctx *eventfd);
>>> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
>>> index e0f6283..730e7c1 100644
>>> --- a/mm/vmpressure.c
>>> +++ b/mm/vmpressure.c
>>> @@ -130,8 +130,12 @@ static enum vmpressure_levels 
>>> vmpressure_calc_level(unsigned long scanned,
>>>  }
>>>
>>>  struct vmpressure_event {
>>> - struct eventfd_ctx *efd;
>>> + union {
>>> + struct eventfd_ctx *efd;
>>> + void (*fn)(void);
>> How does the callback access its private data?
>>
>>> + };
>>>   enum vmpressure_levels level;
>>> + bool kernel_event;
>>>   struct list_head node;
>>>  };
>>>
>>> @@ -147,12 +151,15 @@ static bool vmpressure_event(struct vmpressure *vmpr,
>>>   mutex_lock(&vmpr->events_lock);
>>>
>>>   list_for_each_entry(ev, &vmpr->events, node) {
>>> - if (level >= ev->level) {
>>> + if (ev->kernel_event) {
>>> + ev->fn();
>> I think it would be interesting to pass 'level' to the callback (I'll
>> probably use it myself), but we could wait for a in-tree user before
>> adding it.
>>
>>> + } else if (vmpr->notify_userspace && level >= ev->level) {
>>>   eventfd_signal(ev->efd, 1);
>>>   signalled = true;
>>>   }
>>>   }
>>>
>>> + vmpr->notify_userspace = false;
>>>   mutex_unlock(&vmpr->events_lock);
>>>
>>>   return signalled;
>>> @@ -222,7 +229,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>>>* we account it too.
>>>*/
>>>   if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __

[PATCH 09/11] memcg, slab: RCU protect memcg_params for root caches

2013-12-21 Thread Vladimir Davydov
We relocate a root cache's memcg_params whenever we need to grow the
memcg_caches array to accommodate all kmem-active memory cgroups.
Currently, on relocation we free the old version immediately, which can
lead to a use-after-free, because the memcg_caches array is accessed
lock-free (see cache_from_memcg_idx()). This patch fixes this by making
memcg_params RCU-protected for root caches.
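
The reader side then becomes roughly (simplified sketch of
cache_from_memcg_idx() after this change, not the literal hunk):

	static inline struct kmem_cache *
	cache_from_memcg_idx(struct kmem_cache *s, int idx)
	{
		struct kmem_cache *cachep;
		struct memcg_cache_params *params;

		if (!s->memcg_params)
			return NULL;

		rcu_read_lock();
		params = rcu_dereference(s->memcg_params);
		cachep = params->memcg_caches[idx];
		rcu_read_unlock();

		/*
		 * The caller must guarantee the returned cache won't go away,
		 * e.g. by holding the slab_mutex or a reference to the root
		 * cache (see the comment in mm/slab.h).
		 */
		return cachep;
	}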

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 include/linux/slab.h |9 +++--
 mm/memcontrol.c  |   15 ---
 mm/slab.h|   16 +++-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1e2f4fe..a060142 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -513,7 +513,9 @@ static __always_inline void *kmalloc_node(size_t size, 
gfp_t flags, int node)
  *
  * Both the root cache and the child caches will have it. For the root cache,
  * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system.
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
  *
  * Child caches will hold extra metadata needed for its operation. Fields are:
  *
@@ -528,7 +530,10 @@ static __always_inline void *kmalloc_node(size_t size, 
gfp_t flags, int node)
 struct memcg_cache_params {
bool is_root_cache;
union {
-   struct kmem_cache *memcg_caches[0];
+   struct {
+   struct rcu_head rcu_head;
+   struct kmem_cache *memcg_caches[0];
+   };
struct {
struct mem_cgroup *memcg;
struct list_head list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ce25f77..a7521c3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3142,18 +3142,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
 
if (num_groups > memcg_limited_groups_array_size) {
int i;
+   struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
 
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
 
-   s->memcg_params = kzalloc(size, GFP_KERNEL);
-   if (!s->memcg_params) {
-   s->memcg_params = cur_params;
+   new_params = kzalloc(size, GFP_KERNEL);
+   if (!new_params)
return -ENOMEM;
-   }
 
-   s->memcg_params->is_root_cache = true;
+   new_params->is_root_cache = true;
 
/*
 * There is the chance it will be bigger than
@@ -3167,7 +3166,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
-   s->memcg_params->memcg_caches[i] =
+   new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
 
@@ -3180,7 +3179,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
 * bigger than the others. And all updates will reset this
 * anyway.
 */
-   kfree(cur_params);
+   rcu_assign_pointer(s->memcg_params, new_params);
+   if (cur_params)
+   kfree_rcu(cur_params, rcu_head);
}
return 0;
 }
diff --git a/mm/slab.h b/mm/slab.h
index 72d1f9d..8184a7c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,14 +160,28 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
 }
 
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said the caller must assure the memcg's cache won't go away. Since once
+ * created a memcg's cache is destroyed only along with the root cache, it is
+ * true if we are going to allocate from the cache or hold a reference to the
+ * root cache by other means. Otherwise, we should hold either the slab_mutex
+ * or the memcg's slab_caches_mutex while calling this function and accessing
+ * the returned value.
+ */
 static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
struct kmem_cache *cachep;
+   struct memcg_cache_params *params;
 
if (!s->memcg_params)
return NULL;
cachep = s->

[PATCH 05/11] memcg: fix possible NULL deref while traversing memcg_slab_caches list

2013-12-21 Thread Vladimir Davydov
All caches of the same memory cgroup are linked in the memcg_slab_caches
list via kmem_cache::memcg_params::list. This list is traversed, for
example, when we read memory.kmem.slabinfo. Since the list actually
consists of memcg_cache_params objects, we have to convert an element of
the list to a kmem_cache object using memcg_params_to_cache(), which
obtains the pointer to the cache from the memcg_params::memcg_caches
array of the corresponding root cache. Hence the pointer to a kmem_cache
in its parent's memcg_params must be initialized before the cache is
added to the list, and cleared only after it has been unlinked. Currently
the order is the opposite, which can result in a NULL pointer dereference
while traversing the memcg_slab_caches list. This patch restores the
correct order.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 999e7d4..d918626 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3234,9 +3234,6 @@ void memcg_register_cache(struct kmem_cache *s)
 
css_get(&memcg->css);
 
-   mutex_lock(&memcg->slab_caches_mutex);
-   list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
-   mutex_unlock(&memcg->slab_caches_mutex);
 
/*
 * Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -3245,7 +3242,16 @@ void memcg_register_cache(struct kmem_cache *s)
 */
smp_wmb();
 
+   /*
+* Initialize the pointer to this cache in its parent's memcg_params
+* before adding it to the memcg_slab_caches list, otherwise we can
+* fail to convert memcg_params_to_cache() while traversing the list.
+*/
root->memcg_params->memcg_caches[id] = s;
+
+   mutex_lock(&memcg->slab_caches_mutex);
+   list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+   mutex_unlock(&memcg->slab_caches_mutex);
 }
 
 void memcg_unregister_cache(struct kmem_cache *s)
@@ -3257,16 +3263,21 @@ void memcg_unregister_cache(struct kmem_cache *s)
if (is_root_cache(s))
return;
 
-   memcg = s->memcg_params->memcg;
-   id  = memcg_cache_id(memcg);
-
root = s->memcg_params->root_cache;
-   root->memcg_params->memcg_caches[id] = NULL;
+   memcg = s->memcg_params->memcg;
+   id = memcg_cache_id(memcg);
 
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
 
+   /*
+* Clear the pointer to this cache in its parent's memcg_params only
+* after removing it from the memcg_slab_caches list, otherwise we can
+* fail to convert memcg_params_to_cache() while traversing the list.
+*/
+   root->memcg_params->memcg_caches[id] = NULL;
+
css_put(&memcg->css);
 }
 
-- 
1.7.10.4



[PATCH 03/11] memcg, slab: cleanup memcg cache initialization/destruction

2013-12-21 Thread Vladimir Davydov
Currently, we have a rather messy set of functions relating to per-memcg
kmem cache initialization/destruction.

Per-memcg caches are created in memcg_create_kmem_cache(). This function
calls kmem_cache_create_memcg() to allocate and initialize a kmem cache
and then "registers" the new cache in the memcg_params::memcg_caches
array of the parent cache.

During its work-flow, kmem_cache_create_memcg() executes the following
memcg-related functions:

 - memcg_alloc_cache_params(), to initialize memcg_params of the newly
   created cache;
 - memcg_cache_list_add(), to add the new cache to the memcg_slab_caches
   list.

On the other hand, kmem_cache_destroy() called on a cache destruction
only calls memcg_release_cache(), which does all the work: it cleans the
reference to the cache in its parent's memcg_params::memcg_caches,
removes the cache from the memcg_slab_caches list, and frees
memcg_params.

Such an inconsistency between the destruction and initialization paths
makes the code difficult to read, so let's clean this up a bit.

This patch moves all the code relating to registration of per-memcg
caches (adding to memcg list, setting the pointer to a cache from its
parent) to the newly created memcg_register_cache() and
memcg_unregister_cache() functions making the initialization and
destruction paths look symmetrical.
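
The destruction-side counterpart introduced here looks roughly like this
(a sketch mirroring memcg_register_cache(), not the literal hunk):

	void memcg_unregister_cache(struct kmem_cache *s)
	{
		struct kmem_cache *root;
		struct mem_cgroup *memcg;
		int id;

		if (is_root_cache(s))
			return;

		root = s->memcg_params->root_cache;
		memcg = s->memcg_params->memcg;
		id = memcg_cache_id(memcg);

		/* undo exactly what memcg_register_cache() did */
		root->memcg_params->memcg_caches[id] = NULL;

		mutex_lock(&memcg->slab_caches_mutex);
		list_del(&s->memcg_params->list);
		mutex_unlock(&memcg->slab_caches_mutex);

		css_put(&memcg->css);
	}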

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |9 +++
 mm/memcontrol.c|   64 +---
 mm/slab_common.c   |5 ++--
 3 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5e6541f..6202406 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 struct kmem_cache *root_cache);
 void memcg_free_cache_params(struct kmem_cache *s);
-void memcg_release_cache(struct kmem_cache *cachep);
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
+void memcg_register_cache(struct kmem_cache *s);
+void memcg_unregister_cache(struct kmem_cache *s);
 
 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
 void memcg_update_array_size(int num_groups);
@@ -651,12 +651,11 @@ static inline void memcg_free_cache_params(struct 
kmem_cache *s);
 {
 }
 
-static inline void memcg_release_cache(struct kmem_cache *cachep)
+static inline void memcg_register_cache(struct kmem_cache *s)
 {
 }
 
-static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
-   struct kmem_cache *s)
+static inline void memcg_unregister_cache(struct kmem_cache *s)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c47910..f8eb994 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3059,16 +3059,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup 
*memcg, u64 size)
css_put(&memcg->css);
 }
 
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
-   if (!memcg)
-   return;
-
-   mutex_lock(&memcg->slab_caches_mutex);
-   list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-   mutex_unlock(&memcg->slab_caches_mutex);
-}
-
 /*
  * helper for acessing a memcg's index. It will be used as an index in the
  * child cache array in kmem_cache, and also to derive its name. This function
@@ -3229,21 +3219,41 @@ void memcg_free_cache_params(struct kmem_cache *s)
kfree(s->memcg_params);
 }
 
-void memcg_release_cache(struct kmem_cache *s)
+void memcg_register_cache(struct kmem_cache *s)
 {
struct kmem_cache *root;
struct mem_cgroup *memcg;
int id;
 
+   if (is_root_cache(s))
+   return;
+
+   root = s->memcg_params->root_cache;
+   memcg = s->memcg_params->memcg;
+   id = memcg_cache_id(memcg);
+
+   css_get(&memcg->css);
+
+   mutex_lock(&memcg->slab_caches_mutex);
+   list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+   mutex_unlock(&memcg->slab_caches_mutex);
+
+   root->memcg_params->memcg_caches[id] = s;
/*
-* This happens, for instance, when a root cache goes away before we
-* add any memcg.
+* the readers won't lock, make sure everybody sees the updated value,
+* so they won't put stuff in the queue again for no reason
 */
-   if (!s->memcg_params)
-   return;
+   wmb();
+}
 
-   if (s->memcg_params->is_root_cache)
-   goto out;
+void memcg_unregister_ca

[PATCH 06/11] memcg, slab: fix races in per-memcg cache creation/destruction

2013-12-21 Thread Vladimir Davydov
We obtain a per-memcg cache from a root kmem_cache by dereferencing an
entry of the root cache's memcg_params::memcg_caches array. If we find
no cache for a memcg there on allocation, we initiate the memcg cache
creation (see memcg_kmem_get_cache()). The cache creation proceeds
asynchronously in memcg_create_kmem_cache() in order to avoid lock
clashes, so there can be several threads trying to create the same
kmem_cache concurrently, but only one of them may succeed. However, due
to a race in the code, this is not always the case. The point is that the
memcg_caches array can be relocated when we activate kmem accounting for
a memcg (see memcg_update_all_caches(), memcg_update_cache_size()). If
memcg_update_cache_size() and memcg_create_kmem_cache() proceed
concurrently as described below, we can leak a kmem_cache.

Assume two threads schedule creation of the same kmem_cache. One of them
successfully creates it. The other one should then fail, but if
memcg_create_kmem_cache() interleaves with memcg_update_cache_size() as
follows, it won't:

  memcg_create_kmem_cache()                  memcg_update_cache_size()
  (called w/o mutexes held)                  (called with slab_mutex,
                                              set_limit_mutex held)
  -------------------------                  -------------------------

  mutex_lock(&memcg_cache_mutex)

                                             s->memcg_params = kzalloc(...)

  new_cachep = cache_from_memcg_idx(cachep, idx)
  // new_cachep == NULL => proceed to creation

                                             s->memcg_params->memcg_caches[i] =
                                                 cur_params->memcg_caches[i]

  // kmem_cache_create_memcg takes slab_mutex
  // so we will hang around until
  // memcg_update_cache_size finishes, but
  // nothing will prevent it from succeeding so
  // memcg_caches[idx] will be overwritten in
  // memcg_register_cache!

  new_cachep = kmem_cache_create_memcg(...)
  mutex_unlock(&memcg_cache_mutex)

Let's fix this by moving the check for the existence of the memcg cache
into kmem_cache_create_memcg(), where it is performed under the
slab_mutex, and making it return NULL if the cache already exists.

A similar race is possible when destroying a memcg cache (see
kmem_cache_destroy()). Since memcg_unregister_cache(), which clears the
pointer in the memcg_caches array, is called without any protection, we
can race with memcg_update_cache_size() and end up not clearing the
pointer at all. Therefore memcg_unregister_cache() should be moved so
that it is called before we release the slab_mutex.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/memcontrol.c  |   23 ++-
 mm/slab_common.c |   14 +-
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d918626..56fc410 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3228,6 +3228,12 @@ void memcg_register_cache(struct kmem_cache *s)
if (is_root_cache(s))
return;
 
+   /*
+* Holding the slab_mutex assures nobody will touch the memcg_caches
+* array while we are modifying it.
+*/
+   lockdep_assert_held(&slab_mutex);
+
root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
id = memcg_cache_id(memcg);
@@ -3247,6 +3253,7 @@ void memcg_register_cache(struct kmem_cache *s)
 * before adding it to the memcg_slab_caches list, otherwise we can
 * fail to convert memcg_params_to_cache() while traversing the list.
 */
+   VM_BUG_ON(root->memcg_params->memcg_caches[id]);
root->memcg_params->memcg_caches[id] = s;
 
mutex_lock(&memcg->slab_caches_mutex);
@@ -3263,6 +3270,12 @@ void memcg_unregister_cache(struct kmem_cache *s)
if (is_root_cache(s))
return;
 
+   /*
+* Holding the slab_mutex assures nobody will touch the memcg_caches
+* array while we are modifying it.
+*/
+   lockdep_assert_held(&slab_mutex);
+
root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
id = memcg_cache_id(memcg);
@@ -3276,6 +3289,7 @@ void memcg_unregister_cache(struct kmem_cache *s)
 * after removing it from the memcg_slab_caches list, otherwise we can
 * fail to convert memcg_params_to_cache() while traversing the list.
 */
+   VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
root->memcg_params->memcg_caches[id] = NULL;
 
css_put(&memcg->css);
@@ -3428,22 +3442,13 @@ static struct kmem_cache 
*memcg_create_kmem_cache(struct mem_cgroup *memcg,
  struct kmem_cache *cachep)
 {
struct kmem_cache *new_cachep;
-   int idx;
 
BUG_ON(!memcg_can_account_kmem(

[PATCH 04/11] memcg, slab: fix barrier usage when accessing memcg_caches

2013-12-21 Thread Vladimir Davydov
Each root kmem_cache has pointers to per-memcg caches stored in its
memcg_params::memcg_caches array. Whenever we want to allocate a slab
for a memcg, we access this array to get the per-memcg cache to allocate
from (see memcg_kmem_get_cache()). The access must be lock-free for
performance reasons, so we should use barriers to make sure we always see
an up-to-date kmem_cache.

First, we should place a write barrier immediately before setting the
pointer to it in the memcg_caches array in order to make sure nobody
will see a partially initialized object. Second, we should issue a read
barrier before dereferencing the pointer, to pair with the write barrier.

However, currently the barrier usage looks rather strange. We have a
write barrier *after* setting the pointer and a read barrier *before*
reading the pointer, which is incorrect. This patch fixes this.
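
In short, the ordering rule enforced here is (schematic, not the actual
hunks):

	writer (memcg_register_cache):
		<fully initialize the new kmem_cache>
		smp_wmb();			/* publish only after init is visible */
		memcg_caches[id] = s;

	reader (cache_from_memcg_idx):
		cachep = memcg_caches[idx];
		smp_read_barrier_depends();	/* pairs with the smp_wmb() above */
		<now it is safe to dereference cachep>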

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   24 ++--
 mm/slab.h   |   12 +++-
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f8eb994..999e7d4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3238,12 +3238,14 @@ void memcg_register_cache(struct kmem_cache *s)
list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
mutex_unlock(&memcg->slab_caches_mutex);
 
-   root->memcg_params->memcg_caches[id] = s;
/*
-* the readers won't lock, make sure everybody sees the updated value,
-* so they won't put stuff in the queue again for no reason
+* Since readers won't lock (see cache_from_memcg_idx()), we need a
+* barrier here to ensure nobody will see the kmem_cache partially
+* initialized.
 */
-   wmb();
+   smp_wmb();
+
+   root->memcg_params->memcg_caches[id] = s;
 }
 
 void memcg_unregister_cache(struct kmem_cache *s)
@@ -3569,7 +3571,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
  gfp_t gfp)
 {
struct mem_cgroup *memcg;
-   int idx;
+   struct kmem_cache *memcg_cachep;
 
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3583,15 +3585,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
 
-   idx = memcg_cache_id(memcg);
-
-   /*
-* barrier to mare sure we're always seeing the up to date value.  The
-* code updating memcg_caches will issue a write barrier to match this.
-*/
-   read_barrier_depends();
-   if (likely(cache_from_memcg_idx(cachep, idx))) {
-   cachep = cache_from_memcg_idx(cachep, idx);
+   memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+   if (likely(memcg_cachep)) {
+   cachep = memcg_cachep;
goto out;
}
 
diff --git a/mm/slab.h b/mm/slab.h
index 0859c42..72d1f9d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,9 +163,19 @@ static inline const char *cache_name(struct kmem_cache *s)
 static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
+   struct kmem_cache *cachep;
+
if (!s->memcg_params)
return NULL;
-   return s->memcg_params->memcg_caches[idx];
+   cachep = s->memcg_params->memcg_caches[idx];
+
+   /*
+* Make sure we will access the up-to-date value. The code updating
+* memcg_caches issues a write barrier to match this (see
+* memcg_register_cache()).
+*/
+   smp_read_barrier_depends();
+   return cachep;
 }
 
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
-- 
1.7.10.4



[PATCH 11/11] memcg: rework memcg_update_kmem_limit synchronization

2013-12-21 Thread Vladimir Davydov
Currently we take both the memcg_create_mutex and the set_limit_mutex
when we enable kmem accounting for a memory cgroup, which makes kmem
activation events serialize with both memcg creations and other memcg
limit updates (memory.limit, memory.memsw.limit). However, there is no
point in such strict synchronization rules there.

First, the set_limit_mutex was introduced to keep the memory.limit and
memory.memsw.limit values in sync. Since memory.kmem.limit can be set
independently of them, it is better to introduce a separate mutex to
synchronize against concurrent kmem limit updates.

Second, we take the memcg_create_mutex in order to make sure all children
of this memcg will be kmem-active as well. To achieve that, however, it
is enough to hold this mutex only while checking memcg_has_children().
This guarantees that if a child is added after we have checked that the
memcg has no children, the newly added cgroup will see its parent
kmem-active (provided the activation succeeded) and activate kmem
accounting for itself.

This patch simplifies the locking rules of memcg_update_kmem_limit()
according to these considerations.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |  198 +--
 1 file changed, 106 insertions(+), 92 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a5a1ae1..696707c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2941,6 +2941,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup 
*memcg,
 static DEFINE_MUTEX(set_limit_mutex);
 
 #ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
 {
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
@@ -3054,34 +3056,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
 }
 
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
-   int num, ret;
-
-   num = ida_simple_get(&kmem_limited_groups,
-   0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
-   if (num < 0)
-   return num;
-
-   ret = memcg_update_all_caches(num+1);
-   if (ret) {
-   ida_simple_remove(&kmem_limited_groups, num);
-   return ret;
-   }
-
-   memcg->kmemcg_id = num;
-   INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-   mutex_init(&memcg->slab_caches_mutex);
-   return 0;
-}
-
 static size_t memcg_caches_array_size(int num_groups)
 {
ssize_t size;
@@ -3424,9 +3398,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache 
*s)
 *
 * Still, we don't want anyone else freeing memcg_caches under our
 * noses, which can happen if a new memcg comes to life. As usual,
-* we'll take the set_limit_mutex to protect ourselves against this.
+* we'll take the activate_kmem_mutex to protect ourselves against
+* this.
 */
-   mutex_lock(&set_limit_mutex);
+   mutex_lock(&activate_kmem_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
@@ -3449,7 +3424,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache 
*s)
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
}
-   mutex_unlock(&set_limit_mutex);
+   mutex_unlock(&activate_kmem_mutex);
 }
 
 struct create_work {
@@ -5116,11 +5091,23 @@ static ssize_t mem_cgroup_read(struct 
cgroup_subsys_state *css,
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
-   int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-   struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+unsigned long long limit)
+{
+   int err = 0;
+   int memcg_id;
+
+   if (memcg_kmem_is_active(memcg))
+   return 0;
+
+   /*
+* We are going to allocate memory for data shared by all memory
+* cgroups so let's stop accounting here.
+*/
+   memcg_stop_kmem_account();
+
/*
 * For simplicity, we won't allow this to be disabled.  It also can't
 * be changed if the cgroup has children already, or if tasks had
@@ -5134,72 +5121,101 @@ stati

[PATCH 10/11] memcg: remove KMEM_ACCOUNTED_ACTIVATED flag

2013-12-21 Thread Vladimir Davydov
Currently we have two state bits in mem_cgroup::kmem_account_flags
regarding kmem accounting activation, ACTIVATED and ACTIVE. We start
kmem accounting only if both flags are set (memcg_can_account_kmem()),
plus throughout the code there are several places where we check only
the ACTIVE flag, but we never check the ACTIVATED flag alone. These
flags are both set from memcg_update_kmem_limit() under the
set_limit_mutex, the ACTIVE flag always being set after ACTIVATED, and
they never get cleared. Hence checking whether both flags are set is
equivalent to checking only the ACTIVE flag, and since the ACTIVATED flag
is never checked on its own, we can safely remove it and nothing will
change.

Let's try to understand what was the reason for introducing these flags.
The purpose of the ACTIVE flag is clear - it states that kmem should be
accounted to the cgroup. The only requirement is that it be set after we
have fully initialized kmem accounting bits for the cgroup and patched
all static branches relating to kmem accounting. Since we always check
whether the static branch is enabled before actually considering whether
we should account (otherwise we wouldn't benefit from static branching),
this guarantees that we won't skip a commit or uncharge after a charge
due to an unpatched static branch.

Now let's move on to the ACTIVATED bit. As I proved in the beginning of
this message, it is absolutely useless, and removing it will change
nothing. So what was the reason for introducing it?

The ACTIVATED flag was introduced by commit a8964b9b ("memcg: use static
branches when code not in use") in order to guarantee that
static_key_slow_inc(&memcg_kmem_enabled_key) would be called only once
for each memory cgroup when its kmem accounting was activated. The point
was that at that time the memcg_update_kmem_limit() function's work-flow
looked like this:

bool must_inc_static_branch = false;

cgroup_lock();
mutex_lock(&set_limit_mutex);
if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
	/* The kmem limit is set for the first time */
	ret = res_counter_set_limit(&memcg->kmem, val);

	memcg_kmem_set_activated(memcg);
	must_inc_static_branch = true;
} else
	ret = res_counter_set_limit(&memcg->kmem, val);
mutex_unlock(&set_limit_mutex);
cgroup_unlock();

if (must_inc_static_branch) {
	/* We can't do this under cgroup_lock */
	static_key_slow_inc(&memcg_kmem_enabled_key);
	memcg_kmem_set_active(memcg);
}

So without the ACTIVATED flag we could race with other threads trying to
set the limit and increment the static branching ref-counter more than
once. Today the whole memcg_update_kmem_limit() function is called under
the set_limit_mutex, so this race is impossible.

Now that we understand why the ACTIVATED bit was introduced, why we no
longer need it, and that removing it will change nothing anyway, let's
get rid of it.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   28 ++--
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a7521c3..a5a1ae1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -343,15 +343,10 @@ static size_t memcg_size(void)
 
 /* internal only representation about the status of kmem accounting. */
 enum {
-   KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
-   KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+   KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
 };
 
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
-   ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
 #ifdef CONFIG_MEMCG_KMEM
 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
 {
@@ -363,16 +358,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
 
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
-   set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
-   clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
/*
@@ -2959,7 +2944,7 @@ static DEFINE_MUTEX(set_limit_mutex);
 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
 {
return !mem_cgroup_disabled() && !mem_cg

[PATCH 08/11] slab: do not panic if we fail to create memcg cache

2013-12-21 Thread Vladimir Davydov
There is no point in flooding the logs with warnings, let alone crashing
the system, if we fail to create a cache for a memcg. In this case the
memcg's allocations will be accounted to the root cgroup until we succeed
in creating its own cache, but that isn't critical.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/slab_common.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index f34707e..8e40321 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -233,7 +233,14 @@ out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
 
-   if (err) {
+   /*
+* There is no point in flooding logs with warnings or especially
+* crashing the system if we fail to create a cache for a memcg. In
+* this case we will be accounting the memcg allocation to the root
+* cgroup until we succeed to create its own cache, but it isn't that
+* critical.
+*/
+   if (err && !memcg) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n",
name, err);
-- 
1.7.10.4



[PATCH 02/11] memcg, slab: kmem_cache_create_memcg(): fix memleak on fail path

2013-12-21 Thread Vladimir Davydov
We do not free the cache's memcg_params if __kmem_cache_create fails.
Fix this.

Plus, rename memcg_register_cache() to memcg_alloc_cache_params(),
because it actually does not register the cache anywhere, but simply
initializes kmem_cache::memcg_params.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Pekka Enberg 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |   14 +-
 mm/memcontrol.c|   11 ---
 mm/slab_common.c   |3 ++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3e7a66..5e6541f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,8 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache);
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+struct kmem_cache *root_cache);
+void memcg_free_cache_params(struct kmem_cache *s);
 void memcg_release_cache(struct kmem_cache *cachep);
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
 
@@ -640,13 +641,16 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return -1;
 }
 
-static inline int
-memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache)
+static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
+   struct kmem_cache *s, struct kmem_cache *root_cache)
 {
return 0;
 }
 
+static inline void memcg_free_cache_params(struct kmem_cache *s)
+{
+}
+
 static inline void memcg_release_cache(struct kmem_cache *cachep)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bf5e894..8c47910 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3195,8 +3195,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
return 0;
 }
 
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-struct kmem_cache *root_cache)
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+struct kmem_cache *root_cache)
 {
size_t size;
 
@@ -3224,6 +3224,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, 
struct kmem_cache *s,
return 0;
 }
 
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+   kfree(s->memcg_params);
+}
+
 void memcg_release_cache(struct kmem_cache *s)
 {
struct kmem_cache *root;
@@ -3252,7 +3257,7 @@ void memcg_release_cache(struct kmem_cache *s)
 
css_put(&memcg->css);
 out:
-   kfree(s->memcg_params);
+   memcg_free_cache_params(s);
 }
 
 /*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f70df3e..70f9e24 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -205,7 +205,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
if (!s->name)
goto out_free_cache;
 
-   err = memcg_register_cache(memcg, s, parent_cache);
+   err = memcg_alloc_cache_params(memcg, s, parent_cache);
if (err)
goto out_free_cache;
 
@@ -235,6 +235,7 @@ out_unlock:
return s;
 
 out_free_cache:
+   memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
goto out_unlock;
-- 
1.7.10.4



[PATCH 07/11] memcg: get rid of kmem_cache_dup

2013-12-21 Thread Vladimir Davydov
kmem_cache_dup() is only called from memcg_create_kmem_cache(). The
latter, in fact, does nothing besides this, so let's fold
kmem_cache_dup() into memcg_create_kmem_cache().

This patch also makes the memcg_cache_mutex private to
memcg_create_kmem_cache(), because it is not used anywhere else.

Signed-off-by: Vladimir Davydov 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Balbir Singh 
Cc: KAMEZAWA Hiroyuki 
Cc: Andrew Morton 
---
 mm/memcontrol.c |   39 ---
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 56fc410..ce25f77 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3391,27 +3391,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
 }
 
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
-struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
 {
struct kmem_cache *new;
static char *tmp_name = NULL;
+   static DEFINE_MUTEX(mutex); /* protects tmp_name */
 
-   lockdep_assert_held(&memcg_cache_mutex);
+   BUG_ON(!memcg_can_account_kmem(memcg));
 
+   mutex_lock(&mutex);
/*
 * kmem_cache_create_memcg duplicates the given name and
 * cgroup_name for this name requires RCU context.
@@ -3434,25 +3423,13 @@ static struct kmem_cache *kmem_cache_dup(struct 
mem_cgroup *memcg,
 
if (new)
new->allocflags |= __GFP_KMEMCG;
+   else
+   new = s;
 
+   mutex_unlock(&mutex);
return new;
 }
 
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
-   struct kmem_cache *new_cachep;
-
-   BUG_ON(!memcg_can_account_kmem(memcg));
-
-   mutex_lock(&memcg_cache_mutex);
-   new_cachep = kmem_cache_dup(memcg, cachep);
-   if (new_cachep == NULL)
-   new_cachep = cachep;
-   mutex_unlock(&memcg_cache_mutex);
-   return new_cachep;
-}
-
 void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
 {
struct kmem_cache *c;
-- 
1.7.10.4



[PATCH 00/11] kmemcg fixes

2013-12-21 Thread Vladimir Davydov
This patch-set fixes several bugs here and there in the implementation
of kmem accounting for memory cgroups and makes the code look a bit
clearer.

Vladimir Davydov (11):
  slab: cleanup kmem_cache_create_memcg() error handling
  memcg, slab: kmem_cache_create_memcg(): fix memleak on fail path
  memcg, slab: cleanup memcg cache initialization/destruction
  memcg, slab: fix barrier usage when accessing memcg_caches
  memcg: fix possible NULL deref while traversing memcg_slab_caches list
  memcg, slab: fix races in per-memcg cache creation/destruction
  memcg: get rid of kmem_cache_dup
  slab: do not panic if we fail to create memcg cache
  memcg, slab: RCU protect memcg_params for root caches
  memcg: remove KMEM_ACCOUNTED_ACTIVATED flag
  memcg: rework memcg_update_kmem_limit synchronization

 include/linux/memcontrol.h |   23 +--
 include/linux/slab.h   |9 +-
 mm/memcontrol.c|  405 +---
 mm/slab.h  |   26 ++-
 mm/slab_common.c   |   90 ++
 5 files changed, 292 insertions(+), 261 deletions(-)

-- 
1.7.10.4



[PATCH 01/11] slab: cleanup kmem_cache_create_memcg() error handling

2013-12-21 Thread Vladimir Davydov
Currently kmem_cache_create_memcg() backs off on failure inside
conditionals, without using gotos. This results in duplicated rollback
code, which makes the function look cumbersome even though on error we
should only free the allocated cache. Since in the next patch I am going
to add yet another rollback function call on the error path there, let's
employ labels instead of conditionals for undoing any changes on failure
to keep things clean.
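
For reference, this is the shape of the pattern the patch switches to - a
single forward path with labels that undo work in reverse order (the names
below are generic, not taken from slab_common.c):

	int create_object(void)
	{
		struct object *obj;
		int err;

		err = -ENOMEM;
		obj = alloc_object();
		if (!obj)
			goto out;

		err = setup_object(obj);
		if (err)
			goto out_free;		/* single rollback site */

		register_object(obj);
		return 0;

	out_free:
		free_object(obj);
	out:
		return err;
	}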

Signed-off-by: Vladimir Davydov 
Reviewed-by: Pekka Enberg 
Cc: Michal Hocko 
Cc: Glauber Costa 
Cc: Johannes Weiner 
Cc: Christoph Lameter 
Cc: Andrew Morton 
---
 mm/slab_common.c |   65 ++
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb39..f70df3e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -171,13 +171,14 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
struct kmem_cache *parent_cache)
 {
struct kmem_cache *s = NULL;
-   int err = 0;
+   int err;
 
get_online_cpus();
mutex_lock(&slab_mutex);
 
-   if (!kmem_cache_sanity_check(memcg, name, size) == 0)
-   goto out_locked;
+   err = kmem_cache_sanity_check(memcg, name, size);
+   if (err)
+   goto out_unlock;
 
/*
 * Some allocators will constraint the set of valid flags to a subset
@@ -189,45 +190,38 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const 
char *name, size_t size,
 
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
-   goto out_locked;
+   goto out_unlock;
 
+   err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
-   if (s) {
-   s->object_size = s->size = size;
-   s->align = calculate_alignment(flags, align, size);
-   s->ctor = ctor;
+   if (!s)
+   goto out_unlock;
 
-   if (memcg_register_cache(memcg, s, parent_cache)) {
-   kmem_cache_free(kmem_cache, s);
-   err = -ENOMEM;
-   goto out_locked;
-   }
+   s->object_size = s->size = size;
+   s->align = calculate_alignment(flags, align, size);
+   s->ctor = ctor;
 
-   s->name = kstrdup(name, GFP_KERNEL);
-   if (!s->name) {
-   kmem_cache_free(kmem_cache, s);
-   err = -ENOMEM;
-   goto out_locked;
-   }
+   s->name = kstrdup(name, GFP_KERNEL);
+   if (!s->name)
+   goto out_free_cache;
 
-   err = __kmem_cache_create(s, flags);
-   if (!err) {
-   s->refcount = 1;
-   list_add(&s->list, &slab_caches);
-   memcg_cache_list_add(memcg, s);
-   } else {
-   kfree(s->name);
-   kmem_cache_free(kmem_cache, s);
-   }
-   } else
-   err = -ENOMEM;
+   err = memcg_register_cache(memcg, s, parent_cache);
+   if (err)
+   goto out_free_cache;
+
+   err = __kmem_cache_create(s, flags);
+   if (err)
+   goto out_free_cache;
+
+   s->refcount = 1;
+   list_add(&s->list, &slab_caches);
+   memcg_cache_list_add(memcg, s);
 
-out_locked:
+out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
 
if (err) {
-
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n",
name, err);
@@ -236,11 +230,14 @@ out_locked:
name, err);
dump_stack();
}
-
return NULL;
}
-
return s;
+
+out_free_cache:
+   kfree(s->name);
+   kmem_cache_free(kmem_cache, s);
+   goto out_unlock;
 }
 
 struct kmem_cache *
-- 
1.7.10.4



Re: [RFC] memory cgroup: my thoughts on memsw

2014-09-08 Thread Vladimir Davydov
On Sat, Sep 06, 2014 at 08:15:44AM +0900, Kamezawa Hiroyuki wrote:
> As you noticed, hitting anon+swap limit just means oom-kill.
> My point is that using oom-killer for "server management" just seems crazy.
> 
> Let my clarify things. your proposal was.
>  1. soft-limit will be a main feature for server management.
>  2. Because of soft-limit, global memory reclaim runs.
>  3. Using swap at global memory reclaim can cause poor performance.
>  4. So, making use of OOM-Killer for avoiding swap.
> 
> I can't agree "4". I think
> 
>  - don't configure swap.

Suppose there are two containers, each having soft limit set to 50% of
total system RAM. One of the containers eats 90% of the system RAM by
allocating anonymous pages. Another starts using file caches and wants
more than 10% of RAM to work w/o issuing disk reads. So what should we
do then? We won't be able to shrink the first container to its soft
limit, because there's no swap. Leaving it as is would be unfair from
the second container's point of view. Kill it? But the system as a whole
is doing fine, because the working set of the second container is easily
shrinkable. Besides, there may be some progress in shrinking file caches
from the first container.

>  - use zram

In fact this isn't any different from the previous proposal (working w/o
swap). ZRAM only compresses data while still storing them in RAM, so we
may eventually get into a situation where almost all RAM is full of
compressed anon pages.

>  - use SSD for swap

Such a requirement might be OK in the enterprise, but forcing SMBs to
upgrade their hardware to run a piece of software is a no-go. And again,
an SSD isn't infinite either; we may use it up.

> Or
>  - provide a way to notify usage of "anon+swap" to container management 
> software.
> 
>Now we have "vmpressure". Container management software can kill or 
> respawn container
>with using user-defined policy for avoidng swap.
> 
>If you don't want to run kswapd at all, threshold notifier enhancement may 
> be required.
> 
> /proc/meminfo provides total number of ANON/CACHE pages.
> Many things can be done in userland.

AFAIK OOM-in-userspace handling has been discussed many times, but
there's still no agreement upon it. Basically it isn't reliable, because
it can lead to a deadlock if the userspace handler isn't able to
allocate memory to proceed or gets stuck in some other way. IMO
there must be in-kernel OOM handling as a last resort anyway. And
actually we already have one - we may kill processes when they hit the
memsw limit.

But OK, you don't like OOM on hitting anon+swap limit and propose to
introduce a kind of userspace notification instead, but the problem
actually isn't *WHAT* we should do on hitting anon+swap limit, but *HOW*
we should implement it (or should we implement it at all). No matter
which way we go, in-kernel OOM or userland notifications, we have to
*INTRODUCE ANON+SWAP ACCOUNTING* to achieve that so that on breaching a
predefined threshold we could invoke OOM or issue a userland
notification or both. And here goes the problem: there's anon+file and
anon+file+swap resource counters, but no anon+swap counter. To react on
anon+swap limit breaching, we must introduce one. I propose to *REUSE*
memsw instead by slightly modifying its meaning.

What we would get then is the ability to react to potentially
unreclaimable memory growth inside a container. What we would lose is
the current implementation of the memory+swap limit, *BUT* we would still
be able to limit memory+swap usage by imposing limits on total memory and
anon+swap usage.

> And your idea can't help swap-out caused by memory pressure comes from 
> "zones".

It would help limit swap-out to a sane value.


I'm sorry if I'm not clear or don't understand something that looks
trivial to you.

Thanks,
Vladimir


Re: [RFC] memory cgroup: my thoughts on memsw

2014-09-09 Thread Vladimir Davydov
On Mon, Sep 08, 2014 at 10:53:48PM +0900, Kamezawa Hiroyuki wrote:
> (2014/09/08 20:01), Vladimir Davydov wrote:
> >But OK, you don't like OOM on hitting anon+swap limit and propose to
> >introduce a kind of userspace notification instead, but the problem
> >actually isn't *WHAT* we should do on hitting anon+swap limit, but *HOW*
> >we should implement it (or should we implement it at all).
> 
> 
> I'm not sure you're aware of or not, "hardlimit" counter is too expensive
> for your purpose.
> 
> If I was you, I'll use some lightweight counter like percpu_counter() or
> memcg's event handling system.
> Did you see how threshold notifier or vmpressure works ? It's very light 
> weight.

OK, after looking through the memory thresholds code and pondering the
problem a bit, I tend to agree with you. We can tweak the notifiers to
trigger on anon+swap thresholds, handle them in userspace and do
whatever we like. At least for now, I don't see why this would be worse
than a hard anon+swap limit, except that it requires more steps to
configure. Thank you for your patience while explaining this to me :-)
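
For the record, here is roughly what the userspace side looks like with the
existing (v1) cgroup.event_control interface; the cgroup path and threshold
are made up, and an anon+swap threshold would need the tweaks discussed
above:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <unistd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		int ufd = open("/sys/fs/cgroup/memory/ct1/memory.usage_in_bytes",
			       O_RDONLY);
		int cfd = open("/sys/fs/cgroup/memory/ct1/cgroup.event_control",
			       O_WRONLY);
		char buf[64];
		uint64_t cnt;

		if (efd < 0 || ufd < 0 || cfd < 0)
			return 1;

		/* "<event_fd> <fd of memory.usage_in_bytes> <threshold>" */
		snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 512ULL << 20);
		if (write(cfd, buf, strlen(buf)) < 0)
			return 1;

		/* blocks until the cgroup's usage crosses the threshold */
		if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
			printf("threshold crossed, time to act\n");
		return 0;
	}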

However, there's one thing which made me start this discussion, and it
still bothers me. It's about the memsw.limit_in_bytes knob itself.

First, its value must be greater than or equal to memory.limit_in_bytes.
IMO, such a dependency in the user interface isn't great, but it isn't
the worst thing. What is worse, if one wants to fully make use of soft
limits, the only sensible setting for it is infinity, as I pointed out
earlier.

So, we have a userspace knob that is only suitable for strict
sandboxing, when one wants to hard-limit the amount of memory and swap an
app can use. When it comes to soft limits, you have to set it to
infinity, and it'll still be accounted at the cost of performance, but
without any purpose. It just seems meaningless to me.

Not to mention that the knob itself is kind of confusing IMO. memsw
means memory+swap, so one would mistakenly think memsw.limit-mem.limit
is the limit on swap usage, but that's wrong.

My point is that anon+swap accounting instead of the current
anon+file+swap memsw implementation would be more flexible. We could
still sandbox apps by setting hard anon+swap and memory limits, but it
would also be possible to make use of it in "soft" environments. It
wouldn't be mandatory though. If one doesn't like OOM, he can use
threshold notifications to restart the container when it starts to
behave badly. But if the user just doesn't want to bother with
configuration or is OK with the OOM-killer, he could set a hard anon+swap
limit. Besides, it would untie the mem.limit knob from memsw.limit, which
would make the user interface simpler and cleaner.

So, I think an anon+swap limit would be more flexible than the
file+anon+swap limit we have now. Is there any use case where the
anon+swap and anon+file pair couldn't satisfy the user requirements
while the anon+file+swap and anon+file pair could?

> >No matter which way we go, in-kernel OOM or userland notifications, we have 
> >to
> >*INTRODUCE ANON+SWAP ACCOUNTING* to achieve that so that on breaching a
> >predefined threshold we could invoke OOM or issue a userland
> >notification or both. And here goes the problem: there's anon+file and
> >anon+file+swap resource counters, but no anon+swap counter. To react on
> >anon+swap limit breaching, we must introduce one. I propose to *REUSE*
> >memsw instead by slightly modifying its meaning.
> >
> you can see "anon+swap"  via memcg's accounting.
> 
> >What we would get then is the ability to react on potentially
> >unreclaimable memory growth inside a container. What we would loose is
> >the current implementation of memory+swap limit, *BUT* we would still be
> >able to limit memory+swap usage by imposing limits on total memory and
> >anon+swap usage.
> >
> 
> I repeatedly say anon+swap "hardlimit" just means OOM. That's not buy.

anon+file+swap hardlimit eventually means OOM too :-/

> >>And your idea can't help swap-out caused by memory pressure comes from 
> >>"zones".
> >
> >It would help limit swap-out to a sane value.
> >
> >
> >I'm sorry if I'm not clear or don't understand something that looks
> >trivial to you.
> >
> 
> It seems your purpose is to avoiding system-wide-oom-situation. Right ?

This is the purpose of any hard memory limit, including the current
implementation - avoiding global memory pressure in general and
system-wide OOM in particular.

> Implementing system-wide-oom-kill-avoidance logic in memcg doesn't
> sound good to me. It should work under system-wide memory management logic.
>

Re: [RFC] memory cgroup: my thoughts on memsw

2014-09-10 Thread Vladimir Davydov
On Mon, Sep 08, 2014 at 10:53:48PM +0900, Kamezawa Hiroyuki wrote:
> (2014/09/08 20:01), Vladimir Davydov wrote:
> >On Sat, Sep 06, 2014 at 08:15:44AM +0900, Kamezawa Hiroyuki wrote:
> >>As you noticed, hitting anon+swap limit just means oom-kill.
> >>My point is that using oom-killer for "server management" just seems crazy.
> >>
> >>Let my clarify things. your proposal was.
> >>  1. soft-limit will be a main feature for server management.
> >>  2. Because of soft-limit, global memory reclaim runs.
> >>  3. Using swap at global memory reclaim can cause poor performance.
> >>  4. So, making use of OOM-Killer for avoiding swap.
> >>
> >>I can't agree "4". I think
> >>
> >>  - don't configure swap.
> >
> >Suppose there are two containers, each having soft limit set to 50% of
> >total system RAM. One of the containers eats 90% of the system RAM by
> >allocating anonymous pages. Another starts using file caches and wants
> >more than 10% of RAM to work w/o issuing disk reads. So what should we
> >do then?
> >We won't be able to shrink the first container to its soft
> >limit, because there's no swap. Leaving it as is would be unfair from
> >the second container's point of view. Kill it? But the whole system is
> >going OK, because the working set of the second container is easily
> >shrinkable. Besides there may be some progress in shrinking file caches
> >from the first container.
> >
> >>  - use zram
> >
> >In fact this isn't different from the previous proposal (working w/o
> >swap). ZRAM only compresses data while still storing them in RAM so we
> >eventually may get into a situation where almost all RAM is full of
> >compressed anon pages.
> >
> 
> In above 2 cases, "vmpressure" works fine.

What if a container allocates memory so fast that the userspace thread
handling its threshold notifications won't have time to react before it
eats all memory?

Thanks,
Vladimir


[RFC] memory cgroup: my thoughts on memsw

2014-09-04 Thread Vladimir Davydov
Hi,

Over its long history the memory cgroup has been developed rapidly, but
rather in a disordered manner. As a result, today we have a bunch of
features that are practically unusable and want a redesign (soft limits)
or simply don't work (kmem accounting), not to mention the messy user
interface we have (the _in_bytes suffix is driving me mad :-).

Fortunately, thanks to Tejun's unified cgroup hierarchy, we have a great
chance to drop or redesign some of the old features and their
interfaces. We should use this opportunity to examine every aspect of
the memory cgroup design, because we will probably not be granted such a
present in the future.

That's why I'm starting a series of RFCs with *my thoughts* not only on
kmem accounting, which I've been trying to fix for a while, but also on
other parts of the memory cgroup. I'll be happy if anybody reads this to
the end, but please don't kick me too hard if something looks stupid
to you :-)


Today's topic is (surprisingly!) the memsw resource counter and where it
fails to satisfy user requests.

Let's start from the very beginning. The memory cgroup has basically two
resource counters (not counting kmem, which is unusable anyway):
mem_cgroup->res (configured by memory.limit), which counts the total
amount of user pages charged to the cgroup, and mem_cgroup->memsw
(memory.memsw.limit), which is basically res + the cgroup's swap usage.
Obviously, memsw always has both its value and its limit greater than or
equal to those of res. That said, we have three options:

 - memory.limit=inf, memory.memsw.limit=inf
   No limits, only accounting.

 - memory.limit=Lres accounting
and limiting total user memory (cache+anon) usage for processes inside
cgroups. This is where there's nothing to do. However, mem_cgroup->memsw
should be reworked to account *only* memory that may be swapped out plus
memory that has been swapped out (i.e. swap usage).

This way, by setting memsw.limit (or however it should be called) less
than the memory soft limit, we would solve the problem I described above.
The container would then be allowed to use only file caches above its
memsw.limit, which are usually easily shrinkable, and would get
OOM-killed when trying to eat too much swappable memory.

The configuration will also be less confusing then, IMO:

 - memory.limit - container can't use memory above this
 - memory.memsw.limit - container can't use swappable memory above this

From this it clearly follows that maximal swap usage is limited by
memory.memsw.limit.

One more thought. Anon memory and file caches are different and should
be handled differently, so mixing them both under the same counter looks
strange to me. Moreover, they are *already* handled differently
throughout the kernel - just look at mm/vmscan.c. Here are the
differences between them I see:

 - Anon memory is handled by the user application, while file caches are
   entirely up to the kernel. That means the application will *definitely*
   die w/o anon memory. W/o file caches it can usually survive, but the
   more caches it has the better it feels.

 - Anon memory is not that easy to reclaim. Swap out is a really slow
   process, because data are usually read/written w/o any specific
   order. Dropping file caches is much easier. Typically we have lots of
   clean pages there.

 - Swap space is limited. And today, it's OK to have TBs of RAM and only
   several GBs of swap. Customers simply don't want to waste their disk
   space on that.

IMO, these lead us to the need for limiting swap/swappable memory usage,
but not swap+mem usage.


Now, a bad thing about such a change (if it were ever considered).
There's no way to convert old settings to new, i.e. if we currently have

  mem <= L,
  mem + swap <= S,
  L <= S,

we can set

  mem <= L1,
  swappable_mem <= S1,

where either 

L1 = L, S1 = S

or

L1 = L, S1 = S - L,

but both configurations won't be exactly the same. In the first case
memory+swap usage will be limited by L+S, not by S. In the second case,
although memory+swap


Re: [RFC] memory cgroup: my thoughts on memsw

2014-09-05 Thread Vladimir Davydov
Hi Kamezawa,

Thanks for reading this :-)

On Fri, Sep 05, 2014 at 07:03:57AM +0900, Kamezawa Hiroyuki wrote:
> (2014/09/04 23:30), Vladimir Davydov wrote:
> >  - memory.limit - container can't use memory above this
> >  - memory.memsw.limit - container can't use swappable memory above this
> 
> If one hits anon+swap limit, it just means OOM. Hitting limit means
> process's death.

Basically yes. Hitting memory.limit will result in swap out + cache
reclaim no matter whether it's an anon charge or a page cache one. Hitting
the swappable memory limit (anon+swap) can only occur on an anon charge,
and if that happens we have no choice other than invoking OOM.
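
Purely to illustrate the proposed semantics (made-up helpers and fields,
not existing kernel code):

	/*
	 * A page cache charge only touches the memory counter and can be
	 * satisfied by reclaiming caches or swapping out; an anon charge
	 * additionally hits the swappable (anon+swap) counter, which can
	 * only be relieved by OOM once its limit is reached.
	 */
	static int charge_proposed(struct memcg_sketch *memcg, bool is_anon)
	{
		if (try_charge(&memcg->memory))		/* may reclaim/swap */
			return -ENOMEM;

		if (is_anon && try_charge(&memcg->swappable)) {
			uncharge(&memcg->memory);
			return -ENOMEM;			/* leads to OOM */
		}
		return 0;
	}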

Frankly, I don't see anything wrong in such a behavior. Why is it worse
than the current behavior where we also kill processes if a cgroup
reaches memsw.limit and we can't reclaim page caches?

I admit I may be missing something. So I'd appreciate it if you could
provide me with a use case where we want *only* the current behavior and
my proposal is a no-go.

> Is it useful ?

I think so, at least, if we want to use soft limits. The point is we
will have to kill a process if it eats too much anon memory *anyway*
when it comes to global memory pressure, but before finishing it we'll
be torturing the culprit as well as *innocent* processes by issuing
massive reclaim, as I tried to point out in the example above. IMO, this
is no good.

Besides, I believe such a distinction between swappable memory and
caches would look more natural to users. Everyone is used to it,
actually. For example, when an admin or user or any userspace utility
looks at the output of free(1), it primarily pays attention to free
memory "-/+ buffers/caches", because almost all memory is usually full
of file caches. And they know that caches easy come, easy go. IMO, for
them it'd be more useful to limit this to avoid nasty surprises in the
future, and only set some hints for page cache reclaim.

The only exception is strict sandboxing, but AFAIU we can sandbox apps
perfectly well with this too, because we would still have a strict
memory limit and a limit on maximal swap usage.

I'm sorry if the idea looks totally stupid to you (maybe it is!),
but let's just try to consider every possibility we have in mind.

Thanks,
Vladimir


Re: [patch] mm: memcontrol: revert use of root_mem_cgroup res_counter

2014-09-05 Thread Vladimir Davydov
On Fri, Sep 05, 2014 at 08:43:57AM -0400, Johannes Weiner wrote:
> Dave Hansen reports a massive scalability regression in an uncontained
> page fault benchmark with more than 30 concurrent threads, which he
> bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
> res_counter") and pin-pointed on res_counter spinlock contention.
> 
> That change relied on the per-cpu charge caches to mostly swallow the
> res_counter costs, but it's apparent that the caches don't scale yet.
> 
> Revert memcg back to bypassing res_counters on the root level in order
> to restore performance for uncontained workloads.
> 
> Reported-by: Dave Hansen 
> Signed-off-by: Johannes Weiner 
> Tested-by: Dave Hansen 
> Acked-by: Michal Hocko 

It's a pity we have to revert this nice cleanup, but it seems we can't
do anything better right now. FWIW,

Reviewed-by: Vladimir Davydov 


Re: [RFC] memory cgroup: my thoughts on memsw

2014-09-05 Thread Vladimir Davydov
On Fri, Sep 05, 2014 at 11:20:43PM +0900, Kamezawa Hiroyuki wrote:
> Basically, I don't like OOM Kill. Anyone don't like it, I think.
> 
> In recent container use, application may be build as "stateless" and
> kill-and-respawn may not be problematic, but I think killing "a" process
> by oom-kill is too naive.
> 
> If your proposal is triggering notification to user space at hitting
> anon+swap limit, it may be useful.
> ...Some container-cluster management software can handle it.
> For example, container may be restarted.
> 
> Memcg has threshold notifier and vmpressure notifier.
> I think you can enhance it.
[...]
> My point is that "killing a process" tend not to be able to fix the situation.
> For example, fork-bomb by "make -j" cannot be handled by it.
> 
> So, I don't want to think about enhancing OOM-Kill. Please think of better
> way to survive. With the help of countainer-management-softwares, I think
> we can have several choices.
> 
> Restart contantainer (killall) may be the best if container app is stateless.
> Or container-management can provide some failover.

The problem I'm trying to set out is not about OOM actually (sorry if
the way I explain is confusing). We could probably configure OOM to kill
a whole cgroup (not just a process) and/or improve user-notification so
that the userspace could react somehow. I'm sure it must and will be
discussed one day.

The problem is that *before* invoking OOM on *global* pressure we try
to reclaim the containers' memory, and if there's progress we won't
invoke OOM. This can result in a huge slowdown of the whole system (due
to swap out).

And if we want to fully make use of soft limits, we currently have no
means to limit anon memory at all. It's just impossible, because
memsw.limit must be > soft limit, otherwise it makes no sense. So we
will be trying to swap out under global pressure until we finally
realize there's no point in it and call OOM. If we don't, we'll be
suffering until the load goes away by itself.

> The 1st reason we added memsw.limit was for avoiding that the whole swap
> is used up by a cgroup where memory-leak of forkbomb running and not for
> some intellegent controls.
> 
> From your opinion, I feel what you want is avoiding charging against 
> page-caches.
> But thiking docker at el, page-cache is not shared between containers any 
> more.
> I think "including cache" makes sense.

Not exactly. It's not about sharing caches among containers. The point
is (1) it's difficult to estimate the size of file caches that will max
out the performance of a container, and (2) a typical workload will
perform better and put less pressure on disk if it has more caches.

Now imagine a big host running a small number of containers and
therefore having a lot of free memory most of the time, but still
experiencing load spikes once an hour/day/whatever when memory usage
rises drastically. It'd be unwise to set hard limits for those
containers that are running regularly, because they'd probably perform
much better if they had more file caches. So the admin decides to use
soft limits instead. He is forced to set memsw.limit > the soft limit,
but this is unsafe, because the container may then eat anon memory up to
memsw.limit, and anon memory isn't easy to get rid of when it comes
to global pressure. If the admin had a means to limit swappable
memory, he could avoid this. This is what I was trying to illustrate by
the example in the first e-mail of this thread.

Note that if there were no soft limits, the current setup would be just
fine; otherwise it fails. And soft limits have proved to be useful AFAIK.

Thanks,
Vladimir


[PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-10 Thread Vladimir Davydov
fallback_alloc is called on kmalloc if the preferred node doesn't have
free or partial slabs and there are no pages on the node's free list
(GFP_THISNODE allocations fail). Before invoking the reclaimer it tries
to locate a free or partial slab on other allowed nodes' lists. While
iterating over the preferred node's zonelist it skips those zones which
cpuset_zone_allowed_hardwall returns false for. That means that for a
task bound to a specific node using cpusets, fallback_alloc will always
ignore free slabs on other nodes and go directly to the reclaimer,
which, however, may allocate from other nodes if cpuset.mem_hardwall is
unset (default). As a result, lists of free slabs may grow without
bound on other nodes, which is bad, because inactive slabs are
only evicted by cache_reap at a very slow rate and cannot be dropped
forcefully.

To reproduce the issue, run a process that will walk over a directory
tree with lots of files inside a cpuset bound to a node that constantly
experiences memory pressure. Look at num_slabs vs active_slabs growth as
reported by /proc/slabinfo.

We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it
can sleep, we only call it on __GFP_WAIT allocations. For atomic
allocations we simply ignore cpusets, which is in agreement with the
cpuset documentation (see the comment to __cpuset_node_allowed_softwall).

Signed-off-by: Vladimir Davydov 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: David Rientjes 
Cc: Joonsoo Kim 
---
 mm/slab.c |   23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 2e60bf3dedbb..1d77a4df7ee1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3049,14 +3049,23 @@ retry:
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
nid = zone_to_nid(zone);
 
-   if (cpuset_zone_allowed_hardwall(zone, flags) &&
-   get_node(cache, nid) &&
-   get_node(cache, nid)->free_objects) {
-   obj = cache_alloc_node(cache,
-   flags | GFP_THISNODE, nid);
-   if (obj)
-   break;
+   if (!get_node(cache, nid) ||
+   !get_node(cache, nid)->free_objects)
+   continue;
+
+   if (local_flags & __GFP_WAIT) {
+   bool allowed;
+
+   local_irq_enable();
+   allowed = cpuset_zone_allowed_softwall(zone, flags);
+   local_irq_disable();
+   if (!allowed)
+   continue;
}
+
+   obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid);
+   if (obj)
+   break;
}
 
if (!obj) {
-- 
1.7.10.4



Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-11 Thread Vladimir Davydov
On Sun, Aug 10, 2014 at 03:43:21PM -0700, David Rientjes wrote:
> On Sun, 10 Aug 2014, Vladimir Davydov wrote:
> 
> > fallback_alloc is called on kmalloc if the preferred node doesn't have
> > free or partial slabs and there's no pages on the node's free list
> > (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries
> > to locate a free or partial slab on other allowed nodes' lists. While
> > iterating over the preferred node's zonelist it skips those zones which
> > cpuset_zone_allowed_hardwall returns false for. That means that for a
> > task bound to a specific node using cpusets fallback_alloc will always
> > ignore free slabs on other nodes and go directly to the reclaimer,
> > which, however, may allocate from other nodes if cpuset.mem_hardwall is
> > unset (default). As a result, we may get lists of free slabs grow
> > without bounds on other nodes, which is bad, because inactive slabs are
> > only evicted by cache_reap at a very slow rate and cannot be dropped
> > forcefully.
> > 
> > To reproduce the issue, run a process that will walk over a directory
> > tree with lots of files inside a cpuset bound to a node that constantly
> > experiences memory pressure. Look at num_slabs vs active_slabs growth as
> > reported by /proc/slabinfo.
> > 
> > We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it
> > can sleep, we only call it on __GFP_WAIT allocations. For atomic
> > allocations we simply ignore cpusets, which is in agreement with the
> > cpuset documenation (see the comment to __cpuset_node_allowed_softwall).
> > 
> 
> If that rule were ever changed, nobody would think to modify the 
> fallback_alloc() behavior in the slab allocator.  Why can't 
> cpuset_zone_allowed_hardwall() just return 1 for !__GFP_WAIT?
> 
> I don't think this issue is restricted only to slab, it's for all callers 
> of cpuset_zone_allowed_softwall() that could possibly be atomic.  I think 
> it would be better to determine if cpuset_zone_allowed() should be 
> hardwall or softwall depending on the gfp flags.
> 
> Let's add Li, the cpuset maintainer.  Any reason we can't do this?
> ---
[...]
> diff --git a/mm/slab.c b/mm/slab.c
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -3047,16 +3047,19 @@ retry:
>* from existing per node queues.
>*/
>   for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> - nid = zone_to_nid(zone);
> + struct kmem_cache_node *n;
>  
> - if (cpuset_zone_allowed_hardwall(zone, flags) &&
> - get_node(cache, nid) &&
> - get_node(cache, nid)->free_objects) {
> - obj = cache_alloc_node(cache,
> - flags | GFP_THISNODE, nid);
> - if (obj)
> - break;
> - }
> + nid = zone_to_nid(zone);
> + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL))

We must use the softwall check here, otherwise we will proceed to
alloc_pages even if there are lots of free slabs on other nodes.
alloc_pages, in turn, may allocate from other nodes in case
cpuset.mem_hardwall=0, because it uses the softwall check, so it may add
yet another free slab to another node's list even if it isn't empty. As a
result, we may get free list bloating on other nodes. I've seen a
machine with one of its nodes almost completely filled with inactive
slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So,
this is a bug that must be fixed.

Note, for SLUB, using the hardwall check in get_any_partial won't lead to
such a problem, because once added, a new slab is loaded onto a per-cpu
list, forcing any further users to allocate from it. Strictly speaking, we
should use the softwall check there too, though.

> + continue;
> + n = get_node(cache, nid);
> + if (!n)
> + continue;
> + if (!n->free_objects)
> + continue;
> + obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid);
> + if (obj)
> + break;
>   }
>  
>   if (!obj) {
> diff --git a/mm/slub.c b/mm/slub.c
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1671,20 +1671,22 @@ static void *get_any_partial(struct kmem_cache *s, 
> gfp_t flags,
>   struct kmem_cache_node *n;
>  
>   n = get_node(s, zone_to_nid(zone));
> + if (!n)
> + continue;
> + if (!cpuset_zo

Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-11 Thread Vladimir Davydov
On Mon, Aug 11, 2014 at 04:37:15AM -0700, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
> 
> > > diff --git a/mm/slab.c b/mm/slab.c
> > > --- a/mm/slab.c
> > > +++ b/mm/slab.c
> > > @@ -3047,16 +3047,19 @@ retry:
> > >* from existing per node queues.
> > >*/
> > >   for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> > > - nid = zone_to_nid(zone);
> > > + struct kmem_cache_node *n;
> > >  
> > > - if (cpuset_zone_allowed_hardwall(zone, flags) &&
> > > - get_node(cache, nid) &&
> > > - get_node(cache, nid)->free_objects) {
> > > - obj = cache_alloc_node(cache,
> > > - flags | GFP_THISNODE, nid);
> > > - if (obj)
> > > - break;
> > > - }
> > > + nid = zone_to_nid(zone);
> > > + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL))
> > 
> > We must use softwall check here, otherwise we will proceed to
> > alloc_pages even if there are lots of free slabs on other nodes.
> > alloc_pages, in turn, may allocate from other nodes in case
> > cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet
> > another free slab to another node's list even if it isn't empty. As a
> > result, we may get free list bloating on other nodes. I've seen a
> > machine with one of its nodes almost completely filled with inactive
> > slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So,
> > this is a bug that must be fixed.
> > 
> 
> Right, I understand, and my patch makes no attempt to fix that issue, it's 
> simply collapsing the code down into a single cpuset_zone_allowed() 
> function and the context for the allocation is controlled by the gfp 
> flags (and hardwall is controlled by setting __GFP_HARDWALL) as it should 
> be.  I understand the issue you face, but I can't combine a cleanup with a 
> fix and I would prefer to have your patch keep your commit description.  

Sorry, I misunderstood you.

> The diffstat for my proposal removes many more lines than it adds and I 
> think it will avoid this type of issue in the future for new callers.  
> Your patch could then be based on the single cpuset_zone_allowed() 
> function where you would simply have to remove the __GFP_HARDWALL above.  
> Or, your patch could be merged first and then my cleanup on top, but it 
> seems like your one-liner would be more clear if it is based on mine.

Having one function instead of two doing a similar thing is usually
better IMO, but AFAIU your patch isn't a mere cleanup - it also slightly
changes the logic behind the !__GFP_WAIT vs cpusets interaction:

> @@ -2505,18 +2501,22 @@ static struct cpuset 
> *nearest_hardwall_ancestor(struct cpuset *cs)
>   *   GFP_USER - only nodes in current tasks mems allowed ok.
>   *
>   * Rule:
> - *Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
> + *Don't call __cpuset_node_allowed if you can't sleep, unless you
>   *pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
>   *the code that might scan up ancestor cpusets and sleep.
>   */
> -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
> +int __cpuset_node_allowed(int node, const gfp_t gfp_mask)
>  {
>   struct cpuset *cs;  /* current cpuset ancestors */
>   int allowed;/* is allocation in zone z allowed? */
>  
> - if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
> + if (in_interrupt())
>   return 1;
>   might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
> + if (gfp_mask & __GFP_THISNODE)
> + return 1;
> + if (!(gfp_mask & __GFP_WAIT))
> + return 1;

This means cpuset_zone_allowed will now always return true for
!__GFP_WAIT allocations.

>   if (node_isset(node, current->mems_allowed))
>   return 1;
>   /*
[...]
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1963,7 +1963,7 @@ zonelist_scan:
>  
>   /*
>* Scan zonelist, looking for a zone with enough free.
> -  * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
> +  * See __cpuset_node_allowed() comment in kernel/cpuset.c.
>*/
>   for_each_zone_zonelist_nodemask(zone, z, zonelist,
>   high_zoneidx, nodemask) {
> @@ -1974,7 +1974,

Re: [PATCH -mm] memcg: remove activate_kmem_mutex

2014-10-20 Thread Vladimir Davydov
On Mon, Oct 20, 2014 at 08:53:06PM +0200, Michal Hocko wrote:
> On Mon 20-10-14 19:11:29, Vladimir Davydov wrote:
> > The activate_kmem_mutex is used to serialize memcg.kmem.limit updates,
> > but we already serialize them with memcg_limit_mutex so let's remove the
> > former.
> > 
> > Signed-off-by: Vladimir Davydov 
> 
> Is this the case since bd67314586a3 (memcg, slab: simplify
> synchronization scheme)?

No, it's since Johannes' lockless page counters patch, where the
memcg_limit_mutex was introduced to synchronize concurrent limit updates
(mm commit dc1815408849 "mm: memcontrol: lockless page counters").

Thanks,
Vladimir

> Anyway Looks good to me.
> Acked-by: Michal Hocko 


Re: [patch 1/4] mm: memcontrol: uncharge pages on swapout

2014-10-21 Thread Vladimir Davydov
On Mon, Oct 20, 2014 at 11:22:09AM -0400, Johannes Weiner wrote:
> mem_cgroup_swapout() is called with exclusive access to the page at
> the end of the page's lifetime.  Instead of clearing the PCG_MEMSW
> flag and deferring the uncharge, just do it right away.  This allows
> follow-up patches to simplify the uncharge code.
> 
> Signed-off-by: Johannes Weiner 
> ---
>  mm/memcontrol.c | 17 +
>  1 file changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index bea3fddb3372..7709f17347f3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5799,6 +5799,7 @@ static void __init enable_swap_cgroup(void)
>   */
>  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
>  {
> + struct mem_cgroup *memcg;
>   struct page_cgroup *pc;
>   unsigned short oldid;
>  
> @@ -5815,13 +5816,21 @@ void mem_cgroup_swapout(struct page *page, 
> swp_entry_t entry)
>   return;
>  
>   VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
> + memcg = pc->mem_cgroup;
>  
> - oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
> + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
>   VM_BUG_ON_PAGE(oldid, page);
> + mem_cgroup_swap_statistics(memcg, true);
>  
> - pc->flags &= ~PCG_MEMSW;
> - css_get(&pc->mem_cgroup->css);
> - mem_cgroup_swap_statistics(pc->mem_cgroup, true);
> + pc->flags = 0;
> +
> + if (!mem_cgroup_is_root(memcg))
> + page_counter_uncharge(&memcg->memory, 1);

AFAIU it removes the batched uncharge of swapped-out pages, doesn't it?
Will it affect performance?

Besides, it looks asymmetric with respect to the page cache uncharge
path, where we still defer uncharge to mem_cgroup_uncharge_list(), and I
personally rather dislike this asymmetry.

> +
> + local_irq_disable();
> + mem_cgroup_charge_statistics(memcg, page, -1);
> + memcg_check_events(memcg, page);
> + local_irq_enable();

AFAICT mem_cgroup_swapout() is called under mapping->tree_lock with irqs
disabled, so we should use irq_save/restore here.

Thanks,
Vladimir

>  }
>  
>  /**
> -- 
> 2.1.2
> 


[PATCH] memcg: remove mem_cgroup_reclaimable check from soft reclaim

2014-10-21 Thread Vladimir Davydov
mem_cgroup_reclaimable() checks whether a cgroup has reclaimable pages
on *any* NUMA node. However, the only place where it's called is
mem_cgroup_soft_reclaim(), which tries to reclaim memory from a
*specific* zone. So the way how it's used is incorrect - it will return
true even if the cgroup doesn't have pages on the zone we're scanning.

I think we can get rid of this check completely, because
mem_cgroup_shrink_node_zone(), which is called by
mem_cgroup_soft_reclaim() if mem_cgroup_reclaimable() returns true, is
equivalent to shrink_lruvec(), which exits almost immediately if the
lruvec passed to it is empty. So there's no need to optimize anything
here. Besides, we don't have such a check in the general scan path
(shrink_zone) either.

Signed-off-by: Vladimir Davydov 
---
 mm/memcontrol.c |   43 ---
 1 file changed, 43 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53393e27ff03..833b6a696aab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1799,52 +1799,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup 
*memcg)
memcg->last_scanned_node = node;
return node;
 }
-
-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-   int nid;
-
-   /*
-* quick check...making use of scan_node.
-* We can skip unused nodes.
-*/
-   if (!nodes_empty(memcg->scan_nodes)) {
-   for (nid = first_node(memcg->scan_nodes);
-nid < MAX_NUMNODES;
-nid = next_node(nid, memcg->scan_nodes)) {
-
-   if (test_mem_cgroup_node_reclaimable(memcg, nid, 
noswap))
-   return true;
-   }
-   }
-   /*
-* Check rest of nodes.
-*/
-   for_each_node_state(nid, N_MEMORY) {
-   if (node_isset(nid, memcg->scan_nodes))
-   continue;
-   if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-   return true;
-   }
-   return false;
-}
-
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
return 0;
 }
-
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-   return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
 #endif
 
 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
@@ -1888,8 +1847,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup 
*root_memcg,
}
continue;
}
-   if (!mem_cgroup_reclaimable(victim, false))
-   continue;
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
 zone, &nr_scanned);
*total_scanned += nr_scanned;
-- 
1.7.10.4



Re: [PATCH] memcg: remove mem_cgroup_reclaimable check from soft reclaim

2014-10-21 Thread Vladimir Davydov
On Tue, Oct 21, 2014 at 02:22:39PM -0400, Johannes Weiner wrote:
> On Tue, Oct 21, 2014 at 05:15:50PM +0400, Vladimir Davydov wrote:
> > mem_cgroup_reclaimable() checks whether a cgroup has reclaimable pages
> > on *any* NUMA node. However, the only place where it's called is
> > mem_cgroup_soft_reclaim(), which tries to reclaim memory from a
> > *specific* zone. So the way how it's used is incorrect - it will return
> > true even if the cgroup doesn't have pages on the zone we're scanning.
> > 
> > I think we can get rid of this check completely, because
> > mem_cgroup_shrink_node_zone(), which is called by
> > mem_cgroup_soft_reclaim() if mem_cgroup_reclaimable() returns true, is
> > equivalent to shrink_lruvec(), which exits almost immediately if the
> > lruvec passed to it is empty. So there's no need to optimize anything
> > here. Besides, we don't have such a check in the general scan path
> > (shrink_zone) either.
> > 
> > Signed-off-by: Vladimir Davydov 
> 
> Acked-by: Johannes Weiner 
> 
> How about this on top?
> 
> ---
> 
> From 27bd24b00433d9f6c8d60ba2b13dbff158b06c13 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner 
> Date: Tue, 21 Oct 2014 09:53:54 -0400
> Subject: [patch] mm: memcontrol: do not filter reclaimable nodes in NUMA
>  round-robin
> 
> The round-robin node reclaim currently tries to include only nodes
> that have memory of the memcg in question, which is quite elaborate.
> 
> Just use plain round-robin over the nodes that are allowed by the
> task's cpuset, which are the most likely to contain that memcg's
> memory.  But even if zones without memcg memory are encountered,
> direct reclaim will skip over them without too much hassle.
> 
> Signed-off-by: Johannes Weiner 

Totally agree.

Acked-by: Vladimir Davydov 

> ---
>  mm/memcontrol.c | 97 
> +++--
>  1 file changed, 5 insertions(+), 92 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d353d9e1fdca..293db8234179 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -54,6 +54,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include "internal.h"
> @@ -129,12 +130,10 @@ static const char * const mem_cgroup_lru_names[] = {
>  enum mem_cgroup_events_target {
>   MEM_CGROUP_TARGET_THRESH,
>   MEM_CGROUP_TARGET_SOFTLIMIT,
> - MEM_CGROUP_TARGET_NUMAINFO,
>   MEM_CGROUP_NTARGETS,
>  };
>  #define THRESHOLDS_EVENTS_TARGET 128
>  #define SOFTLIMIT_EVENTS_TARGET 1024
> -#define NUMAINFO_EVENTS_TARGET   1024
>  
>  struct mem_cgroup_stat_cpu {
>   long count[MEM_CGROUP_STAT_NSTATS];
> @@ -352,11 +351,6 @@ struct mem_cgroup {
>  #endif
>  
>   int last_scanned_node;
> -#if MAX_NUMNODES > 1
> - nodemask_t  scan_nodes;
> - atomic_tnumainfo_events;
> - atomic_tnumainfo_updating;
> -#endif
>  
>   /* List of events which userspace want to receive */
>   struct list_head event_list;
> @@ -965,9 +959,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup 
> *memcg,
>   case MEM_CGROUP_TARGET_SOFTLIMIT:
>   next = val + SOFTLIMIT_EVENTS_TARGET;
>   break;
> - case MEM_CGROUP_TARGET_NUMAINFO:
> - next = val + NUMAINFO_EVENTS_TARGET;
> - break;
>   default:
>   break;
>   }
> @@ -986,22 +977,10 @@ static void memcg_check_events(struct mem_cgroup 
> *memcg, struct page *page)
>   /* threshold event is triggered in finer grain than soft limit */
>   if (unlikely(mem_cgroup_event_ratelimit(memcg,
>   MEM_CGROUP_TARGET_THRESH))) {
> - bool do_softlimit;
> - bool do_numainfo __maybe_unused;
> -
> - do_softlimit = mem_cgroup_event_ratelimit(memcg,
> - MEM_CGROUP_TARGET_SOFTLIMIT);
> -#if MAX_NUMNODES > 1
> - do_numainfo = mem_cgroup_event_ratelimit(memcg,
> - MEM_CGROUP_TARGET_NUMAINFO);
> -#endif
>   mem_cgroup_threshold(memcg);
> - if (unlikely(do_softlimit))
> + if (mem_cgroup_event_ratelimit(memcg,
> +MEM_CGROUP_TARGET_SOFTLIMIT))
>   mem_cgroup_update_tree(memcg, page);
> -#if MAX_NUMNODES > 1
> - if (unlikely(do_numainfo))
> - atomic_inc(&memcg->numainfo_eve

Re: [patch 1/4] mm: memcontrol: inline memcg->move_lock locking

2014-10-21 Thread Vladimir Davydov
On Tue, Oct 21, 2014 at 04:21:33PM -0400, Johannes Weiner wrote:
> The wrappers around taking and dropping the memcg->move_lock spinlock
> add nothing of value.  Inline the spinlock calls into the callsites.
> 
> Signed-off-by: Johannes Weiner 

Acked-by: Vladimir Davydov 

> ---
>  mm/memcontrol.c | 34 +-
>  1 file changed, 9 insertions(+), 25 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 293db8234179..1ff125d2a427 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1507,23 +1507,6 @@ static bool mem_cgroup_wait_acct_move(struct 
> mem_cgroup *memcg)
>   return false;
>  }
>  
> -/*
> - * Take this lock when
> - * - a code tries to modify page's memcg while it's USED.
> - * - a code tries to modify page state accounting in a memcg.
> - */
> -static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
> -   unsigned long *flags)
> -{
> - spin_lock_irqsave(&memcg->move_lock, *flags);
> -}
> -
> -static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
> - unsigned long *flags)
> -{
> - spin_unlock_irqrestore(&memcg->move_lock, *flags);
> -}
> -
>  #define K(x) ((x) << (PAGE_SHIFT-10))
>  /**
>   * mem_cgroup_print_oom_info: Print OOM information relevant to memory 
> controller.
> @@ -2013,7 +1996,7 @@ again:
>   return;
>   /*
>* If this memory cgroup is not under account moving, we don't
> -  * need to take move_lock_mem_cgroup(). Because we already hold
> +  * need to take &memcg->move_lock. Because we already hold
>* rcu_read_lock(), any calls to move_account will be delayed until
>* rcu_read_unlock().
>*/
> @@ -2021,9 +2004,9 @@ again:
>   if (atomic_read(&memcg->moving_account) <= 0)
>   return;
>  
> - move_lock_mem_cgroup(memcg, flags);
> + spin_lock_irqsave(&memcg->move_lock, *flags);
>   if (memcg != pc->mem_cgroup) {
> - move_unlock_mem_cgroup(memcg, flags);
> + spin_unlock_irqrestore(&memcg->move_lock, *flags);
>   goto again;
>   }
>   *locked = true;
> @@ -2038,7 +2021,7 @@ void __mem_cgroup_end_update_page_stat(struct page 
> *page, unsigned long *flags)
>* lock is held because a routine modifies pc->mem_cgroup
>* should take move_lock_mem_cgroup().
>*/
> - move_unlock_mem_cgroup(pc->mem_cgroup, flags);
> + spin_unlock_irqrestore(&pc->mem_cgroup->move_lock, *flags);
>  }
>  
>  void mem_cgroup_update_page_stat(struct page *page,
> @@ -3083,7 +3066,7 @@ static int mem_cgroup_move_account(struct page *page,
>   if (pc->mem_cgroup != from)
>   goto out_unlock;
>  
> - move_lock_mem_cgroup(from, &flags);
> + spin_lock_irqsave(&from->move_lock, flags);
>  
>   if (!PageAnon(page) && page_mapped(page)) {
>   __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
> @@ -3107,7 +3090,8 @@ static int mem_cgroup_move_account(struct page *page,
>  
>   /* caller should have done css_get */
>   pc->mem_cgroup = to;
> - move_unlock_mem_cgroup(from, &flags);
> + spin_unlock_irqrestore(&from->move_lock, flags);
> +
>   ret = 0;
>  
>   local_irq_disable();
> @@ -6033,9 +6017,9 @@ void mem_cgroup_migrate(struct page *oldpage, struct 
> page *newpage,
>* but there might still be references, e.g. from finishing
>* writeback.  Follow the charge moving protocol here.
>*/
> - move_lock_mem_cgroup(memcg, &flags);
> + spin_lock_irqsave(&memcg->move_lock, flags);
>   pc->mem_cgroup = NULL;
> - move_unlock_mem_cgroup(memcg, &flags);
> + spin_unlock_irqrestore(&memcg->move_lock, flags);
>  
>   if (lrucare)
>   unlock_page_lru(oldpage, isolated);
> -- 
> 2.1.2
> 


Re: [patch 2/4] mm: memcontrol: don't pass a NULL memcg to mem_cgroup_end_move()

2014-10-21 Thread Vladimir Davydov
On Tue, Oct 21, 2014 at 04:21:34PM -0400, Johannes Weiner wrote:
> mem_cgroup_end_move() checks if the passed memcg is NULL, along with a
> lengthy comment to explain why this seemingly non-sensical situation
> is even possible.
> 
> Check in cancel_attach() itself whether can_attach() set up the move
> context or not, it's a lot more obvious from there.  Then remove the
> check and comment in mem_cgroup_end_move().
> 
> Signed-off-by: Johannes Weiner 

Acked-by: Vladimir Davydov 

> ---
>  mm/memcontrol.c | 13 -
>  1 file changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1ff125d2a427..c1fe774d712a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1452,14 +1452,8 @@ static void mem_cgroup_start_move(struct mem_cgroup 
> *memcg)
>  
>  static void mem_cgroup_end_move(struct mem_cgroup *memcg)
>  {
> - /*
> -  * Now, mem_cgroup_clear_mc() may call this function with NULL.
> -  * We check NULL in callee rather than caller.
> -  */
> - if (memcg) {
> - atomic_dec(&memcg_moving);
> - atomic_dec(&memcg->moving_account);
> - }
> + atomic_dec(&memcg_moving);
> + atomic_dec(&memcg->moving_account);
>  }
>  
>  /*
> @@ -5383,7 +5377,8 @@ static int mem_cgroup_can_attach(struct 
> cgroup_subsys_state *css,
>  static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
>struct cgroup_taskset *tset)
>  {
> - mem_cgroup_clear_mc();
> + if (mc.to)
> + mem_cgroup_clear_mc();
>  }
>  
>  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
> -- 
> 2.1.2
> 


Re: [patch 1/4] mm: memcontrol: uncharge pages on swapout

2014-10-22 Thread Vladimir Davydov
On Tue, Oct 21, 2014 at 05:03:28PM -0400, Johannes Weiner wrote:
> On Tue, Oct 21, 2014 at 04:52:52PM +0400, Vladimir Davydov wrote:
> > On Mon, Oct 20, 2014 at 11:22:09AM -0400, Johannes Weiner wrote:
> > > mem_cgroup_swapout() is called with exclusive access to the page at
> > > the end of the page's lifetime.  Instead of clearing the PCG_MEMSW
> > > flag and deferring the uncharge, just do it right away.  This allows
> > > follow-up patches to simplify the uncharge code.
> > > 
> > > Signed-off-by: Johannes Weiner 
> > > ---
> > >  mm/memcontrol.c | 17 +
> > >  1 file changed, 13 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index bea3fddb3372..7709f17347f3 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -5799,6 +5799,7 @@ static void __init enable_swap_cgroup(void)
> > >   */
> > >  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> > >  {
> > > + struct mem_cgroup *memcg;
> > >   struct page_cgroup *pc;
> > >   unsigned short oldid;
> > >  
> > > @@ -5815,13 +5816,21 @@ void mem_cgroup_swapout(struct page *page, 
> > > swp_entry_t entry)
> > >   return;
> > >  
> > >   VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
> > > + memcg = pc->mem_cgroup;
> > >  
> > > - oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
> > > + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
> > >   VM_BUG_ON_PAGE(oldid, page);
> > > + mem_cgroup_swap_statistics(memcg, true);
> > >  
> > > - pc->flags &= ~PCG_MEMSW;
> > > - css_get(&pc->mem_cgroup->css);
> > > - mem_cgroup_swap_statistics(pc->mem_cgroup, true);
> > > + pc->flags = 0;
> > > +
> > > + if (!mem_cgroup_is_root(memcg))
> > > + page_counter_uncharge(&memcg->memory, 1);
> > 
> > AFAIU it removes batched uncharge of swapped out pages, doesn't it? Will
> > it affect performance?
> 
> During swapout and with lockless page counters?  I don't think so.

How is this different from page cache out? I mean, we can have a lot of
pages in the swap cache that have already been swapped out, and are
waiting to be unmapped, uncharged, and freed, just like usual page
cache. Why do we use batching for file cache pages then?

> 
> > Besides, it looks asymmetric with respect to the page cache uncharge
> > path, where we still defer uncharge to mem_cgroup_uncharge_list(), and I
> > personally rather dislike this asymmetry.
> 
> The asymmetry is inherent in the fact that we mave memory and
> memory+swap accounting, and here a memory charge is transferred out to
> swap.  Before, the asymmetry was in mem_cgroup_uncharge_list() where
> we separate out memory and memsw pages (which the next patch fixes).

I agree that memsw is inherently asymmetric, but IMO it isn't the case
for swap *cache* vs page *cache*. We handle them similarly - removing
from a mapping, uncharging, freeing. If one wants batching, why
shouldn't the other?
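
To put the batching question in concrete terms, here is a rough
userspace model - the struct and function names are made up, this is
not the kernel code - contrasting one counter update per page with a
single update for a whole list:

#include <stdio.h>

/* Toy stand-in for a page counter; the real thing is res_counter/page_counter. */
struct toy_counter {
	long usage;
	long updates;	/* how many times the counter was touched */
};

static void toy_uncharge(struct toy_counter *c, long nr_pages)
{
	c->usage -= nr_pages;
	c->updates++;
}

int main(void)
{
	struct toy_counter per_page = { .usage = 1000 };
	struct toy_counter batched  = { .usage = 1000 };
	int i, nr = 128;

	/* Per-page: one update for every page, as on the swapout path above. */
	for (i = 0; i < nr; i++)
		toy_uncharge(&per_page, 1);

	/* Batched: accumulate while walking the page list, update once,
	 * as the page cache uncharge path does. */
	toy_uncharge(&batched, nr);

	printf("per-page: %ld updates, batched: %ld update\n",
	       per_page.updates, batched.updates);
	return 0;
}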

> 
> So nothing changed, the ugliness was just moved around.  I actually
> like it better now that it's part of the swap controller, because
> that's where the nastiness actually comes from.  This will all go away
> when we account swap separately.  Then, swapped pages can keep their
> memory charge until mem_cgroup_uncharge() again and the swap charge
> will be completely independent from it.  This reshuffling is just
> necessary because it allows us to get rid of the per-page flag.

Do you mean that swap cache uncharge batching will be back soon?

> 
> > > + local_irq_disable();
> > > + mem_cgroup_charge_statistics(memcg, page, -1);
> > > + memcg_check_events(memcg, page);
> > > + local_irq_enable();
> > 
> > AFAICT mem_cgroup_swapout() is called under mapping->tree_lock with irqs
> > disabled, so we should use irq_save/restore here.
> 
> Good catch!  I don't think this function actually needs to be called
> under the tree_lock, so I'd rather send a follow-up that moves it out.

That's exactly what I thought after sending that message.
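
For reference, the difference matters because local_irq_enable()
unconditionally re-enables interrupts, whereas local_irq_save/restore
preserves whatever state the caller had. A crude userspace model of
that behaviour (toy functions, not the kernel primitives):

#include <assert.h>
#include <stdbool.h>

static bool irqs_on = true;	/* models the CPU interrupt-enable flag */

static void irq_disable(void)		{ irqs_on = false; }
static void irq_enable(void)		{ irqs_on = true; }
static void irq_save(bool *flags)	{ *flags = irqs_on; irqs_on = false; }
static void irq_restore(bool flags)	{ irqs_on = flags; }

/* Callee using save/restore: leaves the caller's state untouched. */
static void update_stats_safe(void)
{
	bool flags;

	irq_save(&flags);
	/* ... charge statistics, event checks ... */
	irq_restore(flags);
}

/* Callee using plain disable/enable: re-enables irqs behind the caller's back. */
static void update_stats_plain(void)
{
	irq_disable();
	/* ... */
	irq_enable();
}

int main(void)
{
	irq_disable();			/* caller holds mapping->tree_lock with irqs off */
	update_stats_safe();
	assert(!irqs_on);		/* still off: correct */

	update_stats_plain();
	assert(irqs_on);		/* turned back on: the bug being discussed */
	return 0;
}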

> For now, this should be sufficient:
> 
> ---
> 
> From 3a40bd3b85a70db104ade873007dbb84b5117993 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner 
> Date: Tue, 21 Oct 2014 16:53:14 -0400
> Subject: [patch] mm: memcontrol: uncharge pages on swapout fix
> 
> Vladimir notes:
> 
> > > +   local_irq_disable();
> > > +   mem_c

Re: [patch 1/4] mm: memcontrol: uncharge pages on swapout

2014-10-22 Thread Vladimir Davydov
On Wed, Oct 22, 2014 at 09:20:38AM -0400, Johannes Weiner wrote:
> On Wed, Oct 22, 2014 at 12:33:53PM +0400, Vladimir Davydov wrote:
> > On Tue, Oct 21, 2014 at 05:03:28PM -0400, Johannes Weiner wrote:
> > > On Tue, Oct 21, 2014 at 04:52:52PM +0400, Vladimir Davydov wrote:
> > > > On Mon, Oct 20, 2014 at 11:22:09AM -0400, Johannes Weiner wrote:
> > > > > mem_cgroup_swapout() is called with exclusive access to the page at
> > > > > the end of the page's lifetime.  Instead of clearing the PCG_MEMSW
> > > > > flag and deferring the uncharge, just do it right away.  This allows
> > > > > follow-up patches to simplify the uncharge code.
> > > > > 
> > > > > Signed-off-by: Johannes Weiner 
> > > > > ---
> > > > >  mm/memcontrol.c | 17 +
> > > > >  1 file changed, 13 insertions(+), 4 deletions(-)
> > > > > 
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index bea3fddb3372..7709f17347f3 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -5799,6 +5799,7 @@ static void __init enable_swap_cgroup(void)
> > > > >   */
> > > > >  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> > > > >  {
> > > > > + struct mem_cgroup *memcg;
> > > > >   struct page_cgroup *pc;
> > > > >   unsigned short oldid;
> > > > >  
> > > > > @@ -5815,13 +5816,21 @@ void mem_cgroup_swapout(struct page *page, 
> > > > > swp_entry_t entry)
> > > > >   return;
> > > > >  
> > > > >   VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
> > > > > + memcg = pc->mem_cgroup;
> > > > >  
> > > > > - oldid = swap_cgroup_record(entry, 
> > > > > mem_cgroup_id(pc->mem_cgroup));
> > > > > + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
> > > > >   VM_BUG_ON_PAGE(oldid, page);
> > > > > + mem_cgroup_swap_statistics(memcg, true);
> > > > >  
> > > > > - pc->flags &= ~PCG_MEMSW;
> > > > > - css_get(&pc->mem_cgroup->css);
> > > > > - mem_cgroup_swap_statistics(pc->mem_cgroup, true);
> > > > > + pc->flags = 0;
> > > > > +
> > > > > + if (!mem_cgroup_is_root(memcg))
> > > > > + page_counter_uncharge(&memcg->memory, 1);
> > > > 
> > > > AFAIU it removes batched uncharge of swapped out pages, doesn't it? Will
> > > > it affect performance?
> > > 
> > > During swapout and with lockless page counters?  I don't think so.
> > 
> > How is this different from page cache out? I mean, we can have a lot of
> > pages in the swap cache that have already been swapped out, and are
> > waiting to be unmapped, uncharged, and freed, just like usual page
> > cache. Why do we use batching for file cache pages then?
> 
> The batching is mostly for munmap().  We do it for reclaim because
> it's convenient, but I don't think an extra word per struct page to
> batch one, sometimes a few, locked subtractions per swapped out page
> is a reasonable trade-off.
> 
> > > > Besides, it looks asymmetric with respect to the page cache uncharge
> > > > path, where we still defer uncharge to mem_cgroup_uncharge_list(), and I
> > > > personally rather dislike this asymmetry.
> > > 
> > > The asymmetry is inherent in the fact that we mave memory and
> > > memory+swap accounting, and here a memory charge is transferred out to
> > > swap.  Before, the asymmetry was in mem_cgroup_uncharge_list() where
> > > we separate out memory and memsw pages (which the next patch fixes).
> > 
> > I agree that memsw is inherently asymmetric, but IMO it isn't the case
> > for swap *cache* vs page *cache*. We handle them similarly - removing
> > from a mapping, uncharging, freeing. If one wants batching, why
> > shouldn't the other?
> 
> It has to be worth it in practical terms.  You can argue symmetry
> between swap cache and page cache, but swapping simply is a much
> colder path than reclaiming page cache.  Our reclaim algorithm avoids
> it like the plague.
> 
> > > So nothing changed, the ugliness was just moved around.  I actually
> > > like it better now that it's part 

Re: [patch 2/4] mm: memcontrol: remove unnecessary PCG_MEMSW memory+swap charge flag

2014-10-22 Thread Vladimir Davydov
On Mon, Oct 20, 2014 at 11:22:10AM -0400, Johannes Weiner wrote:
> Now that mem_cgroup_swapout() fully uncharges the page, every page
> that is still in use when reaching mem_cgroup_uncharge() is known to
> carry both the memory and the memory+swap charge.  Simplify the
> uncharge path and remove the PCG_MEMSW page flag accordingly.
> 
> Signed-off-by: Johannes Weiner 

Reviewed-by: Vladimir Davydov 

> ---
>  include/linux/page_cgroup.h |  1 -
>  mm/memcontrol.c | 34 --
>  2 files changed, 12 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 5c831f1eca79..da62ee2be28b 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -5,7 +5,6 @@ enum {
>   /* flags for mem_cgroup */
>   PCG_USED = 0x01,/* This page is charged to a memcg */
>   PCG_MEM = 0x02, /* This page holds a memory charge */
> - PCG_MEMSW = 0x04,   /* This page holds a memory+swap charge */
>  };
>  
>  struct pglist_data;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7709f17347f3..9bab35fc3e9e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2606,7 +2606,7 @@ static void commit_charge(struct page *page, struct 
> mem_cgroup *memcg,
>*   have the page locked
>*/
>   pc->mem_cgroup = memcg;
> - pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
> + pc->flags = PCG_USED | PCG_MEM;
>  
>   if (lrucare)
>   unlock_page_lru(page, isolated);
> @@ -5815,7 +5815,6 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t 
> entry)
>   if (!PageCgroupUsed(pc))
>   return;
>  
> - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
>   memcg = pc->mem_cgroup;
>  
>   oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
> @@ -6010,17 +6009,16 @@ void mem_cgroup_cancel_charge(struct page *page, 
> struct mem_cgroup *memcg)
>  }
>  
>  static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
> -unsigned long nr_mem, unsigned long nr_memsw,
>  unsigned long nr_anon, unsigned long nr_file,
>  unsigned long nr_huge, struct page *dummy_page)
>  {
> + unsigned long nr_pages = nr_anon + nr_file;
>   unsigned long flags;
>  
>   if (!mem_cgroup_is_root(memcg)) {
> - if (nr_mem)
> - page_counter_uncharge(&memcg->memory, nr_mem);
> - if (nr_memsw)
> - page_counter_uncharge(&memcg->memsw, nr_memsw);
> + page_counter_uncharge(&memcg->memory, nr_pages);
> + if (do_swap_account)
> + page_counter_uncharge(&memcg->memsw, nr_pages);
>   memcg_oom_recover(memcg);
>   }
>  
> @@ -6029,23 +6027,21 @@ static void uncharge_batch(struct mem_cgroup *memcg, 
> unsigned long pgpgout,
>   __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
>   __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
>   __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
> - __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
> + __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
>   memcg_check_events(memcg, dummy_page);
>   local_irq_restore(flags);
>  
>   if (!mem_cgroup_is_root(memcg))
> - css_put_many(&memcg->css, max(nr_mem, nr_memsw));
> + css_put_many(&memcg->css, nr_pages);
>  }
>  
>  static void uncharge_list(struct list_head *page_list)
>  {
>   struct mem_cgroup *memcg = NULL;
> - unsigned long nr_memsw = 0;
>   unsigned long nr_anon = 0;
>   unsigned long nr_file = 0;
>   unsigned long nr_huge = 0;
>   unsigned long pgpgout = 0;
> - unsigned long nr_mem = 0;
>   struct list_head *next;
>   struct page *page;
>  
> @@ -6072,10 +6068,9 @@ static void uncharge_list(struct list_head *page_list)
>  
>   if (memcg != pc->mem_cgroup) {
>   if (memcg) {
> - uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
> -nr_anon, nr_file, nr_huge, page);
> - pgpgout = nr_mem = nr_memsw = 0;
> - nr_anon = nr_file = nr_huge = 0;
> + uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
> +nr_huge, page);
> +   

Re: [patch 4/4] mm: memcontrol: remove unnecessary PCG_USED pc->mem_cgroup valid flag

2014-10-22 Thread Vladimir Davydov
On Mon, Oct 20, 2014 at 11:22:12AM -0400, Johannes Weiner wrote:
> pc->mem_cgroup had to be left intact after uncharge for the final LRU
> removal, and !PCG_USED indicated whether the page was uncharged.  But
> since 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") pages are
> uncharged after the final LRU removal.  Uncharge can simply clear the
> pointer and the PCG_USED/PageCgroupUsed sites can test that instead.
> 
> Because this is the last page_cgroup flag, this patch reduces the
> memcg per-page overhead to a single pointer.
> 
> Signed-off-by: Johannes Weiner 

Reviewed-by: Vladimir Davydov 

> ---
>  include/linux/page_cgroup.h |  10 -
>  mm/memcontrol.c | 107 
> +---
>  2 files changed, 42 insertions(+), 75 deletions(-)
> 
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 97536e685843..1289be6b436c 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,11 +1,6 @@
>  #ifndef __LINUX_PAGE_CGROUP_H
>  #define __LINUX_PAGE_CGROUP_H
>  
> -enum {
> - /* flags for mem_cgroup */
> - PCG_USED = 0x01,/* This page is charged to a memcg */
> -};
> -
>  struct pglist_data;
>  
>  #ifdef CONFIG_MEMCG
> @@ -19,7 +14,6 @@ struct mem_cgroup;
>   * then the page cgroup for pfn always exists.
>   */
>  struct page_cgroup {
> - unsigned long flags;
>   struct mem_cgroup *mem_cgroup;
>  };
>  
> @@ -39,10 +33,6 @@ static inline void page_cgroup_init(void)
>  
>  struct page_cgroup *lookup_page_cgroup(struct page *page);
>  
> -static inline int PageCgroupUsed(struct page_cgroup *pc)
> -{
> - return !!(pc->flags & PCG_USED);
> -}
>  #else /* !CONFIG_MEMCG */
>  struct page_cgroup;
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1d66ac49e702..48d49c6b08d1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1284,14 +1284,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page 
> *page, struct zone *zone)
>  
>   pc = lookup_page_cgroup(page);
>   memcg = pc->mem_cgroup;
> -
>   /*
>* Swapcache readahead pages are added to the LRU - and
> -  * possibly migrated - before they are charged.  Ensure
> -  * pc->mem_cgroup is sane.
> +  * possibly migrated - before they are charged.
>*/
> - if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
> - pc->mem_cgroup = memcg = root_mem_cgroup;
> + if (!memcg)
> + memcg = root_mem_cgroup;
>  
>   mz = mem_cgroup_page_zoneinfo(memcg, page);
>   lruvec = &mz->lruvec;
> @@ -2141,7 +2139,7 @@ void __mem_cgroup_begin_update_page_stat(struct page 
> *page,
>   pc = lookup_page_cgroup(page);
>  again:
>   memcg = pc->mem_cgroup;
> - if (unlikely(!memcg || !PageCgroupUsed(pc)))
> + if (unlikely(!memcg))
>   return;
>   /*
>* If this memory cgroup is not under account moving, we don't
> @@ -2154,7 +2152,7 @@ again:
>   return;
>  
>   move_lock_mem_cgroup(memcg, flags);
> - if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
> + if (memcg != pc->mem_cgroup) {
>   move_unlock_mem_cgroup(memcg, flags);
>   goto again;
>   }
> @@ -2186,7 +2184,7 @@ void mem_cgroup_update_page_stat(struct page *page,
>  
>   pc = lookup_page_cgroup(page);
>   memcg = pc->mem_cgroup;
> - if (unlikely(!memcg || !PageCgroupUsed(pc)))
> + if (unlikely(!memcg))
>   return;
>  
>   this_cpu_add(memcg->stat->count[idx], val);
> @@ -2525,9 +2523,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct 
> page *page)
>   VM_BUG_ON_PAGE(!PageLocked(page), page);
>  
>   pc = lookup_page_cgroup(page);
> - if (PageCgroupUsed(pc)) {
> - memcg = pc->mem_cgroup;
> - if (memcg && !css_tryget_online(&memcg->css))
> + memcg = pc->mem_cgroup;
> +
> + if (memcg) {
> + if (!css_tryget_online(&memcg->css))
>   memcg = NULL;
>   } else if (PageSwapCache(page)) {
>   ent.val = page_private(page);
> @@ -2578,7 +2577,7 @@ static void commit_charge(struct page *page, struct 
> mem_cgroup *memcg,
>   struct page_cgroup *pc = lookup_page_cgroup(page);
>   int isolated;
>  
> - VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
> + VM_BUG_ON_PAGE(pc->mem_cgroup, page);
>   /*
>* we don't need page_cgroup_lock about tail pages, becase they are not
> 

Re: [patch 3/4] mm: memcontrol: remove unnecessary PCG_MEM memory charge flag

2014-10-22 Thread Vladimir Davydov
On Mon, Oct 20, 2014 at 11:22:11AM -0400, Johannes Weiner wrote:
> PCG_MEM is a remnant from an earlier version of 0a31bc97c80c ("mm:
> memcontrol: rewrite uncharge API"), used to tell whether migration
> cleared a charge while leaving pc->mem_cgroup valid and PCG_USED set.
> But in the final version, mem_cgroup_migrate() directly uncharges the
> source page, rendering this distinction unnecessary.  Remove it.
> 
> Signed-off-by: Johannes Weiner 

Reviewed-by: Vladimir Davydov 

> ---
>  include/linux/page_cgroup.h | 1 -
>  mm/memcontrol.c | 4 +---
>  2 files changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index da62ee2be28b..97536e685843 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -4,7 +4,6 @@
>  enum {
>   /* flags for mem_cgroup */
>   PCG_USED = 0x01,/* This page is charged to a memcg */
> - PCG_MEM = 0x02, /* This page holds a memory charge */
>  };
>  
>  struct pglist_data;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9bab35fc3e9e..1d66ac49e702 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2606,7 +2606,7 @@ static void commit_charge(struct page *page, struct 
> mem_cgroup *memcg,
>*   have the page locked
>*/
>   pc->mem_cgroup = memcg;
> - pc->flags = PCG_USED | PCG_MEM;
> + pc->flags = PCG_USED;
>  
>   if (lrucare)
>   unlock_page_lru(page, isolated);
> @@ -6177,8 +6177,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct 
> page *newpage,
>   if (!PageCgroupUsed(pc))
>   return;
>  
> - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
> -
>   if (lrucare)
>   lock_page_lru(oldpage, &isolated);
>  
> -- 
> 2.1.2
> 


[PATCH -mm] memcg: zap kmem_account_flags

2014-11-05 Thread Vladimir Davydov
The only such flag is KMEM_ACCOUNTED_ACTIVE, but it's set iff
mem_cgroup->kmemcg_id >= 0, so we can check kmemcg_id instead of having
a separate flags field.
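
The idea in a nutshell, as a toy sketch with illustrative names rather
than the real memcg structures - a negative kmemcg_id plays the role of
the old "inactive" bit:

#include <assert.h>
#include <stdbool.h>

/* Illustrative stand-in for struct mem_cgroup: kmemcg_id doubles as the
 * "kmem active" indicator, a negative id meaning "not activated yet". */
struct toy_memcg {
	int kmemcg_id;
};

static bool toy_kmem_is_active(const struct toy_memcg *memcg)
{
	return memcg->kmemcg_id >= 0;
}

int main(void)
{
	struct toy_memcg memcg = { .kmemcg_id = -1 };	/* freshly created */

	assert(!toy_kmem_is_active(&memcg));
	memcg.kmemcg_id = 3;		/* activation assigns a valid id last */
	assert(toy_kmem_is_active(&memcg));
	return 0;
}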

Signed-off-by: Vladimir Davydov 
---
 mm/memcontrol.c |   25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0c315c99122d..9a37d99aee54 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,7 +296,6 @@ struct mem_cgroup {
 * Should the accounting and control be hierarchical, per subtree?
 */
bool use_hierarchy;
-   unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 
	bool		oom_lock;
	atomic_t	under_oom;
@@ -363,22 +362,11 @@ struct mem_cgroup {
/* WARNING: nodeinfo must be the last member here */
 };
 
-/* internal only representation about the status of kmem accounting. */
-enum {
-   KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
-};
-
 #ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
-   set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
-   return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+   return memcg->kmemcg_id >= 0;
 }
-
 #endif
 
 /* Stuffs for move charges at task migration. */
@@ -3471,22 +3459,21 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
goto out;
}
 
-   memcg->kmemcg_id = memcg_id;
-
/*
-* We couldn't have accounted to this cgroup, because it hasn't got the
-* active bit set yet, so this should succeed.
+* We couldn't have accounted to this cgroup, because it hasn't got
+* activated yet, so this should succeed.
 */
err = page_counter_limit(&memcg->kmem, nr_pages);
VM_BUG_ON(err);
 
static_key_slow_inc(&memcg_kmem_enabled_key);
/*
-* Setting the active bit after enabling static branching will
+* A memory cgroup is considered kmem-active as soon as it gets
+* kmemcg_id. Setting the id after enabling static branching will
 * guarantee no one starts accounting before all call sites are
 * patched.
 */
-   memcg_kmem_set_active(memcg);
+   memcg->kmemcg_id = memcg_id;
 out:
memcg_resume_kmem_account();
return err;
-- 
1.7.10.4



[PATCH -mm] memcg: __mem_cgroup_free: remove stale disarm_static_keys comment

2014-11-05 Thread Vladimir Davydov
The cpuset code stopped using cgroup_lock in favor of cpuset_mutex long ago.

Signed-off-by: Vladimir Davydov 
---
 mm/memcontrol.c |   11 ---
 1 file changed, 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9a37d99aee54..95ee47c0f0a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4610,17 +4610,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
free_percpu(memcg->stat);
 
-   /*
-* We need to make sure that (at least for now), the jump label
-* destruction code runs outside of the cgroup lock. This is because
-* get_online_cpus(), which is called from the static_branch update,
-* can't be called inside the cgroup_lock. cpusets are the ones
-* enforcing this dependency, so if they ever change, we might as well.
-*
-* schedule_work() will guarantee this happens. Be careful if you need
-* to move this code around, and make sure it is outside
-* the cgroup_lock.
-*/
disarm_static_keys(memcg);
kfree(memcg);
 }
-- 
1.7.10.4



[PATCH -mm] memcg: don't check mm in __memcg_kmem_{get_cache,newpage_charge}

2014-11-05 Thread Vladimir Davydov
We have already ensured that the current task has mm in
memcg_kmem_should_charge, so there is no need to double-check.

Signed-off-by: Vladimir Davydov 
---
 mm/memcontrol.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 95ee47c0f0a2..f61ecbc97d30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2687,7 +2687,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct 
kmem_cache *cachep,
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
 
-   if (!current->mm || current->memcg_kmem_skip_account)
+   if (current->memcg_kmem_skip_account)
return cachep;
 
rcu_read_lock();
@@ -2773,7 +2773,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup 
**_memcg, int order)
 * allocations are extremely rare but can happen, for instance, for the
 * cache arrays. We bring this test here.
 */
-   if (!current->mm || current->memcg_kmem_skip_account)
+   if (current->memcg_kmem_skip_account)
return true;
 
memcg = get_mem_cgroup_from_mm(current->mm);
-- 
1.7.10.4



[PATCH -mm 2/3] memcg: turn memcg_kmem_skip_account into a bit field

2014-11-05 Thread Vladimir Davydov
It isn't supposed to stack, so turn it into a bit-field to save 4 bytes
on the task_struct.
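
For illustration only (the structs below are made up), the saving comes
from packing the flag into the same word as the other :1 bits instead of
keeping a whole int:

#include <stdio.h>

struct before {
	unsigned other_flag:1;		/* existing :1 flags occupy one word */
	unsigned int skip_account;	/* separate 4-byte counter-style field */
};

struct after {
	unsigned other_flag:1;
	unsigned skip_account:1;	/* packs into the same word as other_flag */
};

int main(void)
{
	printf("before: %zu bytes, after: %zu bytes\n",
	       sizeof(struct before), sizeof(struct after));
	return 0;
}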

Also, remove the memcg_stop/resume_kmem_account helpers - it is clearer
to set/clear the flag inline. As for the lengthy comment on the helpers,
which this patch removes as well, we already have a compact yet accurate
explanation in memcg_schedule_cache_create, so there is no need for
another one.

Signed-off-by: Vladimir Davydov 
---
 include/linux/sched.h |7 +--
 mm/memcontrol.c   |   35 ++-
 2 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40497a2ed2d4..7b08b0240736 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1364,6 +1364,10 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
 
+#ifdef CONFIG_MEMCG_KMEM
+   unsigned memcg_kmem_skip_account:1;
+#endif
+
unsigned long atomic_flags; /* Flags needing atomic access. */
 
pid_t pid;
@@ -1684,8 +1688,7 @@ struct task_struct {
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
-#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
-   unsigned int memcg_kmem_skip_account;
+#ifdef CONFIG_MEMCG
struct memcg_oom_info {
struct mem_cgroup *memcg;
gfp_t gfp_mask;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3fe830fdb29..52d1e933bb9f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2575,37 +2575,6 @@ void memcg_update_array_size(int num)
memcg_limited_groups_array_size = num;
 }
 
-/*
- * During the creation a new cache, we need to disable our accounting mechanism
- * altogether. This is true even if we are not creating, but rather just
- * enqueing new caches to be created.
- *
- * This is because that process will trigger allocations; some visible, like
- * explicit kmallocs to auxiliary data structures, name strings and internal
- * cache structures; some well concealed, like INIT_WORK() that can allocate
- * objects during debug.
- *
- * If any allocation happens during memcg_kmem_get_cache, we will recurse back
- * to it. This may not be a bounded recursion: since the first cache creation
- * failed to complete (waiting on the allocation), we'll just try to create the
- * cache again, failing at the same point.
- *
- * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
- * memcg_kmem_skip_account. So we enclose anything that might allocate memory
- * inside the following two functions.
- */
-static inline void memcg_stop_kmem_account(void)
-{
-   VM_BUG_ON(!current->mm);
-   current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
-   VM_BUG_ON(!current->mm);
-   current->memcg_kmem_skip_account--;
-}
-
 struct memcg_cache_create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
@@ -2660,9 +2629,9 @@ static void memcg_schedule_cache_create(struct mem_cgroup 
*memcg,
 * this point we can't allow ourselves back into memcg_kmem_get_cache,
 * the safest choice is to do it like this, wrapping the whole function.
 */
-   memcg_stop_kmem_account();
+   current->memcg_kmem_skip_account = 1;
__memcg_schedule_cache_create(memcg, cachep);
-   memcg_resume_kmem_account();
+   current->memcg_kmem_skip_account = 0;
 }
 
 /*
-- 
1.7.10.4



[PATCH -mm 1/3] memcg: do not abuse memcg_kmem_skip_account

2014-11-05 Thread Vladimir Davydov
task_struct->memcg_kmem_skip_account was initially introduced to avoid
recursion during kmem cache creation: memcg_kmem_get_cache, which is
called by kmem_cache_alloc to determine the per-memcg cache to account
the allocation to, may issue lazy cache creation if the needed cache
doesn't exist, which means issuing yet another kmem_cache_alloc. We
can't just pass a flag to the nested kmem_cache_alloc to disable kmem
accounting, because there are hidden allocations, e.g. in INIT_WORK. So
we introduced a flag on the task_struct, memcg_kmem_skip_account, which
makes memcg_kmem_get_cache return immediately.
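
A stripped-down model of that guard - a thread-local variable stands in
for the task_struct field and the function names are made up:

#include <stdio.h>

static __thread int skip_account;	/* stands in for current->memcg_kmem_skip_account */

static void create_cache(void);

/* Runs on every accounted allocation; must not trigger cache creation
 * again from within cache creation itself. */
static void get_cache(void)
{
	if (skip_account)
		return;			/* nested allocation: fall back to the root cache */
	create_cache();
}

static void alloc_object(void)
{
	get_cache();
	/* ... the actual allocation would happen here ... */
}

/* Cache creation allocates, too (name strings, work items, ...). */
static void create_cache(void)
{
	skip_account = 1;
	alloc_object();			/* would recurse forever without the flag */
	skip_account = 0;
	puts("cache created without recursing");
}

int main(void)
{
	alloc_object();
	return 0;
}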

By its nature, the flag may also be used to disable accounting for
allocations shared among different cgroups, and currently it is used
this way in memcg_activate_kmem. Using it like this looks like an abuse
of the flag to me. If we want to disable accounting for some
allocations (which we will definitely want one day), we should add
either a GFP_NO_MEMCG or a GFP_MEMCG flag in order to
blacklist/whitelist such allocations.

For now, let's simply remove memcg_stop/resume_kmem_account from
memcg_activate_kmem.

Signed-off-by: Vladimir Davydov 
---
 mm/memcontrol.c |7 ---
 1 file changed, 7 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f61ecbc97d30..b3fe830fdb29 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3428,12 +3428,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
return 0;
 
/*
-* We are going to allocate memory for data shared by all memory
-* cgroups so let's stop accounting here.
-*/
-   memcg_stop_kmem_account();
-
-   /*
 * For simplicity, we won't allow this to be disabled.  It also can't
 * be changed if the cgroup has children already, or if tasks had
 * already joined.
@@ -3475,7 +3469,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 */
memcg->kmemcg_id = memcg_id;
 out:
-   memcg_resume_kmem_account();
return err;
 }
 
-- 
1.7.10.4



[PATCH -mm 3/3] memcg: only check memcg_kmem_skip_account in __memcg_kmem_get_cache

2014-11-05 Thread Vladimir Davydov
__memcg_kmem_get_cache can recurse if it calls kmalloc (which it does if
the cgroup's kmem cache doesn't exist), because kmalloc may call
__memcg_kmem_get_cache internally again. To avoid the recursion, we use
the task_struct->memcg_kmem_skip_account flag.

However, there is no need to check the flag in
memcg_kmem_newpage_charge or memcg_kmem_recharge_slab, because there is
no way these two functions could result in recursion if called from
memcg_kmem_get_cache. So let's remove the redundant code.

Signed-off-by: Vladimir Davydov 
---
 mm/memcontrol.c |   31 ---
 1 file changed, 31 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 52d1e933bb9f..d7de40cb3c8e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2717,34 +2717,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup 
**_memcg, int order)
 
*_memcg = NULL;
 
-   /*
-* Disabling accounting is only relevant for some specific memcg
-* internal allocations. Therefore we would initially not have such
-* check here, since direct calls to the page allocator that are
-* accounted to kmemcg (alloc_kmem_pages and friends) only happen
-* outside memcg core. We are mostly concerned with cache allocations,
-* and by having this test at memcg_kmem_get_cache, we are already able
-* to relay the allocation to the root cache and bypass the memcg cache
-* altogether.
-*
-* There is one exception, though: the SLUB allocator does not create
-* large order caches, but rather service large kmallocs directly from
-* the page allocator. Therefore, the following sequence when backed by
-* the SLUB allocator:
-*
-*  memcg_stop_kmem_account();
-*  kmalloc()
-*  memcg_resume_kmem_account();
-*
-* would effectively ignore the fact that we should skip accounting,
-* since it will drive us directly to this function without passing
-* through the cache selector memcg_kmem_get_cache. Such large
-* allocations are extremely rare but can happen, for instance, for the
-* cache arrays. We bring this test here.
-*/
-   if (current->memcg_kmem_skip_account)
-   return true;
-
memcg = get_mem_cgroup_from_mm(current->mm);
 
if (!memcg_kmem_is_active(memcg)) {
@@ -2800,9 +2772,6 @@ int __memcg_kmem_recharge_slab(void *obj, gfp_t gfp)
int nr_pages;
int ret = 0;
 
-   if (current->memcg_kmem_skip_account)
-   goto out;
-
page = virt_to_head_page(obj);
page_memcg = ACCESS_ONCE(page->mem_cgroup);
 
-- 
1.7.10.4



[PATCH TRIVIAL] net: unix_listen: remove unused old_pid variable

2014-11-05 Thread Vladimir Davydov
Signed-off-by: Vladimir Davydov 
---
 net/unix/af_unix.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e96884380732..2948f39b9a4f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -478,7 +478,6 @@ static int unix_listen(struct socket *sock, int backlog)
int err;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
-   struct pid *old_pid = NULL;
 
err = -EOPNOTSUPP;
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -499,7 +498,6 @@ static int unix_listen(struct socket *sock, int backlog)
 
 out_unlock:
unix_state_unlock(sk);
-   put_pid(old_pid);
 out:
return err;
 }
-- 
1.7.10.4


