[PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-18 Thread Glauber Costa
The page allocator is able to bind a page to a memcg when it is
allocated. But for the caches, we'd like to have as many objects as
possible in a page belonging to the same cache.

This is done in this patch by calling memcg_kmem_get_cache in the
beginning of every allocation function. This routine is patched out by
static branches when kernel memory controller is not being used.

It assumes that the task allocating, which determines the memcg in the
page allocator, belongs to the same cgroup throughout the whole process.
Misaccounting can happen if the task calls memcg_kmem_get_cache() while
belonging to a cgroup, and later on changes. This is considered
acceptable, and should only happen upon task migration.

Before the cache is created by the memcg core, there is also a possible
imbalance: the task belongs to a memcg, but the cache being allocated
from is the global cache, since the child cache is not yet guaranteed to
be ready. This case is also fine, since in this case the GFP_KMEMCG will
not be passed and the page allocator will not attempt any cgroup
accounting.

Signed-off-by: Glauber Costa 
CC: Christoph Lameter 
CC: Pekka Enberg 
CC: Michal Hocko 
CC: Kamezawa Hiroyuki 
CC: Johannes Weiner 
CC: Suleiman Souhlal 
---
 include/linux/memcontrol.h |  38 +
 init/Kconfig   |   2 +-
 mm/memcontrol.c| 203 +
 3 files changed, 242 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a5f3055..c44a5f2 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -419,6 +419,8 @@ extern void memcg_init_kmem_cache(void);
 extern void memcg_register_cache(struct mem_cgroup *memcg,
  struct kmem_cache *s);
 extern void memcg_release_cache(struct kmem_cache *cachep);
+struct kmem_cache *
+__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
 #else
 
 static inline void memcg_init_kmem_cache(void)
@@ -460,6 +462,12 @@ static inline void
 __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int 
order)
 {
 }
+
+static inline struct kmem_cache *
+__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+   return cachep;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 /**
@@ -526,5 +534,35 @@ memcg_kmem_commit_charge(struct page *page, struct 
mem_cgroup *memcg, int order)
if (memcg_kmem_enabled() && memcg)
__memcg_kmem_commit_charge(page, memcg, order);
 }
+
+/**
+ * memcg_kmem_get_cache: selects the correct per-memcg cache for 
allocation
+ * @cachep: the original global kmem cache
+ * @gfp: allocation flags.
+ *
+ * This function assumes that the task allocating, which determines the memcg
+ * in the page allocator, belongs to the same cgroup throughout the whole
+ * process.  Misaccounting can happen if the task calls memcg_kmem_get_cache()
+ * while belonging to a cgroup, and later on changes. This is considered
+ * acceptable, and should only happen upon task migration.
+ *
+ * Before the cache is created by the memcg core, there is also a possible
+ * imbalance: the task belongs to a memcg, but the cache being allocated from
+ * is the global cache, since the child cache is not yet guaranteed to be
+ * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
+ * passed and the page allocator will not attempt any cgroup accounting.
+ */
+static __always_inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+   if (!memcg_kmem_enabled())
+   return cachep;
+   if (gfp & __GFP_NOFAIL)
+   return cachep;
+   if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
+   return cachep;
+
+   return __memcg_kmem_get_cache(cachep, gfp);
+}
 #endif /* _LINUX_MEMCONTROL_H */
 
diff --git a/init/Kconfig b/init/Kconfig
index 707d015..31c4f74 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -741,7 +741,7 @@ config MEMCG_SWAP_ENABLED
  then swapaccount=0 does the trick).
 config MEMCG_KMEM
bool "Memory Resource Controller Kernel Memory accounting 
(EXPERIMENTAL)"
-   depends on MEMCG && EXPERIMENTAL
+   depends on MEMCG && EXPERIMENTAL && !SLOB
default n
help
  The Kernel Memory extension for Memory Resource Controller can limit
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 04851bb..1cce5c3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -339,6 +339,11 @@ struct mem_cgroup {
 #ifdef CONFIG_INET
struct tcp_memcontrol tcp_mem;
 #endif
+
+#ifdef CONFIG_MEMCG_KMEM
+   /* Slab accounting */
+   struct kmem_cache *slabs[MAX_KMEM_CACHE_TYPES];
+#endif
 };
 
 enum {
@@ -539,6 +544,40 @@ static inline bool memcg_can_account_kmem(struct 
mem_cgroup *memcg)
 (memcg->kmem_accounted & (KMEM_ACCOUNTED_MASK));
 }
 
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache 
*cachep)

Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-21 Thread Tejun Heo
On Tue, Sep 18, 2012 at 06:12:00PM +0400, Glauber Costa wrote:
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 04851bb..1cce5c3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -339,6 +339,11 @@ struct mem_cgroup {
>  #ifdef CONFIG_INET
>   struct tcp_memcontrol tcp_mem;
>  #endif
> +
> +#ifdef CONFIG_MEMCG_KMEM
> + /* Slab accounting */
> + struct kmem_cache *slabs[MAX_KMEM_CACHE_TYPES];
> +#endif

Bah, 400 entry array in struct mem_cgroup.  Can't we do something a
bit more flexible?

> +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache 
> *cachep)
> +{
> + char *name;
> + struct dentry *dentry;
> +
> + rcu_read_lock();
> + dentry = rcu_dereference(memcg->css.cgroup->dentry);
> + rcu_read_unlock();
> +
> + BUG_ON(dentry == NULL);
> +
> + name = kasprintf(GFP_KERNEL, "%s(%d:%s)",
> + cachep->name, css_id(&memcg->css), dentry->d_name.name);

Maybe including full path is better, I don't know.

> + return name;
> +}
...
>  void __init memcg_init_kmem_cache(void)
> @@ -665,6 +704,170 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
>*/
>   WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
>  }
> +
> +static DEFINE_MUTEX(memcg_cache_mutex);

Blank line missing.  Or if it's used inside memcg_create_kmem_cache()
only move it inside the function?

> +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
> +   struct kmem_cache *cachep)
> +{
> + struct kmem_cache *new_cachep;
> + int idx;
> +
> + BUG_ON(!memcg_can_account_kmem(memcg));

WARN_ON_ONCE() generally preferred.

> + idx = cachep->memcg_params.id;

Ah, okay so the id is assigned to the "base" cache.  Maybe explain it
somewhere?

> + mutex_lock(&memcg_cache_mutex);
> + new_cachep = memcg->slabs[idx];
> + if (new_cachep)
> + goto out;
> +
> + new_cachep = kmem_cache_dup(memcg, cachep);
> +
> + if (new_cachep == NULL) {
> + new_cachep = cachep;
> + goto out;
> + }
> +
> + mem_cgroup_get(memcg);
> + memcg->slabs[idx] = new_cachep;
> + new_cachep->memcg_params.memcg = memcg;
> +out:
> + mutex_unlock(&memcg_cache_mutex);
> + return new_cachep;
> +}
> +
> +struct create_work {
> + struct mem_cgroup *memcg;
> + struct kmem_cache *cachep;
> + struct list_head list;
> +};
> +
> +/* Use a single spinlock for destruction and creation, not a frequent op */
> +static DEFINE_SPINLOCK(cache_queue_lock);
> +static LIST_HEAD(create_queue);
> +
> +/*
> + * Flush the queue of kmem_caches to create, because we're creating a cgroup.
> + *
> + * We might end up flushing other cgroups' creation requests as well, but
> + * they will just get queued again next time someone tries to make a slab
> + * allocation for them.
> + */
> +void memcg_flush_cache_create_queue(void)
> +{
...
> +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
> +struct kmem_cache *cachep)
> +{
> + struct create_work *cw;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&cache_queue_lock, flags);
> + list_for_each_entry(cw, &create_queue, list) {
> + if (cw->memcg == memcg && cw->cachep == cachep) {
> + spin_unlock_irqrestore(&cache_queue_lock, flags);
> + return;
> + }
> + }
> + spin_unlock_irqrestore(&cache_queue_lock, flags);
> +
> + /* The corresponding put will be done in the workqueue. */
> + if (!css_tryget(&memcg->css))
> + return;
> +
> + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
> + if (cw == NULL) {
> + css_put(&memcg->css);
> + return;
> + }
> +
> + cw->memcg = memcg;
> + cw->cachep = cachep;
> + spin_lock_irqsave(&cache_queue_lock, flags);
> + list_add_tail(&cw->list, &create_queue);
> + spin_unlock_irqrestore(&cache_queue_lock, flags);
> +
> + schedule_work(&memcg_create_cache_work);
> +}

Why create your own worklist and flush mechanism?  Just embed a work
item in create_work and use a dedicated workqueue for flushing.

> +/*
> + * Return the kmem_cache we're supposed to use for a slab allocation.
> + * We try to use the current memcg's version of the cache.
> + *
> + * If the cache does not exist yet, if we are the first user of it,
> + * we either create it immediately, if possible, or create it asynchronously
> + * in a workqueue.
> + * In the latter case, we will let the current allocation go through with
> + * the original cache.
> + *
> + * Can't be called in interrupt context or from kernel threads.
> + * This function needs to be called with rcu_read_lock() held.
> + */
> +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
> +   gfp_t gfp)
> +{
> + struct mem_cgroup *memcg;
> + int idx;
> + struct

Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-21 Thread Tejun Heo
Missed some stuff.

On Tue, Sep 18, 2012 at 06:12:00PM +0400, Glauber Costa wrote:
> +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
> +   struct kmem_cache *cachep)
> +{
...
> + memcg->slabs[idx] = new_cachep;
...
> +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
> +   gfp_t gfp)
> +{
...
> + return memcg->slabs[idx];

I think you need memory barriers for the above pair.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-24 Thread Glauber Costa
On 09/22/2012 12:52 AM, Tejun Heo wrote:
> Missed some stuff.
> 
> On Tue, Sep 18, 2012 at 06:12:00PM +0400, Glauber Costa wrote:
>> +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
>> +  struct kmem_cache *cachep)
>> +{
> ...
>> +memcg->slabs[idx] = new_cachep;
> ...
>> +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
>> +  gfp_t gfp)
>> +{
> ...
>> +return memcg->slabs[idx];
> 
> I think you need memory barriers for the above pair.
> 
> Thanks.
> 

Why is that?

We'll either see a value, or NULL. If we see NULL, we assume the cache
is not yet created. Not a big deal.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-24 Thread Glauber Costa
On 09/21/2012 10:32 PM, Tejun Heo wrote:
> On Tue, Sep 18, 2012 at 06:12:00PM +0400, Glauber Costa wrote:
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 04851bb..1cce5c3 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -339,6 +339,11 @@ struct mem_cgroup {
>>  #ifdef CONFIG_INET
>>  struct tcp_memcontrol tcp_mem;
>>  #endif
>> +
>> +#ifdef CONFIG_MEMCG_KMEM
>> +/* Slab accounting */
>> +struct kmem_cache *slabs[MAX_KMEM_CACHE_TYPES];
>> +#endif
> 
> Bah, 400 entry array in struct mem_cgroup.  Can't we do something a
> bit more flexible?
> 

I guess. I still would like it to be an array, so we can easily access
its fields. There are two ways around this:

1) Do like the events mechanism and allocate this in a separate
structure. Add a pointer chase in the access, and I don't think it helps
much because it gets allocated anyway. But we could at least
defer it to the time when we limit the cache.

2) The indexes are only assigned after the slab is FULL. At that time, a
lot of the caches are already initialized. We can, for instance, allow
for "twice the number we have in the system", which already provides
room for a couple of more appearing. Combining with the 1st approach, we
can defer it to limit-time, and then allow for, say, 50 % more caches
than we already have. The pointer chasing may very well be worth it...

>> +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache 
>> *cachep)
>> +{
>> +char *name;
>> +struct dentry *dentry;
>> +
>> +rcu_read_lock();
>> +dentry = rcu_dereference(memcg->css.cgroup->dentry);
>> +rcu_read_unlock();
>> +
>> +BUG_ON(dentry == NULL);
>> +
>> +name = kasprintf(GFP_KERNEL, "%s(%d:%s)",
>> +cachep->name, css_id(&memcg->css), dentry->d_name.name);
> 
> Maybe including full path is better, I don't know.

It can get way too big.

>> +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
>> +   struct kmem_cache *cachep)
>> +{
>> +struct create_work *cw;
>> +unsigned long flags;
>> +
>> +spin_lock_irqsave(&cache_queue_lock, flags);
>> +list_for_each_entry(cw, &create_queue, list) {
>> +if (cw->memcg == memcg && cw->cachep == cachep) {
>> +spin_unlock_irqrestore(&cache_queue_lock, flags);
>> +return;
>> +}
>> +}
>> +spin_unlock_irqrestore(&cache_queue_lock, flags);
>> +
>> +/* The corresponding put will be done in the workqueue. */
>> +if (!css_tryget(&memcg->css))
>> +return;
>> +
>> +cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
>> +if (cw == NULL) {
>> +css_put(&memcg->css);
>> +return;
>> +}
>> +
>> +cw->memcg = memcg;
>> +cw->cachep = cachep;
>> +spin_lock_irqsave(&cache_queue_lock, flags);
>> +list_add_tail(&cw->list, &create_queue);
>> +spin_unlock_irqrestore(&cache_queue_lock, flags);
>> +
>> +schedule_work(&memcg_create_cache_work);
>> +}
> 
> Why create your own worklist and flush mechanism?  Just embed a work
> item in create_work and use a dedicated workqueue for flushing.

I'll take a look at this.

> 
>> +/*
>> + * Return the kmem_cache we're supposed to use for a slab allocation.
>> + * We try to use the current memcg's version of the cache.
>> + *
>> + * If the cache does not exist yet, if we are the first user of it,
>> + * we either create it immediately, if possible, or create it asynchronously
>> + * in a workqueue.
>> + * In the latter case, we will let the current allocation go through with
>> + * the original cache.
>> + *
>> + * Can't be called in interrupt context or from kernel threads.
>> + * This function needs to be called with rcu_read_lock() held.
>> + */
>> +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
>> +  gfp_t gfp)
>> +{
>> +struct mem_cgroup *memcg;
>> +int idx;
>> +struct task_struct *p;
>> +
>> +if (cachep->memcg_params.memcg)
>> +return cachep;
>> +
>> +idx = cachep->memcg_params.id;
>> +VM_BUG_ON(idx == -1);
>> +
>> +rcu_read_lock();
>> +p = rcu_dereference(current->mm->owner);
>> +memcg = mem_cgroup_from_task(p);
>> +rcu_read_unlock();
>> +
>> +if (!memcg_can_account_kmem(memcg))
>> +return cachep;
>> +
>> +if (memcg->slabs[idx] == NULL) {
>> +memcg_create_cache_enqueue(memcg, cachep);
> 
> Do we want to wait for the work item if @gfp allows?
> 

I tried this once, and it got complicated enough that I deemed as "not
worth it". I honestly don't remember much of the details now, it was one
of the first things I tried, and a bunch of time has passed. If you
think it is absolutely worth it, I can try it again. But at the very
best, I view this as an optimization.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-24 Thread Tejun Heo
Hello, Glauber.

On Mon, Sep 24, 2012 at 12:46:35PM +0400, Glauber Costa wrote:
> >> +#ifdef CONFIG_MEMCG_KMEM
> >> +  /* Slab accounting */
> >> +  struct kmem_cache *slabs[MAX_KMEM_CACHE_TYPES];
> >> +#endif
> > 
> > Bah, 400 entry array in struct mem_cgroup.  Can't we do something a
> > bit more flexible?
> > 
> 
> I guess. I still would like it to be an array, so we can easily access
> its fields. There are two ways around this:
> 
> 1) Do like the events mechanism and allocate this in a separate
> structure. Add a pointer chase in the access, and I don't think it helps
> much because it gets allocated anyway. But we could at least
> defer it to the time when we limit the cache.

Start at some reasonable size and then double it as usage grows?  How
many kmem_caches do we typically end up using?

> >> +  if (memcg->slabs[idx] == NULL) {
> >> +  memcg_create_cache_enqueue(memcg, cachep);
> > 
> > Do we want to wait for the work item if @gfp allows?
> > 
> 
> I tried this once, and it got complicated enough that I deemed as "not
> worth it". I honestly don't remember much of the details now, it was one
> of the first things I tried, and a bunch of time has passed. If you
> think it is absolutely worth it, I can try it again. But at the very
> best, I view this as an optimization.

I don't know.  It seems like a logical thing to try and depends on how
complex it gets.  I don't think it's a must.  The whole thing is
somewhat opportunistic after all.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-24 Thread Tejun Heo
On Mon, Sep 24, 2012 at 12:17:37PM +0400, Glauber Costa wrote:
> On 09/22/2012 12:52 AM, Tejun Heo wrote:
> > Missed some stuff.
> > 
> > On Tue, Sep 18, 2012 at 06:12:00PM +0400, Glauber Costa wrote:
> >> +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup 
> >> *memcg,
> >> +struct kmem_cache *cachep)
> >> +{
> > ...
> >> +  memcg->slabs[idx] = new_cachep;
> > ...
> >> +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
> >> +gfp_t gfp)
> >> +{
> > ...
> >> +  return memcg->slabs[idx];
> > 
> > I think you need memory barriers for the above pair.
> > 
> > Thanks.
> > 
> 
> Why is that?
> 
> We'll either see a value, or NULL. If we see NULL, we assume the cache
> is not yet created. Not a big deal.

Because when you see !NULL cache pointer you want to be able to see
the cache fully initialized.  You need wmb between cache creation and
pointer assignment and at least read_barrier_depends() between
fetching the cache pointer and dereferencing it.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-25 Thread Glauber Costa
On 09/24/2012 09:56 PM, Tejun Heo wrote:
> Hello, Glauber.
> 
> On Mon, Sep 24, 2012 at 12:46:35PM +0400, Glauber Costa wrote:
 +#ifdef CONFIG_MEMCG_KMEM
 +  /* Slab accounting */
 +  struct kmem_cache *slabs[MAX_KMEM_CACHE_TYPES];
 +#endif
>>>
>>> Bah, 400 entry array in struct mem_cgroup.  Can't we do something a
>>> bit more flexible?
>>>
>>
>> I guess. I still would like it to be an array, so we can easily access
>> its fields. There are two ways around this:
>>
>> 1) Do like the events mechanism and allocate this in a separate
>> structure. Add a pointer chase in the access, and I don't think it helps
>> much because it gets allocated anyway. But we could at least
>> defer it to the time when we limit the cache.
> 
> Start at some reasonable size and then double it as usage grows?  How
> many kmem_caches do we typically end up using?
> 

So my Fedora box here, recently booted on a Fedora kernel, will have 111
caches. How would 150 sound to you?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 06/16] memcg: infrastructure to match an allocation to the right cache

2012-09-25 Thread Christoph Lameter
On Tue, 25 Sep 2012, Glauber Costa wrote:

> >> 1) Do like the events mechanism and allocate this in a separate
> >> structure. Add a pointer chase in the access, and I don't think it helps
> >> much because it gets allocated anyway. But we could at least
> >> defer it to the time when we limit the cache.
> >
> > Start at some reasonable size and then double it as usage grows?  How
> > many kmem_caches do we typically end up using?
> >
>
> So my Fedora box here, recently booted on a Fedora kernel, will have 111
> caches. How would 150 sound to you?

Some drivers/subsystems can dynamically create slabs as needed for new
devices or instances of metadata. You cannot use a fixed size
array and cannot establish an upper boundary for the number of slabs on
the system.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/