On Thu 21-04-16 19:09:02, Tejun Heo wrote:
> Hello,
> 
> So, this ended up a lot simpler than I originally expected.  I tested
> it lightly and it seems to work fine.  Petr, can you please test these
> two patches w/o the lru drain drop patch and see whether the problem
> is gone?
> 
> Thanks.
> ------ 8< ------
> If charge moving is used, memcg performs relabeling of the affected
> pages from its ->attach callback which is called under both
> cgroup_threadgroup_rwsem and thus can't create new kthreads.  This is
> fragile as various operations may depend on workqueues making forward
> progress which relies on the ability to create new kthreads.
> 
> There's no reason to perform charge moving from ->attach which is deep
> in the task migration path.  Move it to ->post_attach which is called
> after the actual migration is finished and cgroup_threadgroup_rwsem is
> dropped.
> 
> * move_charge_struct->mm is added and ->can_attach is now responsible
>   for pinning and recording the target mm.  mem_cgroup_clear_mc() is
>   updated accordingly.  This also simplifies mem_cgroup_move_task().
> 
> * mem_cgroup_move_task() is now called from ->post_attach instead of
>   ->attach.

This looks much better than the previous quick&dirty workaround. Thanks
for taking an extra step!

Sorry for being so pushy but shouldn't we at least document those
callbacks which depend on cgroup_threadgroup_rwsem to never ever block
on WQ directly or indirectly. Maybe even enforce they have to be
non-blocking. This would be out of scope of this particular patch of
course but it would fit nicely into the series.
 
> Signed-off-by: Tejun Heo <t...@kernel.org>
> Cc: Johannes Weiner <han...@cmpxchg.org>
> Cc: Michal Hocko <mho...@kernel.org>
> Debugged-by: Petr Mladek <pmla...@suse.com>
> Reported-by: Cyril Hrubis <chru...@suse.cz>
> Reported-by: Johannes Weiner <han...@cmpxchg.org>
> Fixes: 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with 
> a global percpu_rwsem")
> Cc: <sta...@vger.kernel.org> # 4.4+

Acked-by: Michal Hocko <mho...@suse.com>

Thanks!

> ---
>  mm/memcontrol.c |   37 +++++++++++++++++++------------------
>  1 file changed, 19 insertions(+), 18 deletions(-)
> 
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct
>  /* "mc" and its members are protected by cgroup_mutex */
>  static struct move_charge_struct {
>       spinlock_t        lock; /* for from, to */
> +     struct mm_struct  *mm;
>       struct mem_cgroup *from;
>       struct mem_cgroup *to;
>       unsigned long flags;
> @@ -4667,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void)
>  
>  static void mem_cgroup_clear_mc(void)
>  {
> +     struct mm_struct *mm = mc.mm;
> +
>       /*
>        * we must clear moving_task before waking up waiters at the end of
>        * task migration.
> @@ -4676,7 +4679,10 @@ static void mem_cgroup_clear_mc(void)
>       spin_lock(&mc.lock);
>       mc.from = NULL;
>       mc.to = NULL;
> +     mc.mm = NULL;
>       spin_unlock(&mc.lock);
> +
> +     mmput(mm);
>  }
>  
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4733,6 +4739,7 @@ static int mem_cgroup_can_attach(struct
>               VM_BUG_ON(mc.moved_swap);
>  
>               spin_lock(&mc.lock);
> +             mc.mm = mm;
>               mc.from = from;
>               mc.to = memcg;
>               mc.flags = move_flags;
> @@ -4742,8 +4749,9 @@ static int mem_cgroup_can_attach(struct
>               ret = mem_cgroup_precharge_mc(mm);
>               if (ret)
>                       mem_cgroup_clear_mc();
> +     } else {
> +             mmput(mm);
>       }
> -     mmput(mm);
>       return ret;
>  }
>  
> @@ -4852,11 +4860,11 @@ put:                  /* get_mctgt_type() gets the 
> page
>       return ret;
>  }
>  
> -static void mem_cgroup_move_charge(struct mm_struct *mm)
> +static void mem_cgroup_move_charge(void)
>  {
>       struct mm_walk mem_cgroup_move_charge_walk = {
>               .pmd_entry = mem_cgroup_move_charge_pte_range,
> -             .mm = mm,
> +             .mm = mc.mm,
>       };
>  
>       lru_add_drain_all();
> @@ -4868,7 +4876,7 @@ static void mem_cgroup_move_charge(struc
>       atomic_inc(&mc.from->moving_account);
>       synchronize_rcu();
>  retry:
> -     if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
> +     if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
>               /*
>                * Someone who are holding the mmap_sem might be waiting in
>                * waitq. So we cancel all extra charges, wake up all waiters,
> @@ -4885,23 +4893,16 @@ retry:
>        * additional charge, the page walk just aborts.
>        */
>       walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
> -     up_read(&mm->mmap_sem);
> +     up_read(&mc.mm->mmap_sem);
>       atomic_dec(&mc.from->moving_account);
>  }
>  
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
> -     struct cgroup_subsys_state *css;
> -     struct task_struct *p = cgroup_taskset_first(tset, &css);
> -     struct mm_struct *mm = get_task_mm(p);
> -
> -     if (mm) {
> -             if (mc.to)
> -                     mem_cgroup_move_charge(mm);
> -             mmput(mm);
> -     }
> -     if (mc.to)
> +     if (mc.to) {
> +             mem_cgroup_move_charge();
>               mem_cgroup_clear_mc();
> +     }
>  }
>  #else        /* !CONFIG_MMU */
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4911,7 +4912,7 @@ static int mem_cgroup_can_attach(struct
>  static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
>  {
>  }
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
>  }
>  #endif
> @@ -5195,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys
>       .css_reset = mem_cgroup_css_reset,
>       .can_attach = mem_cgroup_can_attach,
>       .cancel_attach = mem_cgroup_cancel_attach,
> -     .attach = mem_cgroup_move_task,
> +     .post_attach = mem_cgroup_move_task,
>       .bind = mem_cgroup_bind,
>       .dfl_cftypes = memory_files,
>       .legacy_cftypes = mem_cgroup_legacy_files,

-- 
Michal Hocko
SUSE Labs

Reply via email to