On Fri, 20 Apr 2018, Michal Hocko wrote: > diff --git a/mm/mmap.c b/mm/mmap.c > index faf85699f1a1..216efa6d9f61 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -3004,10 +3004,21 @@ void exit_mmap(struct mm_struct *mm) > struct mmu_gather tlb; > struct vm_area_struct *vma; > unsigned long nr_accounted = 0; > + bool locked = false; > > /* mm's last user has gone, and its about to be pulled down */ > mmu_notifier_release(mm); > > + /* > + * The mm is not accessible for anybody except for the oom reaper > + * which cannot race with munlocking so make sure we exclude the > + * two. > + */ > + if (unlikely(mm_is_oom_victim(mm))) { > + down_write(&mm->mmap_sem); > + locked = true; > + } > + > if (mm->locked_vm) { > vma = mm->mmap; > while (vma) { > @@ -3021,7 +3032,7 @@ void exit_mmap(struct mm_struct *mm) > > vma = mm->mmap; > if (!vma) /* Can happen if dup_mmap() received an OOM */ > - return; > + goto out_unlock; > > lru_add_drain(); > flush_cache_mm(mm); > @@ -3030,23 +3041,6 @@ void exit_mmap(struct mm_struct *mm) > /* Use -1 here to ensure all VMAs in the mm are unmapped */ > unmap_vmas(&tlb, vma, 0, -1); > > - if (unlikely(mm_is_oom_victim(mm))) { > - /* > - * Wait for oom_reap_task() to stop working on this > - * mm. Because MMF_OOM_SKIP is already set before > - * calling down_read(), oom_reap_task() will not run > - * on this "mm" post up_write(). > - * > - * mm_is_oom_victim() cannot be set from under us > - * either because victim->mm is already set to NULL > - * under task_lock before calling mmput and oom_mm is > - * set not NULL by the OOM killer only if victim->mm > - * is found not NULL while holding the task_lock. > - */ > - set_bit(MMF_OOM_SKIP, &mm->flags); > - down_write(&mm->mmap_sem); > - up_write(&mm->mmap_sem); > - } > free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); > tlb_finish_mmu(&tlb, 0, -1); > > @@ -3060,6 +3054,12 @@ void exit_mmap(struct mm_struct *mm) > vma = remove_vma(vma); > } > vm_unacct_memory(nr_accounted); > + > +out_unlock: > + if (unlikely(locked)) { > + set_bit(MMF_OOM_SKIP, &mm->flags); > + up_write(&mm->mmap_sem); > + } > } > > /* Insert vm structure into process list sorted by address
How have you tested this? I'm wondering why you do not see oom killing of many processes if the victim is a very large process that takes a long time to free memory in exit_mmap() as I do because the oom reaper gives up trying to acquire mm->mmap_sem and just sets MMF_OOM_SKIP itself.