On Fri, 20 Apr 2018, Michal Hocko wrote:

> diff --git a/mm/mmap.c b/mm/mmap.c
> index faf85699f1a1..216efa6d9f61 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3004,10 +3004,21 @@ void exit_mmap(struct mm_struct *mm)
>       struct mmu_gather tlb;
>       struct vm_area_struct *vma;
>       unsigned long nr_accounted = 0;
> +     bool locked = false;
>  
>       /* mm's last user has gone, and its about to be pulled down */
>       mmu_notifier_release(mm);
>  
> +     /*
> +      * The mm is not accessible for anybody except for the oom reaper
> +      * which cannot race with munlocking so make sure we exclude the
> +      * two.
> +      */
> +     if (unlikely(mm_is_oom_victim(mm))) {
> +             down_write(&mm->mmap_sem);
> +             locked = true;
> +     }
> +
>       if (mm->locked_vm) {
>               vma = mm->mmap;
>               while (vma) {
> @@ -3021,7 +3032,7 @@ void exit_mmap(struct mm_struct *mm)
>  
>       vma = mm->mmap;
>       if (!vma)       /* Can happen if dup_mmap() received an OOM */
> -             return;
> +             goto out_unlock;
>  
>       lru_add_drain();
>       flush_cache_mm(mm);
> @@ -3030,23 +3041,6 @@ void exit_mmap(struct mm_struct *mm)
>       /* Use -1 here to ensure all VMAs in the mm are unmapped */
>       unmap_vmas(&tlb, vma, 0, -1);
>  
> -     if (unlikely(mm_is_oom_victim(mm))) {
> -             /*
> -              * Wait for oom_reap_task() to stop working on this
> -              * mm. Because MMF_OOM_SKIP is already set before
> -              * calling down_read(), oom_reap_task() will not run
> -              * on this "mm" post up_write().
> -              *
> -              * mm_is_oom_victim() cannot be set from under us
> -              * either because victim->mm is already set to NULL
> -              * under task_lock before calling mmput and oom_mm is
> -              * set not NULL by the OOM killer only if victim->mm
> -              * is found not NULL while holding the task_lock.
> -              */
> -             set_bit(MMF_OOM_SKIP, &mm->flags);
> -             down_write(&mm->mmap_sem);
> -             up_write(&mm->mmap_sem);
> -     }
>       free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
>       tlb_finish_mmu(&tlb, 0, -1);
>  
> @@ -3060,6 +3054,12 @@ void exit_mmap(struct mm_struct *mm)
>               vma = remove_vma(vma);
>       }
>       vm_unacct_memory(nr_accounted);
> +
> +out_unlock:
> +     if (unlikely(locked)) {
> +             set_bit(MMF_OOM_SKIP, &mm->flags);
> +             up_write(&mm->mmap_sem);
> +     }
>  }
>  
>  /* Insert vm structure into process list sorted by address

How have you tested this?

I'm wondering why you do not see oom killing of many processes if the 
victim is a very large process that takes a long time to free memory in 
exit_mmap() as I do because the oom reaper gives up trying to acquire 
mm->mmap_sem and just sets MMF_OOM_SKIP itself.

Reply via email to