> 
> From: Ebru Akagunduz <ebru.akagun...@gmail.com>
> 
> Currently khugepaged makes swapin readahead under down_write.  This patch
> supplies to make swapin readahead under down_read instead of down_write.
> 
> The patch was tested with a test program that allocates 800MB of memory,
> writes to it, and then sleeps.  The system was forced to swap out all.
> Afterwards, the test program touches the area by writing, it skips a page
> in each 20 pages of the area.
> 
> Link: 
> http://lkml.kernel.org/r/1464335964-6510-4-git-send-email-ebru.akagun...@gmail.com
> Signed-off-by: Ebru Akagunduz <ebru.akagun...@gmail.com>
> Cc: Hugh Dickins <hu...@google.com>
> Cc: Rik van Riel <r...@redhat.com>
> Cc: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
> Cc: Naoya Horiguchi <n-horigu...@ah.jp.nec.com>
> Cc: Andrea Arcangeli <aarca...@redhat.com>
> Cc: Joonsoo Kim <iamjoonsoo....@lge.com>
> Cc: Cyrill Gorcunov <gorcu...@openvz.org>
> Cc: Mel Gorman <mgor...@suse.de>
> Cc: David Rientjes <rient...@google.com>
> Cc: Vlastimil Babka <vba...@suse.cz>
> Cc: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
> Cc: Johannes Weiner <han...@cmpxchg.org>
> Cc: Michal Hocko <mho...@suse.cz>
> Cc: Minchan Kim <minchan....@gmail.com>
> Signed-off-by: Andrew Morton <a...@linux-foundation.org>
> ---
>  mm/huge_memory.c | 92 
> ++++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 63 insertions(+), 29 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f2bc57c45d2f..96dfe3f09bf6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct 
> *vma)
>  }
> 
>  /*
> + * If mmap_sem temporarily dropped, revalidate vma
> + * before taking mmap_sem.

See below

> + * Return 0 if succeeds, otherwise return none-zero
> + * value (scan code).
> + */
> +
> +static int hugepage_vma_revalidate(struct mm_struct *mm,
> +                                struct vm_area_struct *vma,
> +                                unsigned long address)
> +{
> +     unsigned long hstart, hend;
> +
> +     if (unlikely(khugepaged_test_exit(mm)))
> +             return SCAN_ANY_PROCESS;
> +
> +     vma = find_vma(mm, address);
> +     if (!vma)
> +             return SCAN_VMA_NULL;
> +
> +     hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> +     hend = vma->vm_end & HPAGE_PMD_MASK;
> +     if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> +             return SCAN_ADDRESS_RANGE;
> +     if (!hugepage_vma_check(vma))
> +             return SCAN_VMA_CHECK;
> +     return 0;
> +}
> +
> +/*
>   * Bring missing pages in from swap, to complete THP collapse.
>   * Only done if khugepaged_scan_pmd believes it is worthwhile.
>   *
> @@ -2385,7 +2414,7 @@ static bool hugepage_vma_check(struct vm_area_struct 
> *vma)
>   * but with mmap_sem held to protect against vma changes.
>   */
> 
> -static void __collapse_huge_page_swapin(struct mm_struct *mm,
> +static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>                                       struct vm_area_struct *vma,
>                                       unsigned long address, pmd_t *pmd)
>  {
> @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct 
> mm_struct *mm,
>                       continue;
>               swapped_in++;
>               ret = do_swap_page(mm, vma, _address, pte, pmd,
> -                                
> FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> +                                FAULT_FLAG_ALLOW_RETRY,

Add a description in change log for it please.

>                                  pteval);
> +             /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
> +             if (ret & VM_FAULT_RETRY) {
> +                     down_read(&mm->mmap_sem);
> +                     /* vma is no longer available, don't continue to swapin 
> */
> +                     if (hugepage_vma_revalidate(mm, vma, address))
> +                             return false;

Revalidate vma _after_ acquiring mmap_sem, but the above comment says _before_.

> +             }
>               if (ret & VM_FAULT_ERROR) {
>                       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
> -                     return;
> +                     return false;
>               }
>               /* pte is unmapped now, we need to map it */
>               pte = pte_offset_map(pmd, _address);
> @@ -2413,6 +2449,7 @@ static void __collapse_huge_page_swapin(struct 
> mm_struct *mm,
>       pte--;
>       pte_unmap(pte);
>       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
> +     return true;
>  }
> 
>  static void collapse_huge_page(struct mm_struct *mm,
> @@ -2427,7 +2464,6 @@ static void collapse_huge_page(struct mm_struct *mm,
>       struct page *new_page;
>       spinlock_t *pmd_ptl, *pte_ptl;
>       int isolated = 0, result = 0;
> -     unsigned long hstart, hend;
>       struct mem_cgroup *memcg;
>       unsigned long mmun_start;       /* For mmu_notifiers */
>       unsigned long mmun_end;         /* For mmu_notifiers */
> @@ -2450,39 +2486,37 @@ static void collapse_huge_page(struct mm_struct *mm,
>               goto out_nolock;
>       }
> 
> -     /*
> -      * Prevent all access to pagetables with the exception of
> -      * gup_fast later hanlded by the ptep_clear_flush and the VM
> -      * handled by the anon_vma lock + PG_lock.
> -      */
> -     down_write(&mm->mmap_sem);
> -     if (unlikely(khugepaged_test_exit(mm))) {
> -             result = SCAN_ANY_PROCESS;
> +     down_read(&mm->mmap_sem);
> +     result = hugepage_vma_revalidate(mm, vma, address);
> +     if (result)
>               goto out;
> -     }
> 
> -     vma = find_vma(mm, address);
> -     if (!vma) {
> -             result = SCAN_VMA_NULL;
> -             goto out;
> -     }
> -     hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> -     hend = vma->vm_end & HPAGE_PMD_MASK;
> -     if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
> -             result = SCAN_ADDRESS_RANGE;
> -             goto out;
> -     }
> -     if (!hugepage_vma_check(vma)) {
> -             result = SCAN_VMA_CHECK;
> -             goto out;
> -     }
>       pmd = mm_find_pmd(mm, address);
>       if (!pmd) {
>               result = SCAN_PMD_NULL;
>               goto out;
>       }
> 
> -     __collapse_huge_page_swapin(mm, vma, address, pmd);
> +     /*
> +      * __collapse_huge_page_swapin always returns with mmap_sem
> +      * locked. If it fails, release mmap_sem and jump directly
> +      * label out. Continuing to collapse causes inconsistency.
> +      */
> +     if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
> +             up_read(&mm->mmap_sem);
> +             goto out;

Jump out with mmap_sem released, 

> +     }
> +
> +     up_read(&mm->mmap_sem);
> +     /*
> +      * Prevent all access to pagetables with the exception of
> +      * gup_fast later handled by the ptep_clear_flush and the VM
> +      * handled by the anon_vma lock + PG_lock.
> +      */
> +     down_write(&mm->mmap_sem);
> +     result = hugepage_vma_revalidate(mm, vma, address);
> +     if (result)
> +             goto out;

but jump out again with mmap_sem held.

They are cleaned up in subsequent darns?

> 
>       anon_vma_lock_write(vma->anon_vma);
> 
> --
> 2.8.1


Reply via email to