[PATCH -V6 09/21] swap: Swapin a THP in one piece
With this patch, when page fault handler find a PMD swap mapping, it will swap in a THP in one piece. This avoids the overhead of splitting/collapsing before/after the THP swapping. And improves the swap performance greatly for reduced page fault count etc. do_huge_pmd_swap_page() is added in the patch to implement this. It is similar to do_swap_page() for normal page swapin. If failing to allocate a THP, the huge swap cluster and the PMD swap mapping will be split to fallback to normal page swapin. If the huge swap cluster has been split already, the PMD swap mapping will be split to fallback to normal page swapin. Signed-off-by: "Huang, Ying" Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dave Hansen Cc: Naoya Horiguchi Cc: Zi Yan Cc: Daniel Jordan --- include/linux/huge_mm.h | 9 +++ mm/huge_memory.c| 174 mm/memory.c | 16 +++-- 3 files changed, 193 insertions(+), 6 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a0e7f4f9c12b..d88579cb059a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -373,4 +373,13 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifdef CONFIG_THP_SWAP +extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd); +#else /* CONFIG_THP_SWAP */ +static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd) +{ + return 0; +} +#endif /* CONFIG_THP_SWAP */ + #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a025494dd828..fbc9c9e30992 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -1659,6 +1661,178 @@ static void __split_huge_swap_pmd(struct vm_area_struct *vma, pmd_populate(mm, pmd, pgtable); } +#ifdef CONFIG_THP_SWAP +static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, pmd_t orig_pmd) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_lock(mm, pmd); + if (pmd_same(*pmd, orig_pmd)) + __split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd); + else + ret = -ENOENT; + spin_unlock(ptl); + + return ret; +} + +int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd) +{ + struct page *page; + struct mem_cgroup *memcg; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + swp_entry_t entry; + pmd_t pmd; + int i, locked, exclusive = 0, ret = 0; + + entry = pmd_to_swp_entry(orig_pmd); + VM_BUG_ON(non_swap_entry(entry)); + delayacct_set_flag(DELAYACCT_PF_SWAPIN); +retry: + page = lookup_swap_cache(entry, NULL, vmf->address); + if (!page) { + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, +haddr, false); + if (!page) { + /* +* Back out if somebody else faulted in this pmd +* while we released the pmd lock. +*/ + if (likely(pmd_same(*vmf->pmd, orig_pmd))) { + /* +* Failed to allocate huge page, split huge swap +* cluster, and fallback to swapin normal page +*/ + ret = split_swap_cluster(entry, 0); + /* Somebody else swapin the swap entry, retry */ + if (ret == -EEXIST) { + ret = 0; + goto retry; + /* swapoff occurs under us */ + } else if (ret == -EINVAL) + ret = 0; + else + goto fallback; + } + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto out; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); + } else if (!PageTransCompound(page)) + goto fallback; + + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); + + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release;
[PATCH -V6 09/21] swap: Swapin a THP in one piece
With this patch, when page fault handler find a PMD swap mapping, it will swap in a THP in one piece. This avoids the overhead of splitting/collapsing before/after the THP swapping. And improves the swap performance greatly for reduced page fault count etc. do_huge_pmd_swap_page() is added in the patch to implement this. It is similar to do_swap_page() for normal page swapin. If failing to allocate a THP, the huge swap cluster and the PMD swap mapping will be split to fallback to normal page swapin. If the huge swap cluster has been split already, the PMD swap mapping will be split to fallback to normal page swapin. Signed-off-by: "Huang, Ying" Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dave Hansen Cc: Naoya Horiguchi Cc: Zi Yan Cc: Daniel Jordan --- include/linux/huge_mm.h | 9 +++ mm/huge_memory.c| 174 mm/memory.c | 16 +++-- 3 files changed, 193 insertions(+), 6 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a0e7f4f9c12b..d88579cb059a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -373,4 +373,13 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifdef CONFIG_THP_SWAP +extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd); +#else /* CONFIG_THP_SWAP */ +static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd) +{ + return 0; +} +#endif /* CONFIG_THP_SWAP */ + #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a025494dd828..fbc9c9e30992 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -1659,6 +1661,178 @@ static void __split_huge_swap_pmd(struct vm_area_struct *vma, pmd_populate(mm, pmd, pgtable); } +#ifdef CONFIG_THP_SWAP +static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, pmd_t orig_pmd) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_lock(mm, pmd); + if (pmd_same(*pmd, orig_pmd)) + __split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd); + else + ret = -ENOENT; + spin_unlock(ptl); + + return ret; +} + +int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd) +{ + struct page *page; + struct mem_cgroup *memcg; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + swp_entry_t entry; + pmd_t pmd; + int i, locked, exclusive = 0, ret = 0; + + entry = pmd_to_swp_entry(orig_pmd); + VM_BUG_ON(non_swap_entry(entry)); + delayacct_set_flag(DELAYACCT_PF_SWAPIN); +retry: + page = lookup_swap_cache(entry, NULL, vmf->address); + if (!page) { + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, +haddr, false); + if (!page) { + /* +* Back out if somebody else faulted in this pmd +* while we released the pmd lock. +*/ + if (likely(pmd_same(*vmf->pmd, orig_pmd))) { + /* +* Failed to allocate huge page, split huge swap +* cluster, and fallback to swapin normal page +*/ + ret = split_swap_cluster(entry, 0); + /* Somebody else swapin the swap entry, retry */ + if (ret == -EEXIST) { + ret = 0; + goto retry; + /* swapoff occurs under us */ + } else if (ret == -EINVAL) + ret = 0; + else + goto fallback; + } + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto out; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); + } else if (!PageTransCompound(page)) + goto fallback; + + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); + + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release;