[PATCH -V5 RESEND 06/21] swap: Support PMD swap mapping when splitting huge PMD

2018-09-25 Thread Huang Ying
A huge PMD need to be split when zap a part of the PMD mapping etc.
If the PMD mapping is a swap mapping, we need to split it too.  This
patch implemented the support for this.  This is similar as splitting
the PMD page mapping, except we need to decrease the PMD swap mapping
count for the huge swap cluster too.  If the PMD swap mapping count
becomes 0, the huge swap cluster will be split.

Notice: is_huge_zero_pmd() and pmd_page() doesn't work well with swap
PMD, so pmd_present() check is called before them.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  4 
 include/linux/swap.h|  6 ++
 mm/huge_memory.c| 48 +++-
 mm/swapfile.c   | 32 
 4 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 99c19b06d9a4..0f3e1739986f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,10 @@ static inline bool is_huge_zero_page(struct page *page)
return READ_ONCE(huge_zero_page) == page;
 }
 
+/*
+ * is_huge_zero_pmd() must be called after checking pmd_present(),
+ * otherwise, it may report false positive for PMD swap entry.
+ */
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
return is_huge_zero_page(pmd_page(pmd));
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 88eb06eb1444..5bd54b6fd4a1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -618,11 +618,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 63edf18ca9cc..3ea7318fcdcd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1604,6 +1604,40 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+/* Convert a PMD swap mapping to a set of PTE swap mappings */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, haddr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2070,7 +2104,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2094,7 +2128,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
@@ -2138,6 +2172,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(old_pmd))
+   return __split_huge_swap_pmd(vma, haddr, pmd);
+   else
page = 

[PATCH -V5 RESEND 06/21] swap: Support PMD swap mapping when splitting huge PMD

2018-09-25 Thread Huang Ying
A huge PMD need to be split when zap a part of the PMD mapping etc.
If the PMD mapping is a swap mapping, we need to split it too.  This
patch implemented the support for this.  This is similar as splitting
the PMD page mapping, except we need to decrease the PMD swap mapping
count for the huge swap cluster too.  If the PMD swap mapping count
becomes 0, the huge swap cluster will be split.

Notice: is_huge_zero_pmd() and pmd_page() doesn't work well with swap
PMD, so pmd_present() check is called before them.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  4 
 include/linux/swap.h|  6 ++
 mm/huge_memory.c| 48 +++-
 mm/swapfile.c   | 32 
 4 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 99c19b06d9a4..0f3e1739986f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,10 @@ static inline bool is_huge_zero_page(struct page *page)
return READ_ONCE(huge_zero_page) == page;
 }
 
+/*
+ * is_huge_zero_pmd() must be called after checking pmd_present(),
+ * otherwise, it may report false positive for PMD swap entry.
+ */
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
return is_huge_zero_page(pmd_page(pmd));
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 88eb06eb1444..5bd54b6fd4a1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -618,11 +618,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 63edf18ca9cc..3ea7318fcdcd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1604,6 +1604,40 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+/* Convert a PMD swap mapping to a set of PTE swap mappings */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, haddr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2070,7 +2104,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2094,7 +2128,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
@@ -2138,6 +2172,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(old_pmd))
+   return __split_huge_swap_pmd(vma, haddr, pmd);
+   else
page = 

[PATCH -V5 RESEND 06/21] swap: Support PMD swap mapping when splitting huge PMD

2018-09-11 Thread Huang Ying
A huge PMD need to be split when zap a part of the PMD mapping etc.
If the PMD mapping is a swap mapping, we need to split it too.  This
patch implemented the support for this.  This is similar as splitting
the PMD page mapping, except we need to decrease the PMD swap mapping
count for the huge swap cluster too.  If the PMD swap mapping count
becomes 0, the huge swap cluster will be split.

Notice: is_huge_zero_pmd() and pmd_page() doesn't work well with swap
PMD, so pmd_present() check is called before them.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  4 
 include/linux/swap.h|  6 ++
 mm/huge_memory.c| 48 +++-
 mm/swapfile.c   | 32 
 4 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 99c19b06d9a4..0f3e1739986f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,10 @@ static inline bool is_huge_zero_page(struct page *page)
return READ_ONCE(huge_zero_page) == page;
 }
 
+/*
+ * is_huge_zero_pmd() must be called after checking pmd_present(),
+ * otherwise, it may report false positive for PMD swap entry.
+ */
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
return is_huge_zero_page(pmd_page(pmd));
diff --git a/include/linux/swap.h b/include/linux/swap.h
index db3e07a3d9bc..a2a3d85decd9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -618,11 +618,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c235ba78de68..b8b61a0879f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1609,6 +1609,40 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+/* Convert a PMD swap mapping to a set of PTE swap mappings */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, haddr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2075,7 +2109,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2099,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
@@ -2143,6 +2177,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(old_pmd))
+   return __split_huge_swap_pmd(vma, haddr, pmd);
+   else
page = 

[PATCH -V5 RESEND 06/21] swap: Support PMD swap mapping when splitting huge PMD

2018-09-11 Thread Huang Ying
A huge PMD need to be split when zap a part of the PMD mapping etc.
If the PMD mapping is a swap mapping, we need to split it too.  This
patch implemented the support for this.  This is similar as splitting
the PMD page mapping, except we need to decrease the PMD swap mapping
count for the huge swap cluster too.  If the PMD swap mapping count
becomes 0, the huge swap cluster will be split.

Notice: is_huge_zero_pmd() and pmd_page() doesn't work well with swap
PMD, so pmd_present() check is called before them.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  4 
 include/linux/swap.h|  6 ++
 mm/huge_memory.c| 48 +++-
 mm/swapfile.c   | 32 
 4 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 99c19b06d9a4..0f3e1739986f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,10 @@ static inline bool is_huge_zero_page(struct page *page)
return READ_ONCE(huge_zero_page) == page;
 }
 
+/*
+ * is_huge_zero_pmd() must be called after checking pmd_present(),
+ * otherwise, it may report false positive for PMD swap entry.
+ */
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
return is_huge_zero_page(pmd_page(pmd));
diff --git a/include/linux/swap.h b/include/linux/swap.h
index db3e07a3d9bc..a2a3d85decd9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -618,11 +618,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c235ba78de68..b8b61a0879f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1609,6 +1609,40 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+/* Convert a PMD swap mapping to a set of PTE swap mappings */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, haddr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2075,7 +2109,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2099,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
@@ -2143,6 +2177,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(old_pmd))
+   return __split_huge_swap_pmd(vma, haddr, pmd);
+   else
page =