[PATCH -V5 20/21] swap: create PMD swap mapping when unmap the THP

2018-09-03 Thread Huang Ying
This is the final step of the THP swapin support.  When reclaiming an
anonymous THP, after allocating a huge swap cluster and adding the THP
into the swap cache, the PMD page mapping is changed to a mapping to
the swap space.  Previously, the PMD page mapping was split before
being changed.  In this patch, the unmap code is enhanced so that
instead of splitting the PMD mapping, it creates a PMD swap mapping to
replace it.  So later, when the SWAP_HAS_CACHE flag is cleared in the
last step of swapout, the huge swap cluster is kept instead of being
split, and on swapin the huge swap cluster is read in one piece into a
THP.  That is, the THP is not split during swapout/swapin.  This
eliminates the overhead of splitting/collapsing, reduces the page
fault count, etc.  More importantly, THP utilization improves greatly:
many more THPs are kept when swapping is used, so that we can take
full advantage of THP, including its high swapout/swapin performance.
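
The mm/rmap.c hunk below is truncated in this archive.  As a rough,
hypothetical illustration only (not the literal patch hunk), the unmap
path is expected to use the new helper roughly like this, with the
helper restoring the PMD and the caller falling back to splitting when
a swap reference cannot be taken:

	/* Sketch of the try_to_unmap_one() change; pvmw.pmd set and
	 * pvmw.pte clear means the THP is mapped by a PMD here. */
	if (IS_ENABLED(CONFIG_THP_SWAP) && PageSwapCache(page) && !pvmw.pte) {
		pmd_t pmdval = pmdp_huge_clear_flush(vma, address, pvmw.pmd);

		if (set_pmd_swap_entry(&pvmw, page, address, pmdval))
			continue;	/* PMD swap mapping installed */
		/* swap_duplicate() failed: set_pmd_swap_entry() restored
		 * the PMD, fall back to splitting the huge PMD mapping. */
	}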

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h | 11 +++
 mm/huge_memory.c| 30 ++
 mm/rmap.c   | 43 ++-
 mm/vmscan.c |  6 +-
 4 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6586c1bfac21..8cbce31bc090 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -405,6 +405,8 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+struct page_vma_mapped_walk;
+
 #ifdef CONFIG_THP_SWAP
 extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
  unsigned long haddr,
@@ -412,6 +414,8 @@ extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+extern bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+   struct page *page, unsigned long address, pmd_t pmdval);
 
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
@@ -453,6 +457,13 @@ static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
return 0;
 }
 
+static inline bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page, unsigned long address,
+ pmd_t pmdval)
+{
+   return false;
+}
+
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c825f470d58a..a86cdcab2627 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1884,6 +1884,36 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
count_vm_event(THP_SWPIN_FALLBACK);
goto fallback;
 }
+
+bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw, struct page *page,
+   unsigned long address, pmd_t pmdval)
+{
+   struct vm_area_struct *vma = pvmw->vma;
+   struct mm_struct *mm = vma->vm_mm;
+   pmd_t swp_pmd;
+   swp_entry_t entry = { .val = page_private(page) };
+
+   if (swap_duplicate(&entry, HPAGE_PMD_NR) < 0) {
+   set_pmd_at(mm, address, pvmw->pmd, pmdval);
+   return false;
+   }
+   if (list_empty(&mm->mmlist)) {
+   spin_lock(&mmlist_lock);
+   if (list_empty(&mm->mmlist))
+   list_add(&mm->mmlist, &init_mm.mmlist);
+   spin_unlock(&mmlist_lock);
+   }
+   add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+   add_mm_counter(mm, MM_SWAPENTS, HPAGE_PMD_NR);
+   swp_pmd = swp_entry_to_pmd(entry);
+   if (pmd_soft_dirty(pmdval))
+   swp_pmd = pmd_swp_mksoft_dirty(swp_pmd);
+   set_pmd_at(mm, address, pvmw->pmd, swp_pmd);
+
+   page_remove_rmap(page, true);
+   put_page(page);
+   return true;
+}
 #endif
 
 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
diff --git a/mm/rmap.c b/mm/rmap.c
index 3bb4be720bc0..a180cb1fe2db 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1413,11 +1413,52 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
continue;
}
 
+   address = pvmw.address;
+
+#ifdef CONFIG_THP_SWAP
+   /* PMD-mapped THP swap entry */
+   if (IS_ENABLED(CONFIG_THP_SWAP) &&

[PATCH -V5 13/21] swap: Support PMD swap mapping in madvise_free()

2018-09-03 Thread Huang Ying
When madvise_free() finds a PMD swap mapping and only part of the huge
swap cluster is operated on, the PMD swap mapping is split and
processing falls back to PTE swap mappings.  Otherwise, if the whole
huge swap cluster is operated on, free_swap_and_cache() is called to
decrease the PMD swap mapping count and possibly free the swap space
and the THP in the swap cache too.
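
For illustration only (not part of the patch), from user space the two
cases look like this, assuming addr is the 2MB-aligned start of a range
backed by a PMD swap mapping:

	#include <sys/mman.h>

	#define HPAGE_SIZE	(2UL << 20)	/* 2MB PMD size on x86_64 */

	/* Covers the whole huge swap cluster: handled as one PMD swap
	 * mapping via free_swap_and_cache(). */
	madvise(addr, HPAGE_SIZE, MADV_FREE);

	/* Covers only part of it: the PMD swap mapping is split and the
	 * range is processed as PTE swap mappings. */
	madvise(addr, HPAGE_SIZE / 2, MADV_FREE);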

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 54 +++---
 mm/madvise.c |  2 +-
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4b7ee510a9fe..656e760d19e2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1844,6 +1844,15 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
 }
 #endif
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+   pgtable_t pgtable;
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pte_free(mm, pgtable);
+   mm_dec_nr_ptes(mm);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -1864,15 +1873,39 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out_unlocked;
 
orig_pmd = *pmd;
-   if (is_huge_zero_pmd(orig_pmd))
-   goto out;
-
if (unlikely(!pmd_present(orig_pmd))) {
-   VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
-   goto out;
+   swp_entry_t entry = pmd_to_swp_entry(orig_pmd);
+
+   if (is_migration_entry(entry)) {
+   VM_BUG_ON(!thp_migration_supported());
+   goto out;
+   } else if (IS_ENABLED(CONFIG_THP_SWAP) &&
+  !non_swap_entry(entry)) {
+   /*
+* If part of THP is discarded, split the PMD
+* swap mapping and operate on the PTEs
+*/
+   if (next - addr != HPAGE_PMD_SIZE) {
+   unsigned long haddr = addr & HPAGE_PMD_MASK;
+
+   __split_huge_swap_pmd(vma, haddr, pmd);
+   goto out;
+   }
+   free_swap_and_cache(entry, HPAGE_PMD_NR);
+   pmd_clear(pmd);
+   zap_deposited_table(mm, pmd);
+   if (current->mm == mm)
+   sync_mm_rss(mm);
+   add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+   ret = true;
+   goto out;
+   } else
+   VM_BUG_ON(1);
}
 
+   if (is_huge_zero_pmd(orig_pmd))
+   goto out;
+
page = pmd_page(orig_pmd);
/*
 * If other processes are mapping this page, we couldn't discard
@@ -1918,15 +1951,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
 }
 
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
-{
-   pgtable_t pgtable;
-
-   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-   pte_free(mm, pgtable);
-   mm_dec_nr_ptes(mm);
-}
-
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 pmd_t *pmd, unsigned long addr)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index 50282ba862e2..20101ff125d0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,7 +321,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long next;
 
next = pmd_addr_end(addr, end);
-   if (pmd_trans_huge(*pmd))
+   if (pmd_trans_huge(*pmd) || is_swap_pmd(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
goto next;
 
-- 
2.16.4



[PATCH -V5 07/21] swap: Support PMD swap mapping in split_swap_cluster()

2018-09-03 Thread Huang Ying
When splitting a THP in the swap cache, or when failing to allocate a
THP while swapping in a huge swap cluster, the huge swap cluster will
be split.  In addition to clearing the huge flag of the swap cluster,
the PMD swap mapping count recorded in cluster_count() will be set to
0.  But we will not touch the PMD swap mappings themselves, because it
is sometimes hard to find them all.  When the PMD swap mappings are
operated on later, it will be found that the huge swap cluster has
been split, and the PMD swap mappings will be split at that time.

Unless splitting a THP in the swap cache (specified via the "force"
parameter), split_swap_cluster() will return -EEXIST if the
SWAP_HAS_CACHE flag is set in swap_map[offset], because this indicates
that a THP corresponds to this huge swap cluster and it isn't
desirable to split that THP.

When splitting a THP in the swap cache, the call to
split_swap_cluster() is moved to before unlocking the sub-pages, so
that all sub-pages are kept locked from the time the THP is split
until the huge swap cluster is split.  This makes the code much easier
to reason about.
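
The new split_swap_cluster() body is truncated in the mm/swapfile.c
hunk below.  As a hedged sketch of the behavior described above (not
the literal patch code; the locking and the cluster_count() handling
are simplified here), with the "force" case passed as SSC_SPLIT_CACHED:

	int split_swap_cluster(swp_entry_t entry, unsigned long flags)
	{
		struct swap_info_struct *si;
		struct swap_cluster_info *ci;
		unsigned long offset = swp_offset(entry);
		int ret = 0;

		si = _swap_info_get(entry);
		if (!si)
			return -EBUSY;
		ci = lock_cluster(si, offset);
		/* Refuse to split while a THP still backs this cluster in
		 * the swap cache, unless we are splitting that THP. */
		if (!(flags & SSC_SPLIT_CACHED) &&
		    (si->swap_map[offset] & SWAP_HAS_CACHE)) {
			ret = -EEXIST;
			goto out;
		}
		cluster_clear_huge(ci);
	out:
		unlock_cluster(ci);
		return ret;
	}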

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/swap.h |  6 --
 mm/huge_memory.c | 18 ++--
 mm/swapfile.c| 58 +---
 3 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7ee8bfdd0861..1ab197a0e065 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -612,11 +612,13 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #endif /* CONFIG_SWAP */
 
+#define SSC_SPLIT_CACHED   0x1
+
 #ifdef CONFIG_THP_SWAP
-extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster(swp_entry_t entry, unsigned long flags);
 extern int split_swap_cluster_map(swp_entry_t entry);
 #else
-static inline int split_swap_cluster(swp_entry_t entry)
+static inline int split_swap_cluster(swp_entry_t entry, unsigned long flags)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 956c49bfd208..9341c90aa286 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2496,6 +2496,17 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
unfreeze_page(head);
 
+   /*
+* Split swap cluster before unlocking sub-pages.  So all
+* sub-pages will be kept locked from THP has been split to
+* swap cluster is split.
+*/
+   if (PageSwapCache(head)) {
+   swp_entry_t entry = { .val = page_private(head) };
+
+   split_swap_cluster(entry, SSC_SPLIT_CACHED);
+   }
+
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
if (subpage == page)
@@ -2719,12 +2730,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
-   if (PageSwapCache(head)) {
-   swp_entry_t entry = { .val = page_private(head) };
-
-   ret = split_swap_cluster(entry);
-   } else
-   ret = 0;
+   ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 242f70c9e1f2..a16cd903d3ef 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1469,23 +1469,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
unlock_cluster_or_swap_info(si, ci);
 }
 
-#ifdef CONFIG_THP_SWAP
-int split_swap_cluster(swp_entry_t entry)
-{
-   struct swap_info_struct *si;
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
-
-   si = _swap_info_get(entry);
-   if (!si)
-   return -EBUSY;
-   ci = lock_cluster(si, offset);
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   return 0;
-}
-#endif
-
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
const swp_entry_t *e1 = ent1, *e2 = ent2;
@@ -4064,6 +4047,47 @@ int split_swap_cluster_map(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
+
+/*
+ * We will not try to split all PMD swap mappings to the swap cluster,
+ * because we haven't enough information available for that.  Later,
+ * when the PMD swap mapping is duplicated or swapin, etc, the PMD
+ * swap mapping will be split and fallback to the PTE operations.
+ */
+int split_swap_cluster(swp_entry_t entry, unsigned long flags)
+{
+

[PATCH -V5 12/21] swap: Support PMD swap mapping in swapoff

2018-09-03 Thread Huang Ying
During swapoff, for a huge swap cluster, we need to allocate a THP,
read the cluster's contents into the THP, and unuse the PMD and PTE
swap mappings to it.  If we fail to allocate a THP, the huge swap
cluster will be split.

During unuse, if it is found that the swap cluster mapped by a PMD
swap mapping has already been split, we split the PMD swap mapping and
unuse the PTEs.
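
Schematically, and only as a hedged sketch (read_huge_swap_cluster()
and unuse_swap_pmd_range() below are placeholder names, not code from
this patch), the decision flow described above is:

	static int unuse_swap_pmd_range(struct vm_area_struct *vma, pmd_t *pmd,
					unsigned long addr, swp_entry_t entry)
	{
		struct page *page;

		/* Try to bring the whole huge cluster back in as a THP. */
		page = read_huge_swap_cluster(entry);
		if (!page) {
			/* THP allocation failed: split the huge swap cluster,
			 * split the PMD swap mapping, then unuse the PTEs. */
			split_swap_cluster(entry, 0);
			return split_huge_swap_pmd(vma, pmd, addr, *pmd);
		}
		return unuse_pmd(vma, pmd, addr, entry, page);
	}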

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/asm-generic/pgtable.h | 14 +--
 include/linux/huge_mm.h   |  8 
 mm/huge_memory.c  |  4 +-
 mm/swapfile.c | 86 ++-
 4 files changed, 97 insertions(+), 15 deletions(-)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bf207f915967..d18a4415db46 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -931,22 +931,12 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
barrier();
 #endif
/*
-* !pmd_present() checks for pmd migration entries
-*
-* The complete check uses is_pmd_migration_entry() in linux/swapops.h
-* But using that requires moving current function and pmd_trans_unstable()
-* to linux/swapops.h to resovle dependency, which is too much code move.
-*
-* !pmd_present() is equivalent to is_pmd_migration_entry() currently,
-* because !pmd_present() pages can only be under migration not swapped
-* out.
-*
-* pmd_none() is preseved for future condition checks on pmd migration
+* pmd_none() is preseved for future condition checks on pmd swap
 * entries and not confusing with this function name, although it is
 * redundant with !pmd_present().
 */
if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
-   (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval)))
+   (IS_ENABLED(CONFIG_HAVE_PMD_SWAP_ENTRY) && !pmd_present(pmdval)))
return 1;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9dedff974def..25ba9b5f1e60 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -406,6 +406,8 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_THP_SWAP
+extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+  unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 static inline bool transparent_hugepage_swapin_enabled(
@@ -431,6 +433,12 @@ static inline bool transparent_hugepage_swapin_enabled(
return false;
 }
 #else /* CONFIG_THP_SWAP */
+static inline int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address, pmd_t orig_pmd)
+{
+   return 0;
+}
+
 static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7d22e33fdd43..4b7ee510a9fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1666,8 +1666,8 @@ static void __split_huge_swap_pmd(struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_THP_SWAP
-static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-  unsigned long address, pmd_t orig_pmd)
+int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+   unsigned long address, pmd_t orig_pmd)
 {
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d98c1f74f87e..f801d9852b3e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1931,6 +1931,11 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
 }
 
+static inline int pmd_same_as_swp(pmd_t pmd, pmd_t swp_pmd)
+{
+   return pmd_same(pmd_swp_clear_soft_dirty(pmd), swp_pmd);
+}
+
 /*
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
@@ -1992,6 +1997,53 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
return ret;
 }
 
+#ifdef CONFIG_THP_SWAP
+static int unuse_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+unsigned long addr, swp_entry_t entry, struct page *page)
+{
+   struct mem_cgroup *memcg;
+   spinlock_t *ptl;
+   int ret = 1;
+
+   if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &

[PATCH -V5 06/21] swap: Support PMD swap mapping when splitting huge PMD

2018-09-03 Thread Huang Ying
A huge PMD needs to be split when zapping a part of the PMD mapping,
etc.  If the PMD mapping is a swap mapping, we need to split it too.
This patch implements the support for this.  It is similar to
splitting the PMD page mapping, except that we also need to decrease
the PMD swap mapping count of the huge swap cluster.  If the PMD swap
mapping count becomes 0, the huge swap cluster will be split.

Notice: is_huge_zero_pmd() and pmd_page() don't work well with a swap
PMD, so the pmd_present() check is done before calling them.
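
Concretely, the ordering the notice refers to is the pattern used in
the __split_huge_pmd_locked() hunk below:

	/* pmd_present() must come first: for a PMD swap entry, pmd_page()
	 * (and therefore is_huge_zero_pmd()) would inspect a bogus page. */
	if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
		/* huge zero page case */
	} else if (is_swap_pmd(*pmd)) {
		/* PMD swap mapping: convert to PTE swap mappings via
		 * __split_huge_swap_pmd() */
	}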

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  4 
 include/linux/swap.h|  6 ++
 mm/huge_memory.c| 48 +++-
 mm/swapfile.c   | 32 
 4 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 99c19b06d9a4..0f3e1739986f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,10 @@ static inline bool is_huge_zero_page(struct page *page)
return READ_ONCE(huge_zero_page) == page;
 }
 
+/*
+ * is_huge_zero_pmd() must be called after checking pmd_present(),
+ * otherwise, it may report false positive for PMD swap entry.
+ */
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
return is_huge_zero_page(pmd_page(pmd));
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 63235b879a59..7ee8bfdd0861 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -614,11 +614,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f3e8b5cdf7d..956c49bfd208 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1604,6 +1604,40 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
return 0;
 }
 
+/* Convert a PMD swap mapping to a set of PTE swap mappings */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, haddr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2070,7 +2104,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2094,7 +2128,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
@@ -2138,6 +2172,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(old_pmd))
+   

[PATCH -V5 08/21] swap: Support to read a huge swap cluster for swapin a THP

2018-09-03 Thread Huang Ying
To swap in a THP in one piece, we need to read a huge swap cluster
from the swap device.  This patch revises __read_swap_cache_async()
and its callers and callees to support this.  If
__read_swap_cache_async() finds that the swap cluster of the specified
swap entry is huge, it will try to allocate a THP and add it into the
swap cache, so that later the contents of the huge swap cluster can be
read into the THP.
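
The mm/swap_state.c hunk is not shown in this archive.  As a hedged
sketch only (not the literal patch code), the allocation side of
__read_swap_cache_async() for a huge cluster looks roughly like this,
reusing alloc_hugepage_direct_gfpmask(), which the diff below moves
into huge_mm.h:

	/* entry_size is reported by the revised __swp_swapcount(entry,
	 * &entry_size) from the diff below. */
	if (IS_ENABLED(CONFIG_THP_SWAP) && entry_size == HPAGE_PMD_NR) {
		gfp_t gfp = alloc_hugepage_direct_gfpmask(vma);

		new_page = alloc_hugepage_vma(gfp, vma, addr, HPAGE_PMD_ORDER);
		if (new_page)
			prep_transhuge_page(new_page);
		/* On failure, fall back to a normal 4KB page and split
		 * the huge swap cluster. */
	} else {
		new_page = alloc_page_vma(gfp_mask, vma, addr);
	}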

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h | 38 +++
 include/linux/swap.h|  4 ++--
 mm/huge_memory.c| 26 -
 mm/swap_state.c | 60 +++--
 mm/swapfile.c   |  9 +---
 5 files changed, 94 insertions(+), 43 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 0f3e1739986f..3fdb29bc250c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -250,6 +250,39 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
 }
 
+/*
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ *   fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ * available
+ * never: never stall for any thp allocation
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+   bool vma_madvised;
+
+   if (!vma)
+   return GFP_TRANSHUGE_LIGHT;
+   vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE_LIGHT |
+   (vma_madvised ? __GFP_DIRECT_RECLAIM :
+   __GFP_KSWAPD_RECLAIM);
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE_LIGHT |
+   (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+   return GFP_TRANSHUGE_LIGHT;
+}
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -363,6 +396,11 @@ static inline bool thp_migration_supported(void)
 {
return false;
 }
+
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+   return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1ab197a0e065..0f653b9027d7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -457,7 +457,7 @@ extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int __swap_count(swp_entry_t entry);
-extern int __swp_swapcount(swp_entry_t entry);
+extern int __swp_swapcount(swp_entry_t entry, int *entry_size);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
@@ -585,7 +585,7 @@ static inline int __swap_count(swp_entry_t entry)
return 0;
 }
 
-static inline int __swp_swapcount(swp_entry_t entry)
+static inline int __swp_swapcount(swp_entry_t entry, int *entry_size)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9341c90aa286..b14d4bfb06f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -620,32 +620,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 
 }
 
-/*
- * always: directly stall for all thp allocations
- * defer: wake kswapd and fail if not immediately available
- * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
- *   fail if not immediately available
- * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
- * available
- * never: never stall for any thp allocation
- */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
-{
-   const bool vma_madvised = !!(vma->v

[PATCH -V5 00/21] swap: Swapout/swapin THP in one piece

2018-09-03 Thread Huang Ying
Hi, Andrew, could you help me to check whether the overall design is
reasonable?

Hi, Hugh, Shaohua, Minchan and Rik, could you help me to review the
swap part of the patchset?  Especially [02/21], [03/21], [04/21],
[05/21], [06/21], [07/21], [08/21], [09/21], [10/21], [11/21],
[12/21], [20/21], [21/21].

Hi, Andrea and Kirill, could you help me to review the THP part of the
patchset?  Especially [01/21], [07/21], [09/21], [11/21], [13/21],
[15/21], [16/21], [17/21], [18/21], [19/21], [20/21].

Hi, Johannes and Michal, could you help me to review the cgroup part
of the patchset?  Especially [14/21].

And for all, any comment is welcome!

This patchset is based on the 2018-08-23 head of mmotm/master.

This is the final step of the THP (Transparent Huge Page) swap
optimization.  After the first and second steps, the splitting of the
huge page is delayed from almost the first step of swapout until after
swapout has finished.  In this step, we avoid splitting the THP for
swapout and swap the THP out/in in one piece.

We tested the patchset with the vm-scalability benchmark swap-w-seq
test case, with 16 processes.  The test case forks 16 processes.  Each
process allocates a large anonymous memory range and writes it from
beginning to end for 8 rounds.  The first round swaps out, while the
remaining rounds swap in and out.  The test is done on a Xeon E5 v3
system, and the swap device used is a RAM-simulated PMEM (persistent
memory) device.  The test results are as follows:

        base                     optimized
   -----------------        -----------------
      value  %stddev        %change     value  %stddev
    1417897 ±  2%           +992.8%   15494673          vm-scalability.throughput
    1020489 ±  4%          +1091.2%   12156349          vmstat.swap.si
    1255093 ±  3%           +940.3%   13056114          vmstat.swap.so
    1259769 ±  7%          +1818.3%   24166779          meminfo.AnonHugePages
   28021761                  -10.7%   25018848 ±  2%    meminfo.AnonPages
   64080064 ±  4%            -95.6%    2787565 ± 33%    interrupts.CAL:Function_call_interrupts
      13.91 ±  5%            -13.8        0.10 ± 27%    perf-profile.children.cycles-pp.native_queued_spin_lock_slowpath

Here, the benchmark score (bytes written per second) improved by
992.8%.  The swapout/swapin throughput improved by 1008% (from about
2.17GB/s to 24.04GB/s).  The performance difference is huge.  In the
base kernel, the THP is swapped out and split during the first round
of writing, so the remaining rounds use only normal page swapin and
swapout.  In the optimized kernel, the THP is kept after the first
swapout, so THP swapin and swapout are used in the remaining rounds.
This shows the key benefit of swapping the THP out and in as one
piece: the THP is kept instead of being split.  The meminfo data
verifies this: in the base kernel only 4.5% of anonymous pages are in
THPs during the test, while in the optimized kernel it is 96.6%.  The
TLB flush IPIs (represented as interrupts.CAL:Function_call_interrupts)
are reduced by 95.6%, and the cycles spent on spinlocks drop from
13.9% to 0.1%.  These are performance benefits of THP swapout/swapin
too.

Below is the description for all steps of THP swap optimization.

Recently, the performance of storage devices has improved so much that
we cannot saturate the disk bandwidth with a single logical CPU when
doing page swapping, even on a high-end server machine, because the
performance of storage devices has improved faster than that of a
single logical CPU.  And it seems that this trend will not change in
the near future.  On the other hand, THP is becoming more and more
popular because of increased memory sizes.  So it becomes necessary to
optimize THP swap performance.

The advantages of swapping out/in a THP in one piece include:

- Batch various swap operations for the THP.  Many operations need to
  be done once per THP instead of once per normal page, for example,
  allocating/freeing the swap space, writing/reading the swap space,
  flushing the TLB, page faults, etc.  This will improve the
  performance of THP swap greatly.

- THP swap space reads/writes will be large sequential IO (2MB on
  x86_64).  This is particularly helpful for swapin, which is usually
  4KB random IO.  This will improve the performance of THP swap too.

- It will help memory fragmentation, especially when the THP is
  heavily used by applications.  THP-order pages will be freed up
  after THP swapout.

- It will improve THP utilization on systems with swap turned on,
  because the speed at which khugepaged collapses normal pages into
  THPs is quite slow.  After a THP is split during swapout, it takes
  quite a long time for the normal pages to be collapsed back into a
  THP after being swapped in.  High THP utilization also helps the
  efficiency of page-based memory management.

There are some concerns regarding THP swapin, mainly because the
possibly enlarged read/write IO size (for swapout/swapin) may put more
overhead on the storage device.  

[PATCH 2/3] swap: call free_swap_slot() in __swap_entry_free()

2018-08-27 Thread Huang Ying
This is a code cleanup patch without functional change.

Originally, whenever __swap_entry_free() is called and its return
value is 0, free_swap_slot() is then always called to free the swap
entry to the per-CPU pool.  So move the call to free_swap_slot() into
__swap_entry_free() to simplify the code.

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
---
 mm/swapfile.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 409926079607..ef974bbd7715 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1241,6 +1241,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
ci = lock_cluster_or_swap_info(p, offset);
usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+   if (!usage)
+   free_swap_slot(entry);
 
return usage;
 }
@@ -1271,10 +1273,8 @@ void swap_free(swp_entry_t entry)
struct swap_info_struct *p;
 
p = _swap_info_get(entry);
-   if (p) {
-   if (!__swap_entry_free(p, entry, 1))
-   free_swap_slot(entry);
-   }
+   if (p)
+   __swap_entry_free(p, entry, 1);
 }
 
 /*
@@ -1705,8 +1705,6 @@ int free_swap_and_cache(swp_entry_t entry)
!swap_page_trans_huge_swapped(p, entry))
__try_to_reclaim_swap(p, swp_offset(entry),
  TTRS_UNMAPPED | TTRS_FULL);
-   else if (!count)
-   free_swap_slot(entry);
}
return p != NULL;
 }
-- 
2.16.4



[PATCH 3/3] swap: Clear si->swap_map[] in swap_free_cluster()

2018-08-27 Thread Huang Ying
si->swap_map[] of the swap entries in a cluster needs to be cleared
during freeing.  Previously, this was done in the caller of
swap_free_cluster().  This may cause code duplication (one user now,
more users will be added later) and unnecessary cluster lock/unlock.
In this patch, the clearing code is moved into swap_free_cluster() to
avoid these downsides.

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
---
 mm/swapfile.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ef974bbd7715..97a1bd1a7c9a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -933,6 +933,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
struct swap_cluster_info *ci;
 
ci = lock_cluster(si, offset);
+   memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
cluster_set_count_flag(ci, 0, 0);
free_cluster(si, idx);
unlock_cluster(ci);
@@ -1309,9 +1310,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
-   ci = lock_cluster(si, offset);
-   memset(map, 0, SWAPFILE_CLUSTER);
-   unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
swap_free_cluster(si, idx);
spin_unlock(&si->lock);
-- 
2.16.4



[PATCH 0/3] swap: Code refactoring for some swap free related functions

2018-08-27 Thread Huang Ying
To improve code readability, some swap-free-related functions are
refactored.

This patchset is based on 8/23 HEAD of mmotm tree.

Best Regards,
Huang, Ying


[PATCH 1/3] swap: Use __try_to_reclaim_swap() in free_swap_and_cache()

2018-08-27 Thread Huang Ying
The code path to reclaim the swap entry in free_swap_and_cache() is
almost the same as that of __try_to_reclaim_swap().  The largest
difference is just coding style.  So the support for the additional
requirements of free_swap_and_cache() is added into
__try_to_reclaim_swap(), and free_swap_and_cache() is changed to call
__try_to_reclaim_swap() (see the call sketched after the list below),
deleting the duplicated code.  This improves code readability and
reduces the potential for bugs.

There are 2 functional differences between __try_to_reclaim_swap()
and the swap entry reclaim code of free_swap_and_cache():

- free_swap_and_cache() only reclaims the swap entry if the page is
  unmapped or swap is getting full.  This support has been added into
  __try_to_reclaim_swap().

- try_to_free_swap() (called by __try_to_reclaim_swap()) checks
  pm_suspended_storage(), while free_swap_and_cache() does not.  I
  think this is OK, because the page and the swap entry can still be
  reclaimed later eventually.
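
With that, the reclaim call free_swap_and_cache() now makes is simply
(the hunk below is truncated in this archive; the full call is visible
as diff context in the THP swap patches earlier on this page):

	__try_to_reclaim_swap(p, swp_offset(entry),
			      TTRS_UNMAPPED | TTRS_FULL);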

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
---
 mm/swapfile.c | 57 +
 1 file changed, 25 insertions(+), 32 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9211c427a9ad..409926079607 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -104,26 +104,39 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 }
 
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED  0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL  0x4
+
 /* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+unsigned long offset, unsigned long flags)
 {
swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;
 
-   page = find_get_page(swap_address_space(entry), swp_offset(entry));
+   page = find_get_page(swap_address_space(entry), offset);
if (!page)
return 0;
/*
-* This function is called from scan_swap_map() and it's called
-* by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
-* We have to use trylock for avoiding deadlock. This is a special
+* When this function is called from scan_swap_map_slots() and it's
+* called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+* here. We have to use trylock for avoiding deadlock. This is a special
 * case and you should use try_to_free_swap() with explicit lock_page()
 * in usual operations.
 */
if (trylock_page(page)) {
-   ret = try_to_free_swap(page);
+   if ((flags & TTRS_ANYWAY) ||
+   ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+   ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+   ret = try_to_free_swap(page);
unlock_page(page);
}
put_page(page);
@@ -781,7 +794,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
int swap_was_freed;
unlock_cluster(ci);
spin_unlock(&si->lock);
-   swap_was_freed = __try_to_reclaim_swap(si, offset);
+   swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
@@ -1680,7 +1693,6 @@ int try_to_free_swap(struct page *page)
 int free_swap_and_cache(swp_entry_t entry)
 {
struct swap_info_struct *p;
-   struct page *page = NULL;
unsigned char count;
 
if (non_swap_entry(entry))
@@ -1690,31 +1702,12 @@ int free_swap_and_cache(swp_entry_t entry)
if (p) {
count = __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE &&
-   !swap_page_trans_huge_swapped(p, entry)) {
-   page = find_get_page(swap_address_space(entry),
-swp_offset(entry));
-   if (page && !trylock_page(page)) {
-   put_page(page);
-   page = NULL;
-   }
-   } else if (!count)
+   !swap_page_trans_huge_swapped(p, entry))
+   __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS

[PATCH] swap: Use __try_to_reclaim_swap() in free_swap_and_cache()

2018-08-03 Thread Huang Ying
The code path to reclaim the swap entry in free_swap_and_cache() is
almost the same as that of __try_to_reclaim_swap(); the largest
difference is just coding style.  So the support for the additional
requirements of free_swap_and_cache() is added into
__try_to_reclaim_swap(), and __try_to_reclaim_swap() is called in
free_swap_and_cache() to delete the duplicated code.  This improves
code readability and reduces the potential for bugs.

There are 2 functional differences between __try_to_reclaim_swap()
and the swap entry reclaim code of free_swap_and_cache():

- free_swap_and_cache() only reclaims the swap entry if the page is
  unmapped or swap is getting full.  This support has been added into
  __try_to_reclaim_swap().

- try_to_free_swap() (called by __try_to_reclaim_swap()) checks
  pm_suspended_storage(), while free_swap_and_cache() does not.  I
  think this is OK, because the page and the swap entry can still be
  reclaimed later eventually.

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
---
 mm/swapfile.c | 57 +
 1 file changed, 25 insertions(+), 32 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ee4c93e3316..00bb4ed08410 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -104,26 +104,39 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 }
 
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED  0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL  0x4
+
 /* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+unsigned long offset, unsigned long flags)
 {
swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;
 
-   page = find_get_page(swap_address_space(entry), swp_offset(entry));
+   page = find_get_page(swap_address_space(entry), offset);
if (!page)
return 0;
/*
-* This function is called from scan_swap_map() and it's called
-* by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
-* We have to use trylock for avoiding deadlock. This is a special
+* When this function is called from scan_swap_map_slots() and it's
+* called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+* here. We have to use trylock for avoiding deadlock. This is a special
 * case and you should use try_to_free_swap() with explicit lock_page()
 * in usual operations.
 */
if (trylock_page(page)) {
-   ret = try_to_free_swap(page);
+   if ((flags & TTRS_ANYWAY) ||
+   ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+   ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+   ret = try_to_free_swap(page);
unlock_page(page);
}
put_page(page);
@@ -781,7 +794,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
int swap_was_freed;
unlock_cluster(ci);
spin_unlock(&si->lock);
-   swap_was_freed = __try_to_reclaim_swap(si, offset);
+   swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
@@ -1680,7 +1693,6 @@ int try_to_free_swap(struct page *page)
 int free_swap_and_cache(swp_entry_t entry)
 {
struct swap_info_struct *p;
-   struct page *page = NULL;
unsigned char count;
 
if (non_swap_entry(entry))
@@ -1690,31 +1702,12 @@ int free_swap_and_cache(swp_entry_t entry)
if (p) {
count = __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE &&
-   !swap_page_trans_huge_swapped(p, entry)) {
-   page = find_get_page(swap_address_space(entry),
-swp_offset(entry));
-   if (page && !trylock_page(page)) {
-   put_page(page);
-   page = NULL;
-   }
-   } else if (!count)
+   !swap_page_trans_huge_swapped(p, entry))
+   __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS

Re: [QUESTION] llist: Comment releasing 'must delete' restriction before traversing

2018-07-31 Thread Huang, Ying
Byungchul Park  writes:

> I think rcu list also works well. But I decided to use llist because
> llist is simpler and has one less pointer.
>
> Just to be sure, let me explain my use case more:
>
>1. Introduced a global list where single linked list is sufficient.
>2. All operations I need is to add items and traverse the list.
>3. Ensure the operations always happen within irq-disabled section.
>4. I'm considering CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG properly.
>5. The list can be accessed by every CPU concurrently.
>

Can you provide more information about where your use case is?  Is it
a kernel driver?  If so, it is better to submit the patch together
with its user.

And I have a similar concern to Steven Rostedt's.  That is, if you are
the only user forever, it's not necessary to change the common code.

Best Regards,
Huang, Ying


Re: [QUESTION] llist: Comment releasing 'must delete' restriction before traversing

2018-07-30 Thread Huang, Ying
Byungchul Park  writes:

> On Tue, Jul 31, 2018 at 09:37:50AM +0800, Huang, Ying wrote:
>> Byungchul Park  writes:
>> 
>> > Hello folks,
>> >
>> > I'm careful in saying.. and curious about..
>> >
>> > In restrictive cases like only addtions happen but never deletion, can't
>> > we safely traverse a llist? I believe llist can be more useful if we can
>> > release the restriction. Can't we?
>> >
>> > If yes, we may add another function traversing starting from a head. Or
>> > just use existing funtion with head->first.
>> >
>> > Thank a lot for your answers in advance :)
>> 
>> What's the use case?  I don't know how it is useful that items are never
>> deleted from the llist.
>> 
>> Some other locks could be used to provide mutual exclusive between
>> 
>> - llist add, llist traverse
>
> Hello Huang,

Hello Byungchul,

> In my use case, I only do adding and traversing on a llist.

Can you provide more details about your use case?

Best Regards,
Huang, Ying

>> 
>> and
>> 
>> - llist delete
>
> Of course, I will use a lock when deletion is needed.
>
> So.. in the case only adding into and traversing a llist is needed,
> can't we safely traverse a llist in the way I thought? Or am I missing
> something?
>
> Thank you.
>
>> Is this your use case?
>> 
>> Best Regards,
>> Huang, Ying


Re: [QUESTION] llist: Comment releasing 'must delete' restriction before traversing

2018-07-30 Thread Huang, Ying
Byungchul Park  writes:

> Hello folks,
>
> I'm careful in saying.. and curious about..
>
> In restrictive cases like only addtions happen but never deletion, can't
> we safely traverse a llist? I believe llist can be more useful if we can
> release the restriction. Can't we?
>
> If yes, we may add another function traversing starting from a head. Or
> just use existing funtion with head->first.
>
> Thank a lot for your answers in advance :)

What's the use case?  I don't know how it is useful that items are never
deleted from the llist.

Some other locks could be used to provide mutual exclusion between

- llist add, llist traverse

and

- llist delete

Is this your use case?

Best Regards,
Huang, Ying


Re: [PATCH v2] RFC: clear 1G pages with streaming stores on x86

2018-07-26 Thread Huang, Ying
>> > - Is the highmem codepath really necessary? would 1GiB pages really be
>> > of much use on a highmem system? We recently removed some other parts of
>> > the code that support HIGHMEM for gigantic pages (see:
>> > http://lkml.kernel.org/r/20180711195913.1294-1-mike.krav...@oracle.com)
>> > so this seems like a logical continuation.
>> >
>> > - The calls to cond_resched() have been reduced from between every 4k
>> > page to every 64, as between all of the 256K page seemed overly
>> > frequent.  Does this seem like an appropriate frequency? On an idle
>> > system with many spare CPUs it get's rescheduled typically once or twice
>> > out of the 4096 times it calls cond_resched(), which seems like it is
>> > maybe the right amount, but more insight from a scheduling/latency point
>> > of view would be helpful. See the "Tested:" section below for some more 
>> > data.
>> >
>> > - Any other thoughts on the change overall and ways that this could
>> > be made more generally useful, and designed to be easily extensible to
>> > other platforms with non-temporal instructions and 1G pages, or any
>> > additional pitfalls I have not thought to consider.
>> >
>> > Tested:
>> >   Time to `mlock()` a 512GiB region on broadwell CPU
>> >   AVG time (s)% imp.  ms/page
>> >   clear_page_erms 133.584 -   261
>> >   clear_page_nt   34.154  74.43%  67
>> >
>> > For a more in depth look at how the frequency we call cond_resched() 
>> > affects
>> > the time this takes, I tested both on an idle system, and a system running
>> > `stress -c N` program to overcommit CPU to ~115%, and ran 10 replications 
>> > of
>> > the 512GiB mlock test.
>> >
>> > Unfortunately there wasn't as clear of a pattern as I had hoped. On an
>> > otherwise idle system there is no substantive difference different values 
>> > of
>> > PAGES_BETWEEN_RESCHED.
>> >
>> > On a stressed system, there appears to be a pattern, that resembles 
>> > something
>> > of a bell curve: constantly offering to yield, or never yielding until the 
>> > end
>> > produces the fastest results, but yielding infrequently increases latency 
>> > to a
>> > slight degree.
>> >
>> > That being said, it's not clear this is actually a significant difference, 
>> > the
>> > std deviation is occasionally quite high, and perhaps a larger sample set 
>> > would
>> > be more informative. From looking at the log messages indicating the 
>> > number of
>> > times cond_resched() returned 1, there wasn't that much variance, with it
>> > usually being 1 or 2 when idle, and only increasing to ~4-7 when stressed.
>> >
>> >
>> >   PAGES_BETWEEN_RESCHED   state   AVG stddev
>> >   1   4 KiB   idle36.086  1.920
>> >   16  64 KiB  idle34.797  1.702
>> >   32  128 KiB idle35.104  1.752
>> >   64  256 KiB idle34.468  0.661
>> >   512 2048 KiBidle36.427  0.946
>> >   20488192 KiBidle34.988  2.406
>> >   262144  1048576 KiB idle36.792  0.193
>> >   infin   512 GiB idle38.817  0.238  [causes softlockup]
>> >   1   4 KiB   stress  55.562  0.661
>> >   16  64 KiB  stress  57.509  0.248
>> >   32  128 KiB stress  69.265  3.913
>> >   64  256 KiB stress  70.217  4.534
>> >   512 2048 KiBstress  68.474  1.708
>> >   20488192 KiBstress  70.806  1.068
>> >   262144  1048576 KiB stress  55.217  1.184
>> >   infin   512 GiB stress  55.062  0.291  [causes softlockup]

I think it may be good to separate the two optimizations into 2
patches.  This makes it easier to evaluate the benefit of each
optimization.

Best Regards,
Huang, Ying


[PATCH v4 8/8] swap, put_swap_page: Share more between huge/normal code path

2018-07-20 Thread Huang Ying
In this patch, the locking related code is shared between the
huge/normal code paths in put_swap_page() to reduce code duplication.
And the `free_entries == 0` case is merged into the more general
`free_entries != SWAPFILE_CLUSTER` case, because the new locking
method makes that easy.

The number of added lines is the same as the number of removed lines,
but the code size increases when CONFIG_TRANSPARENT_HUGEPAGE=n.

              text    data   bss     dec    hex  filename
base:        24123    2004   340   26467   6763  mm/swapfile.o
unified:     24485    2004   340   26829   68cd  mm/swapfile.o

Digging one step deeper with `size -A mm/swapfile.o` for the base and
unified kernels and comparing the results yields:

  -.text             17723   0
  +.text             17835   0
  -.orc_unwind_ip      1380   0
  +.orc_unwind_ip      1480   0
  -.orc_unwind         2070   0
  +.orc_unwind         2220   0
  -Total              26686
  +Total              27048

The total difference is the same (362 bytes).  The text segment
difference is much smaller: 112 bytes.  More of the difference comes
from the ORC unwinder sections: (1480 + 2220) - (1380 + 2070) = 250.
If the frame pointer unwinder is used, this costs nothing.

Signed-off-by: "Huang, Ying" 
Reviewed-by: Daniel Jordan 
Acked-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 402d52ff3e4a..f792fa902249 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1282,8 +1282,8 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (!si)
return;
 
+   ci = lock_cluster_or_swap_info(si, offset);
if (size == SWAPFILE_CLUSTER) {
-   ci = lock_cluster(si, offset);
VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
@@ -1292,13 +1292,9 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (val == SWAP_HAS_CACHE)
free_entries++;
}
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
cluster_clear_huge(ci);
-   unlock_cluster(ci);
if (free_entries == SWAPFILE_CLUSTER) {
+   unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
ci = lock_cluster(si, offset);
memset(map, 0, SWAPFILE_CLUSTER);
@@ -1309,12 +1305,16 @@ void put_swap_page(struct page *page, swp_entry_t entry)
return;
}
}
-   if (size == 1 || free_entries) {
-   for (i = 0; i < size; i++, entry.val++) {
-   if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
+   for (i = 0; i < size; i++, entry.val++) {
+   if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+   unlock_cluster_or_swap_info(si, ci);
+   free_swap_slot(entry);
+   if (i == size - 1)
+   return;
+   lock_cluster_or_swap_info(si, offset);
}
}
+   unlock_cluster_or_swap_info(si, ci);
 }
 
 #ifdef CONFIG_THP_SWAP
-- 
2.16.4



[PATCH v4 5/8] swap: Unify normal/huge code path in put_swap_page()

2018-07-20 Thread Huang Ying
In this patch, the normal/huge code paths in put_swap_page() and
several helper functions are unified to avoid duplicated code, bugs,
etc., and to make the code easier to review.

More lines are removed than added, and the binary size is kept exactly
the same when CONFIG_TRANSPARENT_HUGEPAGE=n.

Signed-off-by: "Huang, Ying" 
Suggested-and-acked-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 83 ++-
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 97814a01170d..ad247c3da494 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -205,8 +205,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER   HPAGE_PMD_NR
+
+#define swap_entry_size(size)  (size)
 #else
 #define SWAPFILE_CLUSTER   256
+
+/*
+ * Define swap_entry_size() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size)  1
 #endif
 #define LATENCY_LIMIT  256
 
@@ -1251,18 +1259,7 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-static void swapcache_free(swp_entry_t entry)
-{
-   struct swap_info_struct *p;
-
-   p = _swap_info_get(entry);
-   if (p) {
-   if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
-   }
-}
-
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1271,39 +1268,41 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
-
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return;
+   int size = swap_entry_size(hpage_nr_pages(page));
 
si = _swap_info_get(entry);
if (!si)
return;
 
-   ci = lock_cluster(si, offset);
-   VM_BUG_ON(!cluster_is_huge(ci));
-   map = si->swap_map + offset;
-   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   val = map[i];
-   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-   if (val == SWAP_HAS_CACHE)
-   free_entries++;
-   }
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   if (free_entries == SWAPFILE_CLUSTER) {
-   spin_lock(&si->lock);
+   if (size == SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
-   memset(map, 0, SWAPFILE_CLUSTER);
+   VM_BUG_ON(!cluster_is_huge(ci));
+   map = si->swap_map + offset;
+   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+   val = map[i];
+   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+   if (val == SWAP_HAS_CACHE)
+   free_entries++;
+   }
+   if (!free_entries) {
+   for (i = 0; i < SWAPFILE_CLUSTER; i++)
+   map[i] &= ~SWAP_HAS_CACHE;
+   }
+   cluster_clear_huge(ci);
unlock_cluster(ci);
-   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-   swap_free_cluster(si, idx);
-   spin_unlock(&si->lock);
-   } else if (free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+   if (free_entries == SWAPFILE_CLUSTER) {
+   spin_lock(&si->lock);
+   ci = lock_cluster(si, offset);
+   memset(map, 0, SWAPFILE_CLUSTER);
+   unlock_cluster(ci);
+   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+   swap_free_cluster(si, idx);
+   spin_unlock(&si->lock);
+   return;
+   }
+   }
+   if (size == 1 || free_entries) {
+   for (i = 0; i < size; i++, entry.val++) {
if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
free_swap_slot(entry);
}
@@ -1327,14 +1326,6 @@ int split_swap_cluster(swp_entry_t entry)
 }
 #endif
 
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
-   if (!PageTransHuge(page))
-   swapcache_free(entry);
-   else
-   swapcache_free_cluster(entry);
-}
-
 static int swp_entry_cmp(co

[PATCH v4 7/8] swap: Add __swap_entry_free_locked()

2018-07-20 Thread Huang Ying
The part of __swap_entry_free() that runs with the lock held is
separated into a new function, __swap_entry_free_locked(), because we
want to reuse that piece of code in some other places.

This is just mechanical code refactoring; there is no functional
change in this function.

Signed-off-by: "Huang, Ying" 
Reviewed-by: Daniel Jordan 
Acked-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d80567dd60db..402d52ff3e4a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1182,16 +1182,13 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
return NULL;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-  swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
 {
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
 
-   ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
 
has_cache = count & SWAP_HAS_CACHE;
@@ -1219,6 +1216,17 @@ static unsigned char __swap_entry_free(struct 
swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
 
+   return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+  swp_entry_t entry, unsigned char usage)
+{
+   struct swap_cluster_info *ci;
+   unsigned long offset = swp_offset(entry);
+
+   ci = lock_cluster_or_swap_info(p, offset);
+   usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
 
return usage;
-- 
2.16.4



[PATCH v4 6/8] swap, get_swap_pages: Use entry_size instead of cluster in parameter

2018-07-20 Thread Huang Ying
As suggested by Matthew Wilcox, it is better to use "int entry_size"
instead of "bool cluster" as the parameter that specifies whether to
operate on huge or normal swap entries, because this improves the
flexibility to support other swap entry sizes.  And Dave Hansen thinks
that this improves code readability too.

So in this patch, the "bool cluster" parameter of get_swap_pages() is
replaced by "int entry_size".

And the swap_entry_size() trick is used to reduce the binary size when
!CONFIG_TRANSPARENT_HUGEPAGE.

               text    data    bss      dec     hex filename
base          24215    2028    340    26583    67d7 mm/swapfile.o
head          24123    2004    340    26467    6763 mm/swapfile.o
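
As a side note, a small userspace sketch (all names and numbers below
are stand-ins, not kernel code) of why defining swap_entry_size() as
the constant 1 lets the compiler drop the huge-entry path entirely:

	#include <stdio.h>

	#if defined(DEMO_THP_SWAP)
	#define swap_entry_size(size)	(size)	/* huge entries possible */
	#else
	#define swap_entry_size(size)	1	/* compile-time constant */
	#endif

	#define DEMO_CLUSTER_SIZE	512	/* stand-in for HPAGE_PMD_NR */

	static void alloc_demo(int entry_size)
	{
		int size = swap_entry_size(entry_size);

		/*
		 * With size known to be 1 at compile time, this branch is
		 * provably dead and the cluster path is not emitted at all.
		 */
		if (size == DEMO_CLUSTER_SIZE)
			printf("allocate a whole cluster\n");
		else
			printf("allocate %d normal slot(s)\n", size);
	}

	int main(void)
	{
		alloc_demo(1);
		return 0;
	}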

Signed-off-by: "Huang, Ying" 
Acked-by: Dave Hansen 
Cc: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 include/linux/swap.h |  2 +-
 mm/swap_slots.c  |  8 
 mm/swapfile.c| 16 
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index e20c240d6c65..34de0d8bf4fa 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -443,7 +443,7 @@ extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(struct page *page);
 extern void put_swap_page(struct page *page, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 008ccb22fee6..63a7b4563a57 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache 
*cache)
 
cache->cur = 0;
if (swap_slot_cache_active)
-   cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
-  cache->slots);
+   cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+  cache->slots, 1);
 
return cache->nr;
 }
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)
 
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
-   get_swap_pages(1, true, &entry);
+   get_swap_pages(1, &entry, HPAGE_PMD_NR);
goto out;
}
 
@@ -350,7 +350,7 @@ swp_entry_t get_swap_page(struct page *page)
goto out;
}
 
-   get_swap_pages(1, false, &entry);
+   get_swap_pages(1, &entry, 1);
 out:
if (mem_cgroup_try_charge_swap(page, entry)) {
put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ad247c3da494..d80567dd60db 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -941,18 +941,18 @@ static unsigned long scan_swap_map(struct 
swap_info_struct *si,
 
 }
 
-int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 {
-   unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
+   unsigned long size = swap_entry_size(entry_size);
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
int node;
 
/* Only single cluster request supported */
-   WARN_ON_ONCE(n_goal > 1 && cluster);
+   WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 
-   avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
+   avail_pgs = atomic_long_read(&nr_swap_pages) / size;
if (avail_pgs <= 0)
goto noswap;
 
@@ -962,7 +962,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t 
swp_entries[])
if (n_goal > avail_pgs)
n_goal = avail_pgs;
 
-   atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
+   atomic_long_sub(n_goal * size, &nr_swap_pages);
 
spin_lock(&swap_avail_lock);
 
@@ -989,14 +989,14 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t 
swp_entries[])
spin_unlock(&si->lock);
goto nextsi;
}
-   if (cluster) {
+   if (size == SWAPFILE_CLUSTER) {
if (!(si->flags & SWP_FILE))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
-   if (n_ret || cluster)
+

[PATCH v4 2/8] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-20 Thread Huang Ying
In mm/swapfile.c, THP (Transparent Huge Page) swap specific code is
enclosed by #ifdef CONFIG_THP_SWAP/#endif to avoid code bloat when
THP isn't enabled.  But #ifdef/#endif in a .c file hurts code
readability, so Dave suggested using IS_ENABLED(CONFIG_THP_SWAP)
instead and letting the compiler do the dirty job for us.  This also
has the potential to remove some duplicated code.  From the output of
`size`,

               text    data    bss      dec     hex filename
THP=y:        26269    2076    340    28685    700d mm/swapfile.o
ifdef/endif:  24115    2028    340    26483    6773 mm/swapfile.o
IS_ENABLED:   24179    2028    340    26547    67b3 mm/swapfile.o

The IS_ENABLED() based solution works quite well, almost as well as
the #ifdef/#endif one.  And from the diffstat, more lines are removed
than added.

One #ifdef for split_swap_cluster() is kept, because it is a public
function with a stub implementation for CONFIG_THP_SWAP=n in swap.h.
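
To illustrate the idea outside the kernel (the DEMO_ names below are
stand-ins for IS_ENABLED(CONFIG_THP_SWAP), which expands to 0 or 1 at
compile time): the disabled branch is still parsed and type-checked,
but the optimizer removes it as dead code, unlike #ifdef where the
branch never gets compiled at all:

	#include <stdio.h>

	#define DEMO_CONFIG_THP_SWAP	0	/* pretend THP swap is disabled */
	#define DEMO_IS_ENABLED(opt)	(opt)

	static int swap_alloc_cluster_demo(void)
	{
		if (!DEMO_IS_ENABLED(DEMO_CONFIG_THP_SWAP)) {
			fprintf(stderr, "huge cluster allocation disabled\n");
			return 0;
		}
		/* Huge-cluster path: compiled and checked, then optimized away. */
		return 1;
	}

	int main(void)
	{
		printf("allocated: %d\n", swap_alloc_cluster_demo());
		return 0;
	}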

Signed-off-by: "Huang, Ying" 
Suggested-and-acked-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 60 ---
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d101e044efbf..7283104bfafa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -869,7 +869,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
return n_ret;
 }
 
-#ifdef CONFIG_THP_SWAP
 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
unsigned long idx;
@@ -877,6 +876,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, 
swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
 
+   /*
+* Should not even be attempting cluster allocations when huge
+* page swap is disabled.  Warn and fail the allocation.
+*/
+   if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+   VM_WARN_ON_ONCE(1);
+   return 0;
+   }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
 
@@ -907,13 +915,6 @@ static void swap_free_cluster(struct swap_info_struct *si, 
unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-   VM_WARN_ON_ONCE(1);
-   return 0;
-}
-#endif /* CONFIG_THP_SWAP */
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
   unsigned char usage)
@@ -1259,7 +1260,6 @@ static void swapcache_free(swp_entry_t entry)
}
 }
 
-#ifdef CONFIG_THP_SWAP
 static void swapcache_free_cluster(swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
@@ -1270,6 +1270,9 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned int i, free_entries = 0;
unsigned char val;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return;
+
si = _swap_info_get(entry);
if (!si)
return;
@@ -1305,6 +1308,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
}
 }
 
+#ifdef CONFIG_THP_SWAP
 int split_swap_cluster(swp_entry_t entry)
 {
struct swap_info_struct *si;
@@ -1319,11 +1323,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
+#endif
 
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1482,7 +1482,6 @@ int swp_swapcount(swp_entry_t entry)
return count;
 }
 
-#ifdef CONFIG_THP_SWAP
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 swp_entry_t entry)
 {
@@ -1493,6 +1492,9 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return swap_swapcount(si, entry) != 0;
+
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (map[roffset] != SWAP_HAS_CACHE)
@@ -1515,7 +1517,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
 
-   if (likely(!PageTransCompound(page)))
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
 
page = compound_head(page);
@@ -1539,10 +1541,8 @@ static int page_trans_huge_map_swapcount(struct page 
*page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
 
-   if (likely(!PageTransCompound(page))) {
-   mapcount = atomic_read(&page->_mapcount) + 1;

[PATCH v4 3/8] swap: Use swap_count() in swap_page_trans_huge_swapped()

2018-07-20 Thread Huang Ying
In swap_page_trans_huge_swapped(), "si->swap_map[i] != SWAP_HAS_CACHE"
is used to identify whether there's any page table mapping for a 4k
sized swap entry.  This works correctly now, because all users of the
function only call it after checking SWAP_HAS_CACHE.  But as pointed
out by Daniel, it is better to use "swap_count(map[i])" here, because
it works for the "map[i] == 0" case too.

And this makes the implementation more consistent between normal and
huge swap entries.
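
A tiny userspace sketch of the semantics this relies on (SWAP_HAS_CACHE
is the kernel's swap-cache flag bit; the helper below only models the
masking, it is not the kernel implementation): swap_count() masks off
the cache flag, so both a free slot (0) and a cache-only slot report a
zero count.

	#include <stdio.h>

	#define SWAP_HAS_CACHE	0x40	/* swap cache reference flag */

	/* Page table reference count of a swap map entry, cache flag masked off. */
	static unsigned char swap_count(unsigned char ent)
	{
		return ent & ~SWAP_HAS_CACHE;
	}

	int main(void)
	{
		printf("%d\n", swap_count(0));			/* 0: free slot */
		printf("%d\n", swap_count(SWAP_HAS_CACHE));	/* 0: cache only */
		printf("%d\n", swap_count(SWAP_HAS_CACHE | 1));	/* 1: one mapping */
		return 0;
	}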

Signed-off-by: "Huang, Ying" 
Suggested-and-reviewed-by: Daniel Jordan 
Acked-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7283104bfafa..833613e59ef7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1497,12 +1497,12 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
 
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
-   if (map[roffset] != SWAP_HAS_CACHE)
+   if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   if (map[offset + i] != SWAP_HAS_CACHE) {
+   if (swap_count(map[offset + i])) {
ret = true;
break;
}
-- 
2.16.4



[PATCH v4 1/8] swap: Add comments to lock_cluster_or_swap_info()

2018-07-20 Thread Huang Ying
To improve the code readability.

Signed-off-by: "Huang, Ying" 
Suggested-and-acked-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d8fddfb000ec..d101e044efbf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -297,13 +297,18 @@ static inline void unlock_cluster(struct 
swap_cluster_info *ci)
spin_unlock(&ci->lock);
 }
 
+/*
+ * Determine the locking method in use for this device.  Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
-   struct swap_info_struct *si,
-   unsigned long offset)
+   struct swap_info_struct *si, unsigned long offset)
 {
struct swap_cluster_info *ci;
 
+   /* Try to use fine-grained SSD-style locking if available: */
ci = lock_cluster(si, offset);
+   /* Otherwise, fall back to traditional, coarse locking: */
if (!ci)
spin_lock(&si->lock);
 
-- 
2.16.4



[PATCH v4 4/8] swap: Unify normal/huge code path in swap_page_trans_huge_swapped()

2018-07-20 Thread Huang Ying
As suggested by Dave, we should unify the code paths for normal and
huge swap support where possible, to avoid duplicated code, bugs, etc.
and make the code easier to review.

In this patch, the normal/huge code path in swap_page_trans_huge_swapped()
is unified; the number of added and removed lines is the same.  And the
binary size is kept almost the same when CONFIG_TRANSPARENT_HUGEPAGE=n.
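
With CONFIG_THP_SWAP=n, cluster_is_huge() becomes a constant false, so
the unified function collapses back to the normal-only behavior.
Roughly (a sketch of the resulting shape, not the literal code):

	/* !CONFIG_THP_SWAP: the huge-cluster loop below this is dead code */
	ci = lock_cluster_or_swap_info(si, offset);
	if (!ci || !cluster_is_huge(ci)) {	/* always taken */
		if (swap_count(map[roffset]))
			ret = true;
		goto unlock_out;
	}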

            text    data    bss      dec     hex filename
base:      24179    2028    340    26547    67b3 mm/swapfile.o
unified:   24215    2028    340    26583    67d7 mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Suggested-and-acked-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 833613e59ef7..97814a01170d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -270,7 +270,9 @@ static inline void cluster_set_null(struct 
swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-   return info->flags & CLUSTER_FLAG_HUGE;
+   if (IS_ENABLED(CONFIG_THP_SWAP))
+   return info->flags & CLUSTER_FLAG_HUGE;
+   return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -1492,9 +1494,6 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return swap_swapcount(si, entry) != 0;
-
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (swap_count(map[roffset]))
-- 
2.16.4



[PATCH v4 0/8] swap: THP optimizing refactoring

2018-07-20 Thread Huang Ying
This patchset is based on 2018-07-13 head of mmotm tree.

Now the THP (Transparent Huge Page) swap optimization is implemented
in the way shown below,

#ifdef CONFIG_THP_SWAP
huge_function(...)
{
}
#else
normal_function(...)
{
}
#endif

general_function(...)
{
if (huge)
return huge_function(...);
else
return normal_function(...);
}

As pointed out by Dave Hansen, this:

1. Creates a new, wholly untested code path for huge pages
2. Creates two places to patch bugs
3. Does not reuse code when possible

This patchset addresses these problems by merging the huge/normal
code paths/functions where possible.
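
After the refactoring, the shape becomes, roughly (same pseudo-code
style as above; IS_ENABLED() is the kernel's compile-time config test):

general_function(...)
{
	if (IS_ENABLED(CONFIG_THP_SWAP) && huge) {
		/* huge path, sharing locking/cleanup with the normal path */
	}
	/* normal path */
}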

One concern is that this may increase the code size when
!CONFIG_TRANSPARENT_HUGEPAGE.  The data shows that most of the
refactoring causes only a slight code size increase.

Best Regards,
Huang, Ying


Re: [PATCH v3 1/8] swap: Add comments to lock_cluster_or_swap_info()

2018-07-19 Thread Huang, Ying
Christoph Hellwig  writes:

> On Thu, Jul 19, 2018 at 04:48:35PM +0800, Huang Ying wrote:
>> +/*
>> + * Determine the locking method in use for this device.  Return
>> + * swap_cluster_info if SSD-style cluster-based locking is in place.
>> + */
>>  static inline struct swap_cluster_info *lock_cluster_or_swap_info(
>>  struct swap_info_struct *si,
>>  unsigned long offset)
>>  {
>>  struct swap_cluster_info *ci;
>>  
>> +/* Try to use fine-grained SSD-style locking if available: */
>
> Once you touch this are can you also please use standard two-tab
> alignment for the spill-over function arguments:
>
> static inline struct swap_cluster_info *lock_cluster_or_swap_info(
>   struct swap_info_struct *si, unsigned long offset)

Sure.  Will change this in next version.

Best Regards,
Huang, Ying


Re: [PATCH v3 4/8] swap: Unify normal/huge code path in swap_page_trans_huge_swapped()

2018-07-19 Thread Huang, Ying
Christoph Hellwig  writes:

>>  static inline bool cluster_is_huge(struct swap_cluster_info *info)
>>  {
>> -return info->flags & CLUSTER_FLAG_HUGE;
>> +if (IS_ENABLED(CONFIG_THP_SWAP))
>> +return info->flags & CLUSTER_FLAG_HUGE;
>> +else
>> +return false;
>
> Nitpick: no need for an else after a return:
>
>   if (IS_ENABLED(CONFIG_THP_SWAP))
>   return info->flags & CLUSTER_FLAG_HUGE;
>       return false;

Sure.  Will change this in next version.

Best Regards,
Huang, Ying


[PATCH v3 8/8] swap, put_swap_page: Share more between huge/normal code path

2018-07-19 Thread Huang Ying
In this patch, locking related code is shared between the huge/normal
code paths in put_swap_page() to reduce code duplication.  And the
`free_entries == 0` case is merged into the more general `free_entries
!= SWAPFILE_CLUSTER` case, because the new locking method makes that
easy.

The number of added lines is the same as the number of removed lines.
But the code size is increased when CONFIG_TRANSPARENT_HUGEPAGE=n.

            text    data    bss      dec     hex filename
base:      24123    2004    340    26467    6763 mm/swapfile.o
unified:   24485    2004    340    26829    68cd mm/swapfile.o

Digging one step deeper with `size -A mm/swapfile.o` for the base and
unified kernels and comparing the results yields:

  -.text                17723   0
  +.text                17835   0
  -.orc_unwind_ip        1380   0
  +.orc_unwind_ip        1480   0
  -.orc_unwind           2070   0
  +.orc_unwind           2220   0
  -Total                26686
  +Total                27048

The total difference is the same (362).  The text segment difference is
much smaller: 112.  Most of the difference comes from the ORC unwinder
segments: (1480 + 2220) - (1380 + 2070) = 250.  If the frame pointer
unwinder is used, this costs nothing.

Signed-off-by: "Huang, Ying" 
Reviewed-by: Daniel Jordan 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d313f7512d26..2fe2e93cee0e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1284,8 +1284,8 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (!si)
return;
 
+   ci = lock_cluster_or_swap_info(si, offset);
if (size == SWAPFILE_CLUSTER) {
-   ci = lock_cluster(si, offset);
VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
@@ -1294,13 +1294,9 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (val == SWAP_HAS_CACHE)
free_entries++;
}
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
cluster_clear_huge(ci);
-   unlock_cluster(ci);
if (free_entries == SWAPFILE_CLUSTER) {
+   unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
ci = lock_cluster(si, offset);
memset(map, 0, SWAPFILE_CLUSTER);
@@ -1311,12 +1307,16 @@ void put_swap_page(struct page *page, swp_entry_t entry)
return;
}
}
-   if (size == 1 || free_entries) {
-   for (i = 0; i < size; i++, entry.val++) {
-   if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
+   for (i = 0; i < size; i++, entry.val++) {
+   if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+   unlock_cluster_or_swap_info(si, ci);
+   free_swap_slot(entry);
+   if (i == size - 1)
+   return;
+   lock_cluster_or_swap_info(si, offset);
}
}
+   unlock_cluster_or_swap_info(si, ci);
 }
 
 #ifdef CONFIG_THP_SWAP
-- 
2.16.4



[PATCH v3 0/8] swap: THP optimizing refactoring

2018-07-19 Thread Huang Ying
This patchset is based on 2018-07-13 head of mmotm tree.

Now the THP (Transparent Huge Page) swap optimization is implemented
in the way shown below,

#ifdef CONFIG_THP_SWAP
huge_function(...)
{
}
#else
normal_function(...)
{
}
#endif

general_function(...)
{
if (huge)
return huge_function(...);
else
return normal_function(...);
}

As pointed out by Dave Hansen, this:

1. Creates a new, wholly untested code path for huge pages
2. Creates two places to patch bugs
3. Does not reuse code when possible

This patchset addresses these problems by merging the huge/normal
code paths/functions where possible.

One concern is that this may increase the code size when
!CONFIG_TRANSPARENT_HUGEPAGE.  The data shows that most of the
refactoring causes only a slight code size increase.

Best Regards,
Huang, Ying


[PATCH v3 3/8] swap: Use swap_count() in swap_page_trans_huge_swapped()

2018-07-19 Thread Huang Ying
In swap_page_trans_huge_swapped(), "si->swap_map[i] != SWAP_HAS_CACHE"
is used to identify whether there's any page table mapping for a 4k
sized swap entry.  This works correctly now, because all users of the
function only call it after checking SWAP_HAS_CACHE.  But as pointed
out by Daniel, it is better to use "swap_count(map[i])" here, because
it works for the "map[i] == 0" case too.

And this makes the implementation more consistent between normal and
huge swap entries.

Signed-off-by: "Huang, Ying" 
Suggested-and-reviewed-by: Daniel Jordan 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 87c4c3446aed..cb0bc54e99c0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1498,12 +1498,12 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
 
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
-   if (map[roffset] != SWAP_HAS_CACHE)
+   if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   if (map[offset + i] != SWAP_HAS_CACHE) {
+   if (swap_count(map[offset + i])) {
ret = true;
break;
}
-- 
2.16.4



[PATCH v3 1/8] swap: Add comments to lock_cluster_or_swap_info()

2018-07-19 Thread Huang Ying
To improve the code readability.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d8fddfb000ec..29da44411734 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -297,13 +297,19 @@ static inline void unlock_cluster(struct 
swap_cluster_info *ci)
spin_unlock(&ci->lock);
 }
 
+/*
+ * Determine the locking method in use for this device.  Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
struct swap_info_struct *si,
unsigned long offset)
 {
struct swap_cluster_info *ci;
 
+   /* Try to use fine-grained SSD-style locking if available: */
ci = lock_cluster(si, offset);
+   /* Otherwise, fall back to traditional, coarse locking: */
if (!ci)
spin_lock(&si->lock);
 
-- 
2.16.4



[PATCH v3 5/8] swap: Unify normal/huge code path in put_swap_page()

2018-07-19 Thread Huang Ying
In this patch, the normal/huge code paths in put_swap_page() and
several helper functions are unified to avoid duplicated code, bugs,
etc. and make the code easier to review.

More lines are removed than added.  And the binary size is kept exactly
the same when CONFIG_TRANSPARENT_HUGEPAGE=n.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 83 ++-
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 96018207b582..407f97287ab3 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -205,8 +205,16 @@ static void discard_swap_cluster(struct swap_info_struct 
*si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER   HPAGE_PMD_NR
+
+#define swap_entry_size(size)  (size)
 #else
 #define SWAPFILE_CLUSTER   256
+
+/*
+ * Define swap_entry_size() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size)  1
 #endif
 #define LATENCY_LIMIT  256
 
@@ -1253,18 +1261,7 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-static void swapcache_free(swp_entry_t entry)
-{
-   struct swap_info_struct *p;
-
-   p = _swap_info_get(entry);
-   if (p) {
-   if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
-   }
-}
-
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1273,39 +1270,41 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
-
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return;
+   int size = swap_entry_size(hpage_nr_pages(page));
 
si = _swap_info_get(entry);
if (!si)
return;
 
-   ci = lock_cluster(si, offset);
-   VM_BUG_ON(!cluster_is_huge(ci));
-   map = si->swap_map + offset;
-   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   val = map[i];
-   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-   if (val == SWAP_HAS_CACHE)
-   free_entries++;
-   }
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   if (free_entries == SWAPFILE_CLUSTER) {
-   spin_lock(&si->lock);
+   if (size == SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
-   memset(map, 0, SWAPFILE_CLUSTER);
+   VM_BUG_ON(!cluster_is_huge(ci));
+   map = si->swap_map + offset;
+   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+   val = map[i];
+   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+   if (val == SWAP_HAS_CACHE)
+   free_entries++;
+   }
+   if (!free_entries) {
+   for (i = 0; i < SWAPFILE_CLUSTER; i++)
+   map[i] &= ~SWAP_HAS_CACHE;
+   }
+   cluster_clear_huge(ci);
unlock_cluster(ci);
-   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-   swap_free_cluster(si, idx);
-   spin_unlock(&si->lock);
-   } else if (free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+   if (free_entries == SWAPFILE_CLUSTER) {
+   spin_lock(&si->lock);
+   ci = lock_cluster(si, offset);
+   memset(map, 0, SWAPFILE_CLUSTER);
+   unlock_cluster(ci);
+   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+   swap_free_cluster(si, idx);
+   spin_unlock(&si->lock);
+   return;
+   }
+   }
+   if (size == 1 || free_entries) {
+   for (i = 0; i < size; i++, entry.val++) {
if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
free_swap_slot(entry);
}
@@ -1329,14 +1328,6 @@ int split_swap_cluster(swp_entry_t entry)
 }
 #endif
 
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
-   if (!PageTransHuge(page))
-   swapcache_free(entry);
-   else
-   swapcache_free_cluster(entry);
-}
-
 static int swp_entry_cmp(const voi

[PATCH v3 4/8] swap: Unify normal/huge code path in swap_page_trans_huge_swapped()

2018-07-19 Thread Huang Ying
As suggested by Dave, we should unify the code paths for normal and
huge swap support where possible, to avoid duplicated code, bugs, etc.
and make the code easier to review.

In this patch, the normal/huge code path in swap_page_trans_huge_swapped()
is unified; the number of added and removed lines is the same.  And the
binary size is kept almost the same when CONFIG_TRANSPARENT_HUGEPAGE=n.

            text    data    bss      dec     hex filename
base:      24179    2028    340    26547    67b3 mm/swapfile.o
unified:   24215    2028    340    26583    67d7 mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index cb0bc54e99c0..96018207b582 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -270,7 +270,10 @@ static inline void cluster_set_null(struct 
swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-   return info->flags & CLUSTER_FLAG_HUGE;
+   if (IS_ENABLED(CONFIG_THP_SWAP))
+   return info->flags & CLUSTER_FLAG_HUGE;
+   else
+   return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -1493,9 +1496,6 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return swap_swapcount(si, entry) != 0;
-
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (swap_count(map[roffset]))
-- 
2.16.4



[PATCH v3 7/8] swap: Add __swap_entry_free_locked()

2018-07-19 Thread Huang Ying
The part of __swap_entry_free() that runs with the lock held is
separated into a new function, __swap_entry_free_locked(), because we
want to reuse that piece of code in some other places.

This is just mechanical code refactoring; there is no functional change
in this function.

Signed-off-by: "Huang, Ying" 
Reviewed-by: Daniel Jordan 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 318ceb527c78..d313f7512d26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1184,16 +1184,13 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
return NULL;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-  swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
 {
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
 
-   ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
 
has_cache = count & SWAP_HAS_CACHE;
@@ -1221,6 +1218,17 @@ static unsigned char __swap_entry_free(struct 
swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
 
+   return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+  swp_entry_t entry, unsigned char usage)
+{
+   struct swap_cluster_info *ci;
+   unsigned long offset = swp_offset(entry);
+
+   ci = lock_cluster_or_swap_info(p, offset);
+   usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
 
return usage;
-- 
2.16.4



[PATCH v3 2/8] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-19 Thread Huang Ying
In mm/swapfile.c, THP (Transparent Huge Page) swap specific code is
enclosed by #ifdef CONFIG_THP_SWAP/#endif to avoid code bloat when
THP isn't enabled.  But #ifdef/#endif in a .c file hurts code
readability, so Dave suggested using IS_ENABLED(CONFIG_THP_SWAP)
instead and letting the compiler do the dirty job for us.  This also
has the potential to remove some duplicated code.  From the output of
`size`,

               text    data    bss      dec     hex filename
THP=y:        26269    2076    340    28685    700d mm/swapfile.o
ifdef/endif:  24115    2028    340    26483    6773 mm/swapfile.o
IS_ENABLED:   24179    2028    340    26547    67b3 mm/swapfile.o

The IS_ENABLED() based solution works quite well, almost as well as
the #ifdef/#endif one.  And from the diffstat, more lines are removed
than added.

One #ifdef for split_swap_cluster() is kept, because it is a public
function with a stub implementation for CONFIG_THP_SWAP=n in swap.h.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Reviewed-by: Daniel Jordan 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 60 ---
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 29da44411734..87c4c3446aed 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -870,7 +870,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
return n_ret;
 }
 
-#ifdef CONFIG_THP_SWAP
 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
unsigned long idx;
@@ -878,6 +877,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, 
swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
 
+   /*
+* Should not even be attempting cluster allocations when huge
+* page swap is disabled.  Warn and fail the allocation.
+*/
+   if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+   VM_WARN_ON_ONCE(1);
+   return 0;
+   }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
 
@@ -908,13 +916,6 @@ static void swap_free_cluster(struct swap_info_struct *si, 
unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-   VM_WARN_ON_ONCE(1);
-   return 0;
-}
-#endif /* CONFIG_THP_SWAP */
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
   unsigned char usage)
@@ -1260,7 +1261,6 @@ static void swapcache_free(swp_entry_t entry)
}
 }
 
-#ifdef CONFIG_THP_SWAP
 static void swapcache_free_cluster(swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
@@ -1271,6 +1271,9 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned int i, free_entries = 0;
unsigned char val;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return;
+
si = _swap_info_get(entry);
if (!si)
return;
@@ -1306,6 +1309,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
}
 }
 
+#ifdef CONFIG_THP_SWAP
 int split_swap_cluster(swp_entry_t entry)
 {
struct swap_info_struct *si;
@@ -1320,11 +1324,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
+#endif
 
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1483,7 +1483,6 @@ int swp_swapcount(swp_entry_t entry)
return count;
 }
 
-#ifdef CONFIG_THP_SWAP
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 swp_entry_t entry)
 {
@@ -1494,6 +1493,9 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return swap_swapcount(si, entry) != 0;
+
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (map[roffset] != SWAP_HAS_CACHE)
@@ -1516,7 +1518,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
 
-   if (likely(!PageTransCompound(page)))
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
 
page = compound_head(page);
@@ -1540,10 +1542,8 @@ static int page_trans_huge_map_swapcount(struct page 
*page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
 
-   if (likely(!PageTransCompound(page))) {
-   mapcount = atomic_read(&page->_mapcount) + 1;

[PATCH v3 6/8] swap, get_swap_pages: Use entry_size instead of cluster in parameter

2018-07-19 Thread Huang Ying
As suggested by Matthew Wilcox, it is better to use "int entry_size"
instead of "bool cluster" as the parameter that specifies whether to
operate on huge or normal swap entries, because this improves the
flexibility to support other swap entry sizes.  And Dave Hansen thinks
that this improves code readability too.

So in this patch, the "bool cluster" parameter of get_swap_pages() is
replaced by "int entry_size".

And the swap_entry_size() trick is used to reduce the binary size when
!CONFIG_TRANSPARENT_HUGEPAGE.

               text    data    bss      dec     hex filename
base          24215    2028    340    26583    67d7 mm/swapfile.o
head          24123    2004    340    26467    6763 mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Cc: Daniel Jordan 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 include/linux/swap.h |  2 +-
 mm/swap_slots.c  |  8 
 mm/swapfile.c| 16 
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index e20c240d6c65..34de0d8bf4fa 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -443,7 +443,7 @@ extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(struct page *page);
 extern void put_swap_page(struct page *page, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 008ccb22fee6..63a7b4563a57 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache 
*cache)
 
cache->cur = 0;
if (swap_slot_cache_active)
-   cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
-  cache->slots);
+   cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+  cache->slots, 1);
 
return cache->nr;
 }
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)
 
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
-   get_swap_pages(1, true, &entry);
+   get_swap_pages(1, &entry, HPAGE_PMD_NR);
goto out;
}
 
@@ -350,7 +350,7 @@ swp_entry_t get_swap_page(struct page *page)
goto out;
}
 
-   get_swap_pages(1, false, &entry);
+   get_swap_pages(1, &entry, 1);
 out:
if (mem_cgroup_try_charge_swap(page, entry)) {
put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 407f97287ab3..318ceb527c78 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -943,18 +943,18 @@ static unsigned long scan_swap_map(struct 
swap_info_struct *si,
 
 }
 
-int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 {
-   unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
+   unsigned long size = swap_entry_size(entry_size);
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
int node;
 
/* Only single cluster request supported */
-   WARN_ON_ONCE(n_goal > 1 && cluster);
+   WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 
-   avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
+   avail_pgs = atomic_long_read(&nr_swap_pages) / size;
if (avail_pgs <= 0)
goto noswap;
 
@@ -964,7 +964,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t 
swp_entries[])
if (n_goal > avail_pgs)
n_goal = avail_pgs;
 
-   atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
+   atomic_long_sub(n_goal * size, &nr_swap_pages);
 
spin_lock(&swap_avail_lock);
 
@@ -991,14 +991,14 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t 
swp_entries[])
spin_unlock(&si->lock);
goto nextsi;
}
-   if (cluster) {
+   if (size == SWAPFILE_CLUSTER) {
if (!(si->flags & SWP_FILE))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
-   if (n_ret || cluster)
+ 

Re: [PATCH v2 2/7] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-18 Thread Huang, Ying
Dave Hansen  writes:

> On 07/17/2018 08:25 PM, Huang, Ying wrote:
>>> Seriously, though, does it hurt us to add a comment or two to say
>>> something like:
>>>
>>> /*
>>>  * Should not even be attempting cluster allocations when
>>>  * huge page swap is disabled.  Warn and fail the allocation.
>>>  */
>>> if (!IS_ENABLED(CONFIG_THP_SWAP)) {
>>> VM_WARN_ON_ONCE(1);
>>> return 0;
>>> }
>> I totally agree with you that we should add more comments for THP swap
>> to improve the code readability.  As for this specific case,
>> VM_WARN_ON_ONCE() here is just to capture some programming error during
>> development.  Do we really need comments here?
>
> If it's code in mainline, we need to know what it is doing.
>
> If it's not useful to have in mainline, then let's remove it.

OK.  Will add the comments.

Best Regards,
Huang, Ying


Re: [PATCH v2 2/7] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-17 Thread Huang, Ying
Dave Hansen  writes:

>> @@ -878,6 +877,11 @@ static int swap_alloc_cluster(struct swap_info_struct 
>> *si, swp_entry_t *slot)
>>  unsigned long offset, i;
>>  unsigned char *map;
>>  
>> +if (!IS_ENABLED(CONFIG_THP_SWAP)) {
>> +VM_WARN_ON_ONCE(1);
>> +return 0;
>> +}
>
> I see you seized the opportunity to keep this code gloriously
> unencumbered by pesky comments.  This seems like a time when you might
> have slipped up and been temped to add a comment or two.  Guess not. :)
>
> Seriously, though, does it hurt us to add a comment or two to say
> something like:
>
>   /*
>* Should not even be attempting cluster allocations when
>* huge page swap is disabled.  Warn and fail the allocation.
>*/
>   if (!IS_ENABLED(CONFIG_THP_SWAP)) {
>   VM_WARN_ON_ONCE(1);
>   return 0;
>   }

I totally agree with you that we should add more comments for THP swap
to improve the code readability.  As for this specific case,
VM_WARN_ON_ONCE() here is just there to catch programming errors during
development.  Do we really need a comment here?

I will try to add more comments in other places in the code regardless
of this one.

Best Regards,
Huang, Ying


Re: [PATCH v2 1/7] swap: Add comments to lock_cluster_or_swap_info()

2018-07-17 Thread Huang, Ying
Dave Hansen  writes:

> On 07/16/2018 05:55 PM, Huang, Ying wrote:
>> +/*
>> + * For non-HDD swap devices, the fine grained cluster lock is used to
>> + * protect si->swap_map.  But cluster and cluster locks isn't
>> + * available for HDD, so coarse grained si->lock will be used instead
>> + * for that.
>> + */
>>  static inline struct swap_cluster_info *lock_cluster_or_swap_info(
>>  struct swap_info_struct *si,
>>  unsigned long offset)
>
> This nomenclature is not consistent with the rest of the file.  We call
> a "non-HDD" device an "ssd" absolutely everywhere else in the file.  Why
> are you calling it a non-HDD here?  (fwiw, HDD _barely_ hits my acronym
> cache anyway).
>
> How about this?
>
> /*
>  * Determine the locking method in use for this device.  Return
>  * swap_cluster_info if SSD-style cluster-based locking is in place.
>  */
> static inline struct swap_cluster_info *lock_cluster_or_swap_info(
> struct swap_info_struct *si,
> unsigned long offset)
> {
> struct swap_cluster_info *ci;
>
>   /* Try to use fine-grained SSD-style locking if available: */
> ci = lock_cluster(si, offset);
>
>   /* Otherwise, fall back to traditional, coarse locking: */
> if (!ci)
> spin_lock(&si->lock);
>
> return ci;
> }

This is better than my one, will use this.  Thanks!

> Which reminds me?  Why do we even bother having two locking models?

Because si->cluster_info is NULL for non-SSD devices, we cannot use the
cluster lock there.

As for why struct swap_cluster_info isn't used for non-SSD: per my
understanding, struct swap_cluster_info is optimized for SSD.  In
particular, it assumes seeking is cheap, so different free swap slot
scanning policies are used for SSD and non-SSD.
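
For reference, a simplified sketch (not the exact kernel code) of why
the fallback exists: lock_cluster() can only take the per-cluster lock
when si->cluster_info has been allocated, which is done for SSDs only,
so it returns NULL for HDDs and lock_cluster_or_swap_info() falls back
to si->lock:

	static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						      unsigned long offset)
	{
		struct swap_cluster_info *ci = si->cluster_info;	/* NULL on HDD */

		if (ci) {
			ci += offset / SWAPFILE_CLUSTER;
			spin_lock(&ci->lock);
		}
		return ci;	/* NULL => caller falls back to si->lock */
	}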

Best Regards,
Huang, Ying


Re: [PATCH v2 0/7] swap: THP optimizing refactoring

2018-07-17 Thread Huang, Ying
Daniel Jordan  writes:

> On Tue, Jul 17, 2018 at 08:55:49AM +0800, Huang, Ying wrote:
>> This patchset is based on 2018-07-13 head of mmotm tree.
>
> Looks good.
>
> Still think patch 7 would be easier to review if split into two logical
> changes.  Either way, though.
>
> For the series,
> Reviewed-by: Daniel Jordan 

Thanks a lot for your review!

Best Regards,
Huang, Ying


Re: [PATCH v2 7/7] swap, put_swap_page: Share more between huge/normal code path

2018-07-17 Thread Huang, Ying
Dave Hansen  writes:

> On 07/16/2018 05:55 PM, Huang, Ying wrote:
>>                text    data    bss      dec     hex filename
>> base:         24215    2028    340    26583    67d7 mm/swapfile.o
>> unified:      24577    2028    340    26945    6941 mm/swapfile.o
>
> That's a bit more than I'd expect looking at the rest of the diff.  Make
> me wonder if we missed an #ifdef somewhere or the compiler is getting
> otherwise confused.
>
> Might be worth a 10-minute look at the disassembly.

Digging one step deeper via 'size -A mm/swapfile.o' and diffing base
against unified gives:

--- b.s 2018-07-18 09:42:07.872501680 +0800
+++ h.s 2018-07-18 09:50:37.984499168 +0800
@@ -1,6 +1,6 @@
 mm/swapfile.o  :
 section                          size   addr
-.text                           17815      0
+.text                           17927      0
 .data                            1288      0
 .bss                              340      0
 ___ksymtab_gpl+nr_swap_pages        8      0
@@ -26,8 +26,8 @@
 .data.once                          1      0
 .comment                           35      0
 .note.GNU-stack                     0      0
-.orc_unwind_ip                   1380      0
-.orc_unwind                      2070      0
-Total                           26810
+.orc_unwind_ip                   1480      0
+.orc_unwind                      2220      0
+Total                           27172

The total difference is the same: 27172 - 26810 = 362 = 24577 - 24215.

The text section difference is small: 17927 - 17815 = 112.  The
additional size change comes from the unwinder information: (1480 + 2220) -
(1380 + 2070) = 250.  If the frame pointer unwinder is chosen, this costs
nothing, but if the ORC unwinder is chosen, this is the real difference.

For the 112-byte text section difference, use 'objdump -t' to get the
symbol sizes and compare:

--- b.od2018-07-18 10:45:05.768483075 +0800
+++ h.od2018-07-18 10:44:39.556483204 +0800
@@ -30,9 +30,9 @@
 00a3 cluster_list_add_tail
 001e __kunmap_atomic.isra.34
 018c swap_count_continued
-00ac __swap_entry_free
 000f put_swap_device.isra.35
 00b4 inc_cluster_info_page
+006f __swap_entry_free_locked
 004a _enable_swap_info
 0046 wait_on_page_writeback
 002e inode_to_bdi
@@ -53,8 +53,8 @@
 0012 __x64_sys_swapon
 0011 __ia32_sys_swapon
 007a get_swap_device
-0032 swap_free
-0035 put_swap_page
+006e swap_free
+0078 put_swap_page
 0267 swapcache_free_entries
 0058 page_swapcount
 003a __swap_count
@@ -64,7 +64,7 @@
 011a try_to_free_swap
 01fb get_swap_pages
 0098 get_swap_page_of_type
-01b8 free_swap_and_cache
+01e6 free_swap_and_cache
 0543 try_to_unuse
 000e __x64_sys_swapoff
 000d __ia32_sys_swapoff

The size change of put_swap_page() itself is small: 0x78 - 0x35 = 67.
But __swap_entry_free() is inlined by the compiler, which causes some
code bloat.

Best Regards,
Huang, Ying


[PATCH v2 5/7] swap: Unify normal/huge code path in put_swap_page()

2018-07-16 Thread Huang, Ying
In this patch, the normal/huge code paths in put_swap_page() and
several helper functions are unified to avoid duplicated code, bugs,
etc. and make the code easier to review.

More lines are removed than added.  And the binary size is kept exactly
the same when CONFIG_TRANSPARENT_HUGEPAGE=n.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 83 ++-
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index a6d8b8117bc5..622edc47b67a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -205,8 +205,16 @@ static void discard_swap_cluster(struct swap_info_struct 
*si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER   HPAGE_PMD_NR
+
+#define nr_swap_entries(nr)(nr)
 #else
 #define SWAPFILE_CLUSTER   256
+
+/*
+ * Define nr_swap_entries() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define nr_swap_entries(nr)1
 #endif
 #define LATENCY_LIMIT  256
 
@@ -1249,18 +1257,7 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-static void swapcache_free(swp_entry_t entry)
-{
-   struct swap_info_struct *p;
-
-   p = _swap_info_get(entry);
-   if (p) {
-   if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
-   }
-}
-
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1269,39 +1266,41 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
-
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return;
+   int nr = nr_swap_entries(hpage_nr_pages(page));
 
si = _swap_info_get(entry);
if (!si)
return;
 
-   ci = lock_cluster(si, offset);
-   VM_BUG_ON(!cluster_is_huge(ci));
-   map = si->swap_map + offset;
-   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   val = map[i];
-   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-   if (val == SWAP_HAS_CACHE)
-   free_entries++;
-   }
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   if (free_entries == SWAPFILE_CLUSTER) {
-   spin_lock(&si->lock);
+   if (nr == SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
-   memset(map, 0, SWAPFILE_CLUSTER);
+   VM_BUG_ON(!cluster_is_huge(ci));
+   map = si->swap_map + offset;
+   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+   val = map[i];
+   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+   if (val == SWAP_HAS_CACHE)
+   free_entries++;
+   }
+   if (!free_entries) {
+   for (i = 0; i < SWAPFILE_CLUSTER; i++)
+   map[i] &= ~SWAP_HAS_CACHE;
+   }
+   cluster_clear_huge(ci);
unlock_cluster(ci);
-   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-   swap_free_cluster(si, idx);
-   spin_unlock(&si->lock);
-   } else if (free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+   if (free_entries == SWAPFILE_CLUSTER) {
+   spin_lock(&si->lock);
+   ci = lock_cluster(si, offset);
+   memset(map, 0, SWAPFILE_CLUSTER);
+   unlock_cluster(ci);
+   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+   swap_free_cluster(si, idx);
+   spin_unlock(&si->lock);
+   return;
+   }
+   }
+   if (nr == 1 || free_entries) {
+   for (i = 0; i < nr; i++, entry.val++) {
if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
free_swap_slot(entry);
}
@@ -1325,14 +1324,6 @@ int split_swap_cluster(swp_entry_t entry)
 }
 #endif
 
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
-   if (!PageTransHuge(page))
-   swapcache_free(entry);
-   else
-   swapcache_free_cluster(entry);
-}
-
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
const swp_entry_t *e1 = ent1, *e2 = ent2;
-- 
2.16.4



[PATCH v2 6/7] swap: Add __swap_entry_free_locked()

2018-07-16 Thread Huang, Ying
The part of __swap_entry_free() that runs with the lock held is
separated into a new function, __swap_entry_free_locked(), because we
want to reuse that piece of code in some other places.

This is just mechanical code refactoring; there is no functional change
in this function.

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 622edc47b67a..fec28f6c05b0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1180,16 +1180,13 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
return NULL;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-  swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
 {
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
 
-   ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
 
has_cache = count & SWAP_HAS_CACHE;
@@ -1217,6 +1214,17 @@ static unsigned char __swap_entry_free(struct 
swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
 
+   return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+  swp_entry_t entry, unsigned char usage)
+{
+   struct swap_cluster_info *ci;
+   unsigned long offset = swp_offset(entry);
+
+   ci = lock_cluster_or_swap_info(p, offset);
+   usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
 
return usage;
-- 
2.16.4



[PATCH v2 4/7] swap: Unify normal/huge code path in swap_page_trans_huge_swapped()

2018-07-16 Thread Huang, Ying
As suggested by Dave, we should unify the code paths for normal and
huge swap support where possible, to avoid duplicated code, bugs, etc.
and make the code easier to review.

In this patch, the normal/huge code path in swap_page_trans_huge_swapped()
is unified; the number of added and removed lines is the same.  And the
binary size is kept almost the same when CONFIG_TRANSPARENT_HUGEPAGE=n.

            text    data    bss      dec     hex filename
base:      24179    2028    340    26547    67b3 mm/swapfile.o
unified:   24215    2028    340    26583    67d7 mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 92c24402706c..a6d8b8117bc5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -270,7 +270,10 @@ static inline void cluster_set_null(struct 
swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-   return info->flags & CLUSTER_FLAG_HUGE;
+   if (IS_ENABLED(CONFIG_THP_SWAP))
+   return info->flags & CLUSTER_FLAG_HUGE;
+   else
+   return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -1489,9 +1492,6 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return swap_swapcount(si, entry) != 0;
-
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (swap_count(map[roffset]))
-- 
2.16.4



[PATCH v2 0/7] swap: THP optimizing refactoring

2018-07-16 Thread Huang, Ying
This patchset is based on 2018-07-13 head of mmotm tree.

Now the THP (Transparent Huge Page) swap optimization is implemented
in the way shown below,

#ifdef CONFIG_THP_SWAP
huge_function(...)
{
}
#else
normal_function(...)
{
}
#endif

general_function(...)
{
if (huge)
return huge_function(...);
else
return normal_function(...);
}

As pointed out by Dave Hansen, this:

1. Creates a new, wholly untested code path for huge pages
2. Creates two places to patch bugs
3. Does not reuse code when possible

This patchset addresses these problems by merging the huge/normal
code paths/functions where possible.

One concern is that this may increase the code size when
!CONFIG_TRANSPARENT_HUGEPAGE.  The data shows that most of the
refactoring causes only a slight code size increase.

Best Regards,
Huang, Ying


[PATCH v2 7/7] swap, put_swap_page: Share more between huge/normal code path

2018-07-16 Thread Huang, Ying
In this patch, locking related code is shared between the huge/normal
code paths in put_swap_page() to reduce code duplication.  And the
`free_entries == 0` case is merged into the more general `free_entries
!= SWAPFILE_CLUSTER` case, because the new locking method makes that
easy.

The number of added lines is the same as the number of removed lines.
But the code size is increased when CONFIG_TRANSPARENT_HUGEPAGE=n.

            text    data    bss      dec     hex filename
base:      24215    2028    340    26583    67d7 mm/swapfile.o
unified:   24577    2028    340    26945    6941 mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index fec28f6c05b0..cd75f449896b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1280,8 +1280,8 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (!si)
return;
 
+   ci = lock_cluster_or_swap_info(si, offset);
if (nr == SWAPFILE_CLUSTER) {
-   ci = lock_cluster(si, offset);
VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
@@ -1290,13 +1290,9 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (val == SWAP_HAS_CACHE)
free_entries++;
}
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
cluster_clear_huge(ci);
-   unlock_cluster(ci);
if (free_entries == SWAPFILE_CLUSTER) {
+   unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
ci = lock_cluster(si, offset);
memset(map, 0, SWAPFILE_CLUSTER);
@@ -1307,12 +1303,16 @@ void put_swap_page(struct page *page, swp_entry_t entry)
return;
}
}
-   if (nr == 1 || free_entries) {
-   for (i = 0; i < nr; i++, entry.val++) {
-   if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
+   for (i = 0; i < nr; i++, entry.val++) {
+   if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+   unlock_cluster_or_swap_info(si, ci);
+   free_swap_slot(entry);
+   if (i == nr - 1)
+   return;
+   lock_cluster_or_swap_info(si, offset);
}
}
+   unlock_cluster_or_swap_info(si, ci);
 }
 
 #ifdef CONFIG_THP_SWAP
-- 
2.16.4



[PATCH v2 2/7] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-16 Thread Huang, Ying
In mm/swapfile.c, the THP (Transparent Huge Page) swap specific code is
enclosed by #ifdef CONFIG_THP_SWAP/#endif to avoid bloating the code when
THP isn't enabled.  But #ifdef/#endif in .c files hurts code readability,
so Dave suggested using IS_ENABLED(CONFIG_THP_SWAP) instead and letting
the compiler do the dirty job for us.  This also has the potential to
remove some duplicated code.  From the output of `size`,

              text   data  bss    dec   hex  filename
THP=y:       26269   2076  340  28685  700d  mm/swapfile.o
ifdef/endif: 24115   2028  340  26483  6773  mm/swapfile.o
IS_ENABLED:  24179   2028  340  26547  67b3  mm/swapfile.o

The IS_ENABLED() based solution works quite well, almost as well as the
#ifdef/#endif one.  And from the diffstat, more lines are removed than
added.

One #ifdef for split_swap_cluster() is kept, because it is a public
function with a stub implementation for CONFIG_THP_SWAP=n in swap.h.
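
For reference, the swap.h arrangement referred to here is roughly the
following (simplified sketch, not part of this patch):

/* include/linux/swap.h, simplified */
#ifdef CONFIG_THP_SWAP
extern int split_swap_cluster(swp_entry_t entry);
#else
static inline int split_swap_cluster(swp_entry_t entry)
{
	return 0;
}
#endif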

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 56 
 1 file changed, 16 insertions(+), 40 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0a2a9643dd78..dd9263411f11 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -870,7 +870,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
return n_ret;
 }
 
-#ifdef CONFIG_THP_SWAP
 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
unsigned long idx;
@@ -878,6 +877,11 @@ static int swap_alloc_cluster(struct swap_info_struct *si, 
swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+   VM_WARN_ON_ONCE(1);
+   return 0;
+   }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
 
@@ -908,13 +912,6 @@ static void swap_free_cluster(struct swap_info_struct *si, 
unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-   VM_WARN_ON_ONCE(1);
-   return 0;
-}
-#endif /* CONFIG_THP_SWAP */
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
   unsigned char usage)
@@ -1260,7 +1257,6 @@ static void swapcache_free(swp_entry_t entry)
}
 }
 
-#ifdef CONFIG_THP_SWAP
 static void swapcache_free_cluster(swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
@@ -1271,6 +1267,9 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned int i, free_entries = 0;
unsigned char val;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return;
+
si = _swap_info_get(entry);
if (!si)
return;
@@ -1306,6 +1305,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
}
 }
 
+#ifdef CONFIG_THP_SWAP
 int split_swap_cluster(swp_entry_t entry)
 {
struct swap_info_struct *si;
@@ -1320,11 +1320,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
+#endif
 
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1483,7 +1479,6 @@ int swp_swapcount(swp_entry_t entry)
return count;
 }
 
-#ifdef CONFIG_THP_SWAP
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 swp_entry_t entry)
 {
@@ -1494,6 +1489,9 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return swap_swapcount(si, entry) != 0;
+
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (map[roffset] != SWAP_HAS_CACHE)
@@ -1516,7 +1514,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
 
-   if (likely(!PageTransCompound(page)))
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
 
page = compound_head(page);
@@ -1540,10 +1538,8 @@ static int page_trans_huge_map_swapcount(struct page 
*page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
 
-   if (likely(!PageTransCompound(page))) {
-   mapcount = atomic_read(&page->_mapcount) + 1;
-   if (total_mapcount)
-   *total_mapcount = mapcount;
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+   m

[PATCH v2 3/7] swap: Use swap_count() in swap_page_trans_huge_swapped()

2018-07-16 Thread Huang, Ying
In swap_page_trans_huge_swapped(), "si->swap_map[i] != SWAP_HAS_CACHE"
is used to identify whether there's any page table mapping for a 4k
sized swap entry.  This works correctly now, because all users of the
function only call it after checking SWAP_HAS_CACHE.  But as pointed
out by Daniel, it is better to use "swap_count(map[i])" here, because
it works for the "map[i] == 0" case too.

And this makes the implementation more consistent between normal and
huge swap entries.
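
For reference, swap_count() simply masks off the SWAP_HAS_CACHE bit
(simplified sketch of the existing helper in mm/swapfile.c), so it is
zero both for a free slot (map[i] == 0) and for a cache-only slot
(map[i] == SWAP_HAS_CACHE):

static inline unsigned char swap_count(unsigned char ent)
{
	/* drop the swap cache flag, keep the mapping count */
	return ent & ~SWAP_HAS_CACHE;
}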

Signed-off-by: "Huang, Ying" 
Suggested-by: Daniel Jordan 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index dd9263411f11..92c24402706c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1494,12 +1494,12 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
 
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
-   if (map[roffset] != SWAP_HAS_CACHE)
+   if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   if (map[offset + i] != SWAP_HAS_CACHE) {
+   if (swap_count(map[offset + i])) {
ret = true;
break;
}
-- 
2.16.4



[PATCH v2 1/7] swap: Add comments to lock_cluster_or_swap_info()

2018-07-16 Thread Huang, Ying
To improve the code readability.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d8fddfb000ec..0a2a9643dd78 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -297,6 +297,12 @@ static inline void unlock_cluster(struct swap_cluster_info 
*ci)
spin_unlock(&ci->lock);
 }
 
+/*
+ * For non-HDD swap devices, the fine grained cluster lock is used to
+ * protect si->swap_map.  But clusters and the cluster locks aren't
+ * available for HDD, so the coarse grained si->lock is used instead
+ * for that.
+ */
 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
struct swap_info_struct *si,
unsigned long offset)
-- 
2.16.4



Re: [PATCH 6/6] swap, put_swap_page: Share more between huge/normal code path

2018-07-14 Thread Huang, Ying
Daniel Jordan  writes:

> On Fri, Jul 13, 2018 at 07:36:36AM +0800, Huang, Ying wrote:
>> From: Huang Ying 
>> 
>> In this patch, locking related code is shared between huge/normal code
>> path in put_swap_page() to reduce code duplication.  And `free_entries
>> == 0` case is merged into more general `free_entries !=
>> SWAPFILE_CLUSTER` case, because the new locking method makes it easy.
>
> Might be a bit easier to think about the two changes if they were split up.

I just think the second change appears too trivial to be a separate patch.

> Agree with Dave's comment from patch 1, but otherwise the series looks ok to
> me.  I like the nr_swap_entries macro, that's clever.

Thanks!

Best Regards,
Huang, Ying


Re: [PATCH 3/6] swap: Unify normal/huge code path in swap_page_trans_huge_swapped()

2018-07-14 Thread Huang, Ying
Daniel Jordan  writes:

> On Fri, Jul 13, 2018 at 07:36:33AM +0800, Huang, Ying wrote:
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 75c84aa763a3..160f78072667 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -270,7 +270,10 @@ static inline void cluster_set_null(struct 
>> swap_cluster_info *info)
>>  
>>  static inline bool cluster_is_huge(struct swap_cluster_info *info)
>>  {
>> -return info->flags & CLUSTER_FLAG_HUGE;
>> +if (IS_ENABLED(CONFIG_THP_SWAP))
>> +return info->flags & CLUSTER_FLAG_HUGE;
>> +else
>> +return false;
>>  }
>>  
>>  static inline void cluster_clear_huge(struct swap_cluster_info *info)
>> @@ -1489,9 +1492,6 @@ static bool swap_page_trans_huge_swapped(struct 
>> swap_info_struct *si,
>>  int i;
>>  bool ret = false;
>>  
>> -if (!IS_ENABLED(CONFIG_THP_SWAP))
>> -return swap_swapcount(si, entry) != 0;
>
> This tests the value returned from swap_count,
>
>> -
>>  ci = lock_cluster_or_swap_info(si, offset);
>>  if (!ci || !cluster_is_huge(ci)) {
>>  if (map[roffset] != SWAP_HAS_CACHE)
>
> and now we're testing
>
> map[roffset] != SWAP_HAS_CACHE
>
> instead.  The two seem to mean the same thing here, since the swap slot hasn't
> been freed to the global pool and so can't be 0, but it might be better for
> consistency and clarity to use swap_count here, and a few lines down too
>
> for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>         if (map[offset + i] != SWAP_HAS_CACHE) {
>
> since swap_count seems to be used everywhere else for this.

Yes.  swap_count() looks better here.  Will change this.

Best Regards,
Huang, Ying


Re: [PATCH 1/6] swap: Add comments to lock_cluster_or_swap_info()

2018-07-13 Thread Huang, Ying
Dave Hansen  writes:

>> +/*
>> + * At most times, fine grained cluster lock is sufficient to protect
>
> Can we call out those times, please?

To protect si->swap_map[], si->lock is used for HDD, otherwise the
cluster lock is used.  "at most times" is ambiguous here; I will fix it.

>> + * the operations on sis->swap_map.  
>
> Please be careful with the naming.  You can call it 'si' because that's
> what the function argument is named.  Or, swap_info_struct because
> that's the struct name.  Calling it 'sis' is a bit sloppy, no?
>
>>  No need to acquire gross grained
>
> "coarse" is a conventional antonym for "fine".

Sorry for my poor English, will change this.

>> + * sis->lock.  But cluster and cluster lock isn't available for HDD,
>> + * so sis->lock will be instead for them.
>> + */
>>  static inline struct swap_cluster_info *lock_cluster_or_swap_info(
>>  struct swap_info_struct *si,
>>  unsigned long offset)
>
> What I already knew was: there are two locks.  We use one sometimes and
> the other at other times.
>
> What I don't know is why there are two locks, and the heuristics why we
> choose between them.  This comment doesn't help explain the things I
> don't know.

The cluster lock is used to protect the fields of struct
swap_cluster_info and si->swap_map[]; this is described in the comments
of struct swap_cluster_info.  si->lock is used to protect the other
fields of si.  If both locks need to be held, hold si->lock first.  This
is for non-HDD.  For HDD, there are no clusters, so si->lock is used to
protect si->swap_map[].
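
In code form, the helper behaves roughly like the sketch below
(simplified paraphrase of the real function in mm/swapfile.c):

static inline struct swap_cluster_info *lock_cluster_or_swap_info(
	struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* lock_cluster() returns NULL when there is no cluster info (HDD) */
	ci = lock_cluster(si, offset);
	if (!ci)
		spin_lock(&si->lock);	/* fall back to the coarse grained lock */

	return ci;
}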

Best Regards,
Huang, Ying


[PATCH] mm, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-12 Thread Huang, Ying
From: Huang Ying 

CONFIG_THP_SWAP should depend on CONFIG_SWAP, because it's
unreasonable to optimize swapping for THP (Transparent Huge Page)
without basic swapping support.

In the original code, when CONFIG_SWAP=n and CONFIG_THP_SWAP=y,
split_swap_cluster() will not be built because it is in swapfile.c, but
it is called from huge_memory.c.  This doesn't trigger a build error in
practice because the call site is enclosed by PageSwapCache(), which is
defined as constant 0 when CONFIG_SWAP=n.  But this is fragile and
should be fixed.
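
The call site pattern that hides the problem looks roughly like this
(illustrative sketch of the huge_memory.c call site):

	if (PageSwapCache(page))	/* constant 0 when CONFIG_SWAP=n */
		split_swap_cluster(entry);
	/*
	 * The compiler eliminates the dead call, so the missing
	 * split_swap_cluster() definition never triggers a link error.
	 */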

The comments are also fixed to reflect the latest progress.

Fixes: 38d8b4e6bdc8 ("mm, THP, swap: delay splitting THP during swap out")
Signed-off-by: "Huang, Ying" 
Reviewed-by: Dan Williams 
Reviewed-by: Naoya Horiguchi 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/Kconfig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index b78e7cd4e9fe..97114c94239c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -419,10 +419,11 @@ config ARCH_WANTS_THP_SWAP
 
 config THP_SWAP
def_bool y
-   depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
+   depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
help
  Swap transparent huge pages in one piece, without splitting.
- XXX: For now this only does clustered swap space allocation.
+ XXX: For now, swap cluster backing transparent huge page
+ will be split after swapout.
 
  For selection by architectures with reasonable THP sizes.
 
-- 
2.16.4



[PATCH 5/6] swap: Add __swap_entry_free_locked()

2018-07-12 Thread Huang, Ying
From: Huang Ying 

The part of __swap_entry_free() that runs with the lock held is
separated into a new function, __swap_entry_free_locked(), because we
want to reuse that piece of code in some other places.

This is just mechanical code refactoring; there is no functional change
in this function.

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index e0df8d22ac92..bc488bf36c86 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1180,16 +1180,13 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
return NULL;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-  swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
 {
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
 
-   ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
 
has_cache = count & SWAP_HAS_CACHE;
@@ -1217,6 +1214,17 @@ static unsigned char __swap_entry_free(struct 
swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
 
+   return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+  swp_entry_t entry, unsigned char usage)
+{
+   struct swap_cluster_info *ci;
+   unsigned long offset = swp_offset(entry);
+
+   ci = lock_cluster_or_swap_info(p, offset);
+   usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
 
return usage;
-- 
2.16.4



[PATCH 2/6] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-12 Thread Huang, Ying
From: Huang Ying 

In mm/swapfile.c, the THP (Transparent Huge Page) swap specific code is
enclosed by #ifdef CONFIG_THP_SWAP/#endif to avoid bloating the code when
THP isn't enabled.  But #ifdef/#endif in .c files hurts code readability,
so Dave suggested using IS_ENABLED(CONFIG_THP_SWAP) instead and letting
the compiler do the dirty job for us.  This also has the potential to
remove some duplicated code.  From the output of `size`,

              text   data  bss    dec   hex  filename
THP=y:       26269   2076  340  28685  700d  mm/swapfile.o
ifdef/endif: 24115   2028  340  26483  6773  mm/swapfile.o
IS_ENABLED:  24179   2028  340  26547  67b3  mm/swapfile.o

The IS_ENABLED() based solution works quite well, almost as well as the
#ifdef/#endif one.  And from the diffstat, more lines are removed than
added.

One #ifdef for split_swap_cluster() is kept, because it is a public
function with a stub implementation for CONFIG_THP_SWAP=n in swap.h.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 56 
 1 file changed, 16 insertions(+), 40 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index e31aa601d9c0..75c84aa763a3 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -870,7 +870,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
return n_ret;
 }
 
-#ifdef CONFIG_THP_SWAP
 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
unsigned long idx;
@@ -878,6 +877,11 @@ static int swap_alloc_cluster(struct swap_info_struct *si, 
swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+   VM_WARN_ON_ONCE(1);
+   return 0;
+   }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
 
@@ -908,13 +912,6 @@ static void swap_free_cluster(struct swap_info_struct *si, 
unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-   VM_WARN_ON_ONCE(1);
-   return 0;
-}
-#endif /* CONFIG_THP_SWAP */
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
   unsigned char usage)
@@ -1260,7 +1257,6 @@ static void swapcache_free(swp_entry_t entry)
}
 }
 
-#ifdef CONFIG_THP_SWAP
 static void swapcache_free_cluster(swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
@@ -1271,6 +1267,9 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned int i, free_entries = 0;
unsigned char val;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return;
+
si = _swap_info_get(entry);
if (!si)
return;
@@ -1306,6 +1305,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
}
 }
 
+#ifdef CONFIG_THP_SWAP
 int split_swap_cluster(swp_entry_t entry)
 {
struct swap_info_struct *si;
@@ -1320,11 +1320,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
+#endif
 
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1483,7 +1479,6 @@ int swp_swapcount(swp_entry_t entry)
return count;
 }
 
-#ifdef CONFIG_THP_SWAP
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 swp_entry_t entry)
 {
@@ -1494,6 +1489,9 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return swap_swapcount(si, entry) != 0;
+
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (map[roffset] != SWAP_HAS_CACHE)
@@ -1516,7 +1514,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
 
-   if (likely(!PageTransCompound(page)))
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
 
page = compound_head(page);
@@ -1540,10 +1538,8 @@ static int page_trans_huge_map_swapcount(struct page 
*page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
 
-   if (likely(!PageTransCompound(page))) {
-   mapcount = atomic_read(&page->_mapcount) + 1;
-   if (total_mapcount)
-   *total_mapcount = mapcount;
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(

[PATCH 1/6] swap: Add comments to lock_cluster_or_swap_info()

2018-07-12 Thread Huang, Ying
From: Huang Ying 

To improve the code readability.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d8fddfb000ec..e31aa601d9c0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -297,6 +297,12 @@ static inline void unlock_cluster(struct swap_cluster_info 
*ci)
spin_unlock(&ci->lock);
 }
 
+/*
+ * At most times, fine grained cluster lock is sufficient to protect
+ * the operations on sis->swap_map.  No need to acquire gross grained
+ * sis->lock.  But cluster and cluster lock isn't available for HDD,
+ * so sis->lock will be instead for them.
+ */
 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
struct swap_info_struct *si,
unsigned long offset)
-- 
2.16.4



[PATCH 3/6] swap: Unify normal/huge code path in swap_page_trans_huge_swapped()

2018-07-12 Thread Huang, Ying
From: Huang Ying 

As suggested by Dave, we should unify the code paths for normal and
huge swap support if possible, to avoid duplicated code, bugs, etc., and
to make the code easier to review.

In this patch, the normal/huge code path in swap_page_trans_huge_swapped()
is unified; the number of added lines equals the number of removed lines.
And the binary size is kept almost the same when
CONFIG_TRANSPARENT_HUGEPAGE=n.

            text   data  bss    dec   hex  filename
base:      24179   2028  340  26547  67b3  mm/swapfile.o
unified:   24215   2028  340  26583  67d7  mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 75c84aa763a3..160f78072667 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -270,7 +270,10 @@ static inline void cluster_set_null(struct 
swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-   return info->flags & CLUSTER_FLAG_HUGE;
+   if (IS_ENABLED(CONFIG_THP_SWAP))
+   return info->flags & CLUSTER_FLAG_HUGE;
+   else
+   return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -1489,9 +1492,6 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return swap_swapcount(si, entry) != 0;
-
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (map[roffset] != SWAP_HAS_CACHE)
-- 
2.16.4



[PATCH 4/6] swap: Unify normal/huge code path in put_swap_page()

2018-07-12 Thread Huang, Ying
From: Huang Ying 

In this patch, the normal/huge code paths in put_swap_page() and
several helper functions are unified to avoid duplicated code, bugs,
etc., and to make the code easier to review.

More lines are removed than added.  And the binary size is kept exactly
the same when CONFIG_TRANSPARENT_HUGEPAGE=n.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 83 ++-
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 160f78072667..e0df8d22ac92 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -205,8 +205,16 @@ static void discard_swap_cluster(struct swap_info_struct 
*si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER   HPAGE_PMD_NR
+
+#define nr_swap_entries(nr)(nr)
 #else
 #define SWAPFILE_CLUSTER   256
+
+/*
+ * Define nr_swap_entries() as a constant to let the compiler optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define nr_swap_entries(nr)1
 #endif
 #define LATENCY_LIMIT  256
 
@@ -1249,18 +1257,7 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-static void swapcache_free(swp_entry_t entry)
-{
-   struct swap_info_struct *p;
-
-   p = _swap_info_get(entry);
-   if (p) {
-   if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
-   }
-}
-
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1269,39 +1266,41 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
-
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return;
+   int nr = nr_swap_entries(hpage_nr_pages(page));
 
si = _swap_info_get(entry);
if (!si)
return;
 
-   ci = lock_cluster(si, offset);
-   VM_BUG_ON(!cluster_is_huge(ci));
-   map = si->swap_map + offset;
-   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   val = map[i];
-   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-   if (val == SWAP_HAS_CACHE)
-   free_entries++;
-   }
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   if (free_entries == SWAPFILE_CLUSTER) {
-   spin_lock(&si->lock);
+   if (nr == SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
-   memset(map, 0, SWAPFILE_CLUSTER);
+   VM_BUG_ON(!cluster_is_huge(ci));
+   map = si->swap_map + offset;
+   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+   val = map[i];
+   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+   if (val == SWAP_HAS_CACHE)
+   free_entries++;
+   }
+   if (!free_entries) {
+   for (i = 0; i < SWAPFILE_CLUSTER; i++)
+   map[i] &= ~SWAP_HAS_CACHE;
+   }
+   cluster_clear_huge(ci);
unlock_cluster(ci);
-   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-   swap_free_cluster(si, idx);
-   spin_unlock(&si->lock);
-   } else if (free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+   if (free_entries == SWAPFILE_CLUSTER) {
+   spin_lock(&si->lock);
+   ci = lock_cluster(si, offset);
+   memset(map, 0, SWAPFILE_CLUSTER);
+   unlock_cluster(ci);
+   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+   swap_free_cluster(si, idx);
+   spin_unlock(&si->lock);
+   return;
+   }
+   }
+   if (nr == 1 || free_entries) {
+   for (i = 0; i < nr; i++, entry.val++) {
if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
free_swap_slot(entry);
}
@@ -1325,14 +1324,6 @@ int split_swap_cluster(swp_entry_t entry)
 }
 #endif
 
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
-   if (!PageTransHuge(page))
-   swapcache_free(entry);
-   else
-   swapcache_free_cluster(entry);
-}
-
 static int swp_entry_cmp(const voi

[PATCH 6/6] swap, put_swap_page: Share more between huge/normal code path

2018-07-12 Thread Huang, Ying
From: Huang Ying 

In this patch, the locking related code is shared between the huge and
normal code paths in put_swap_page() to reduce code duplication.  The
`free_entries == 0` case is merged into the more general `free_entries !=
SWAPFILE_CLUSTER` case, because the new locking method makes this easy.

The number of added lines equals the number of removed lines, but the
code size increases when CONFIG_TRANSPARENT_HUGEPAGE=n.

            text   data  bss    dec   hex  filename
base:      24215   2028  340  26583  67d7  mm/swapfile.o
unified:   24577   2028  340  26945  6941  mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index bc488bf36c86..17dce780b4c8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1280,8 +1280,8 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (!si)
return;
 
+   ci = lock_cluster_or_swap_info(si, offset);
if (nr == SWAPFILE_CLUSTER) {
-   ci = lock_cluster(si, offset);
VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
@@ -1290,13 +1290,9 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (val == SWAP_HAS_CACHE)
free_entries++;
}
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
cluster_clear_huge(ci);
-   unlock_cluster(ci);
if (free_entries == SWAPFILE_CLUSTER) {
+   unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
ci = lock_cluster(si, offset);
memset(map, 0, SWAPFILE_CLUSTER);
@@ -1307,12 +1303,16 @@ void put_swap_page(struct page *page, swp_entry_t entry)
return;
}
}
-   if (nr == 1 || free_entries) {
-   for (i = 0; i < nr; i++, entry.val++) {
-   if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
+   for (i = 0; i < nr; i++, entry.val++) {
+   if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+   unlock_cluster_or_swap_info(si, ci);
+   free_swap_slot(entry);
+   if (i == nr - 1)
+   return;
+   lock_cluster_or_swap_info(si, offset);
}
}
+   unlock_cluster_or_swap_info(si, ci);
 }
 
 #ifdef CONFIG_THP_SWAP
-- 
2.16.4



[PATCH 0/6] swap: THP optimizing refactoring

2018-07-12 Thread Huang, Ying
This patchset is based on 2018-07-10 head of mmotm tree.

Now the THP (Transparent Huge Page) swap optimization is implemented
in the way shown below,

#ifdef CONFIG_THP_SWAP
huge_function(...)
{
}
#else
normal_function(...)
{
}
#endif

general_function(...)
{
if (huge)
return thp_function(...);
else
return normal_function(...);
}

As pointed out by Dave Hansen, this

1. Creates a new, wholly untested code path for huge pages
2. Creates two places to patch bugs
3. Does not reuse code when possible

This patchset addresses these problems by merging the huge/normal
code paths/functions where possible.

One concern is that this may cause the code size to grow when
!CONFIG_TRANSPARENT_HUGEPAGE.  The data show that most of the
refactoring causes only a slight code size increase.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 05/21] mm, THP, swap: Support PMD swap mapping in free_swap_and_cache()/swap_free()

2018-07-10 Thread Huang, Ying
Dave Hansen  writes:

> On 07/10/2018 12:13 AM, Huang, Ying wrote:
>> Dave Hansen  writes:
>>> The code non-resuse was, and continues to be, IMNHO, one of the largest
>>> sources of bugs with the original THP implementation.  It might be
>>> infeasible to do here, but let's at least give it as much of a go as we can.
>> 
>> I totally agree that we should unify the code path for huge and normal
>> page/swap if possible.  One concern is code size for !CONFIG_THP_SWAP.
>
> I've honestly never heard that as an argument before.  In general, our
> .c files implement *full* functionality: the most complex case.  The
> headers #ifdef that functionality down because of our .config or
> architecture.
>
> The thing that matters here is debugging and reviewing the _complicated_
> case, IMNHO.

I agree with your point here.  I will try it and measure the code size
change too.

>> The original method is good for that.  The new method may introduce some
>> huge swap related code that is hard to be eliminated for
>> !CONFIG_THP_SWAP.  Andrew Morton pointed this out for the patchset of
>> the first step of the THP swap optimization.
>> 
>> This may be mitigated at least partly via,
>> 
>> `
>> #ifdef CONFIG_THP_SWAP
>> #define nr_swap_entries(nr)  (nr)
>> #else
>> #define nr_swap_entries(nr)  1
>> #endif
>> 
>> void do_something(swp_entry_t entry, int __nr_entries)
>> {
>> int i, nr_entries = nr_swap_entries(__nr_entries);
>> 
>> if (nr_entries = SWAPFILE_CLUSTER)
>> ; /* huge swap specific */
>> else
>> ; /* normal swap specific */
>> 
>> for (i = 0; i < nr_entries; i++) {
>> ; /* do something for each entry */
>> }
>> 
>> /* ... */
>> }
>> `
>
> While that isn't perfect, it's better than the current state of things.
>
> While you are refactoring things, I think you also need to take a good
> look at roughly chopping this series in half by finding another stopping
> point.  You've done a great job so far of trickling this functionality
> in so far, but 21 patches is quite a bit, and the set is only going to
> get larger.

Yes.  The patchset is too large.  I will try to reduce it if possible.
At least [21/21] can be separated, and [02/21] may be sent separately
too.  The other parts are harder; THP swapin and creating/supporting PMD
swap mappings need to be in one patchset.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 04/21] mm, THP, swap: Support PMD swap mapping in swapcache_free_cluster()

2018-07-10 Thread Huang, Ying
Dave Hansen  writes:

> On 07/09/2018 11:53 PM, Huang, Ying wrote:
>> Dave Hansen  writes:
>>>> +#ifdef CONFIG_THP_SWAP
>>>> +static inline int cluster_swapcount(struct swap_cluster_info *ci)
>>>> +{
>>>> +  if (!ci || !cluster_is_huge(ci))
>>>> +  return 0;
>>>> +
>>>> +  return cluster_count(ci) - SWAPFILE_CLUSTER;
>>>> +}
>>>> +#else
>>>> +#define cluster_swapcount(ci) 0
>>>> +#endif
>>>
>>> Dumb questions, round 2:  On a CONFIG_THP_SWAP=n build, presumably,
>>> cluster_is_huge()=0 always, so cluster_swapout() always returns 0.  Right?
>>>
>>> So, why the #ifdef?
>> 
>> #ifdef here is to reduce the code size for !CONFIG_THP_SWAP.
>
> I'd just remove the !CONFIG_THP_SWAP version entirely.

Sure, unless some build errors show up after other refactoring.

>>>> @@ -1288,24 +1301,30 @@ static void swapcache_free_cluster(swp_entry_t 
>>>> entry)
>>>>  
>>>>ci = lock_cluster(si, offset);
>>>>VM_BUG_ON(!cluster_is_huge(ci));
>>>> +  VM_BUG_ON(!is_cluster_offset(offset));
>>>> +  VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
>>>>map = si->swap_map + offset;
>>>> -  for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>>>> -  val = map[i];
>>>> -  VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>>>> -  if (val == SWAP_HAS_CACHE)
>>>> -  free_entries++;
>>>> +  if (!cluster_swapcount(ci)) {
>>>> +  for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>>>> +  val = map[i];
>>>> +  VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>>>> +  if (val == SWAP_HAS_CACHE)
>>>> +  free_entries++;
>>>> +  }
>>>> +  if (free_entries != SWAPFILE_CLUSTER)
>>>> +  cluster_clear_huge(ci);
>>>>}
>>>
>>> Also, I'll point out that cluster_swapcount() continues the horrific
>>> naming of cluster_couunt(), not saying what the count is *of*.  The
>>> return value doesn't help much:
>>>
>>> return cluster_count(ci) - SWAPFILE_CLUSTER;
>> 
>> We have page_swapcount() for page, swp_swapcount() for swap entry.
>> cluster_swapcount() tries to mimic them for swap cluster.  But I am not
>> good at naming in general.  What's your suggestion?
>
> I don't have a suggestion because I haven't the foggiest idea what it is
> doing. :)
>
> Is it the number of instantiated swap cache pages that are referring to
> this cluster?  Is it just huge pages?  Huge and small?  One refcount per
> huge page, or 512?

page_swapcount() and swp_swapcount() for a normal swap entry give the
number of PTE swap mappings to that normal swap entry.

cluster_swapcount() for a huge swap entry (or huge swap cluster) gives
the number of PMD swap mappings to that huge swap entry.

Originally, cluster_count() was the reference count of the swap entries
in the swap cluster (that is, how many entries are in use).  Now, it is
the sum of the reference count of the swap entries in the swap cluster
and the number of PMD swap mappings to the huge swap entry.

At the very least, I need to add comments for this.
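
A comment along the following lines could capture that (wording is only
a draft):

/*
 * For a huge swap cluster, cluster_count(ci) is the usage count of the
 * swap entries in the cluster (SWAPFILE_CLUSTER, because every slot is
 * mapped or has SWAP_HAS_CACHE set) plus the number of PMD swap
 * mappings to the cluster.  So cluster_count(ci) - SWAPFILE_CLUSTER is
 * the PMD swap mapping count.
 */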

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 03/21] mm, THP, swap: Support PMD swap mapping in swap_duplicate()

2018-07-10 Thread Huang, Ying
Dave Hansen  writes:

>> Yes.  Boolean parameter isn't good at most times.  Matthew Wilcox
>> suggested to use
>> 
>> swap_duplicate(&entry, HPAGE_PMD_NR);
>> 
>> vs.
>> 
>> swap_duplicate(&entry, 1);
>> 
>> He thinks this makes the interface more flexible to support other swap
>> entry size in the future.  What do you think about that?
>
> That looks great to me too.
>
>>>>if (likely(!non_swap_entry(entry))) {
>>>> -  if (swap_duplicate(entry) < 0)
>>>> +  if (swap_duplicate(&entry, false) < 0)
>>>>return entry.val;
>>>>  
>>>>/* make sure dst_mm is on swapoff's mmlist. */
>>>
>>> I'll also point out that in a multi-hundred-line patch, adding arguments
>>> to a existing function would not be something I'd try to include in the
>>> patch.  I'd break it out separately unless absolutely necessary.
>> 
>> You mean add another patch, which only adds arguments to the function,
>> but not change the body of the function?
>
> Yes.  Or, just add the non-THP-swap version first.

OK, will do this.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 14/21] mm, cgroup, THP, swap: Support to move swap account for PMD swap mapping

2018-07-10 Thread Huang, Ying
Daniel Jordan  writes:

> On Fri, Jun 22, 2018 at 11:51:44AM +0800, Huang, Ying wrote:
>> Because there is no way to prevent a huge swap cluster from being
>> split except when it has SWAP_HAS_CACHE flag set.
>
> What about making get_mctgt_type_thp take the cluster lock?  That function
> would be the first lock_cluster user outside of swapfile.c, but it would
> serialize with split_swap_cluster.
>
>> It is possible for
>> the huge swap cluster to be split and the charge for the swap slots
>> inside to be changed, after we check the PMD swap mapping and the huge
>> swap cluster before we commit the charge moving.  But the race window
>> is so small, that we will just ignore the race.
>
> Moving the charges is a slow path, so can't we just be correct here and not
> leak?

I checked the code and thought about this again, and found that the
race may not exist, because the PMD is locked from when
get_mctgt_type_thp() is called until the charge is completed for the
PMD.  So the charge of the huge swap cluster cannot be changed at the
same time, even if the huge swap cluster is split by other processes.
Right?

I will update the comments for this.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 05/21] mm, THP, swap: Support PMD swap mapping in free_swap_and_cache()/swap_free()

2018-07-10 Thread Huang, Ying
Dave Hansen  writes:

> I'm seeing a pattern here.
>
> old code:
>
> foo()
> {
>   do_swap_something()
> }
>
> new code:
>
> foo(bool cluster)
> {
>   if (cluster)
>   do_swap_cluster_something();
>   else
>   do_swap_something();
> }
>
> That make me fear that we have:
> 1. Created a new, wholly untested code path
> 2. Created two places to patch bugs
> 3. Are not reusing code when possible
>
> The code non-resuse was, and continues to be, IMNHO, one of the largest
> sources of bugs with the original THP implementation.  It might be
> infeasible to do here, but let's at least give it as much of a go as we can.

I totally agree that we should unify the code path for huge and normal
page/swap if possible.  One concern is code size for !CONFIG_THP_SWAP.
The original method is good for that.  The new method may introduce some
huge swap related code that is hard to be eliminated for
!CONFIG_THP_SWAP.  Andrew Morton pointed this out for the patchset of
the first step of the THP swap optimization.

This may be mitigated at least partly via,

`
#ifdef CONFIG_THP_SWAP
#define nr_swap_entries(nr)  (nr)
#else
#define nr_swap_entries(nr)  1
#endif

void do_something(swp_entry_t entry, int __nr_entries)
{
	int i, nr_entries = nr_swap_entries(__nr_entries);

	if (nr_entries == SWAPFILE_CLUSTER)
		; /* huge swap specific */
	else
		; /* normal swap specific */

	for (i = 0; i < nr_entries; i++) {
		; /* do something for each entry */
	}

	/* ... */
}
`

and rely on the compiler to do the dirty work for us where possible.

Hi, Andrew,

What do you think about this?

> Can I ask that you take another round through this set and:
>
> 1. Consolidate code refactoring into separate patches

Sure.

> 2. Add comments to code, and avoid doing it solely in changelogs

Sure.

> 3. Make an effort to share more code between the old code and new
>code.  Where code can not be shared, call that out in the changelog.

Will do that if we resolve the code size concern.

> This is a *really* hard-to-review set at the moment.  Doing those things
> will make it much easier to review and hopefully give us more
> maintainable code going forward.
>
> My apologies for not having done this review sooner.

Thanks a lot for your comments!

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 04/21] mm, THP, swap: Support PMD swap mapping in swapcache_free_cluster()

2018-07-09 Thread Huang, Ying
Dave Hansen  writes:

>> +#ifdef CONFIG_THP_SWAP
>> +static inline int cluster_swapcount(struct swap_cluster_info *ci)
>> +{
>> +if (!ci || !cluster_is_huge(ci))
>> +return 0;
>> +
>> +return cluster_count(ci) - SWAPFILE_CLUSTER;
>> +}
>> +#else
>> +#define cluster_swapcount(ci)   0
>> +#endif
>
> Dumb questions, round 2:  On a CONFIG_THP_SWAP=n build, presumably,
> cluster_is_huge()=0 always, so cluster_swapout() always returns 0.  Right?
>
> So, why the #ifdef?

#ifdef here is to reduce the code size for !CONFIG_THP_SWAP.

>>  /*
>>   * It's possible scan_swap_map() uses a free cluster in the middle of free
>>   * cluster list. Avoiding such abuse to avoid list corruption.
>> @@ -905,6 +917,7 @@ static void swap_free_cluster(struct swap_info_struct 
>> *si, unsigned long idx)
>>  struct swap_cluster_info *ci;
>>  
>>  ci = lock_cluster(si, offset);
>> +memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>>  cluster_set_count_flag(ci, 0, 0);
>>  free_cluster(si, idx);
>>  unlock_cluster(ci);
>
> This is another case of gloriously comment-free code, but stuff that
> _was_ covered in the changelog.  I'd much rather have code comments than
> changelog comments.  Could we fix that?
>
> I'm generally finding it quite hard to review this because I keep having
> to refer back to the changelog to see if what you are doing matches what
> you said you were doing.

Sure.  Will fix this.

>> @@ -1288,24 +1301,30 @@ static void swapcache_free_cluster(swp_entry_t entry)
>>  
>>  ci = lock_cluster(si, offset);
>>  VM_BUG_ON(!cluster_is_huge(ci));
>> +VM_BUG_ON(!is_cluster_offset(offset));
>> +VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
>>  map = si->swap_map + offset;
>> -for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>> -val = map[i];
>> -VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>> -if (val == SWAP_HAS_CACHE)
>> -free_entries++;
>> +if (!cluster_swapcount(ci)) {
>> +for (i = 0; i < SWAPFILE_CLUSTER; i++) {
>> +val = map[i];
>> +VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>> +if (val == SWAP_HAS_CACHE)
>> +free_entries++;
>> +}
>> +if (free_entries != SWAPFILE_CLUSTER)
>> +cluster_clear_huge(ci);
>>  }
>
> Also, I'll point out that cluster_swapcount() continues the horrific
> naming of cluster_couunt(), not saying what the count is *of*.  The
> return value doesn't help much:
>
>   return cluster_count(ci) - SWAPFILE_CLUSTER;

We have page_swapcount() for pages and swp_swapcount() for swap
entries.  cluster_swapcount() tries to mimic them for swap clusters.
But I am not good at naming in general.  What's your suggestion?

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 03/21] mm, THP, swap: Support PMD swap mapping in swap_duplicate()

2018-07-09 Thread Huang, Ying
cate(swp_entry_t *entry, bool cluster)
>>  {
>>  int err = 0;
>>  
>> -while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
>> -err = add_swap_count_continuation(entry, GFP_ATOMIC);
>> +if (thp_swap_supported() && cluster)
>> +return __swap_duplicate_cluster(entry, 1);
>> +
>> +while (!err && __swap_duplicate(*entry, 1) == -ENOMEM)
>> +err = add_swap_count_continuation(*entry, GFP_ATOMIC);
>>  return err;
>>  }
>
> Reading this, I wonder whether this has been refactored as much as
> possible.  Both add_swap_count_continuation() and
> __swap_duplciate_cluster() start off with the same get_swap_device() dance.

Yes.  There's some duplicated code logic.  Will think about how to
improve it.

>> @@ -3558,9 +3660,12 @@ int swap_duplicate(swp_entry_t entry)
>>   * -EBUSY means there is a swap cache.
>>   * Note: return code is different from swap_duplicate().
>>   */
>> -int swapcache_prepare(swp_entry_t entry)
>> +int swapcache_prepare(swp_entry_t entry, bool cluster)
>>  {
>> -return __swap_duplicate(entry, SWAP_HAS_CACHE);
>> +if (thp_swap_supported() && cluster)
>> +return __swap_duplicate_cluster(&entry, SWAP_HAS_CACHE);
>> +else
>> +return __swap_duplicate(entry, SWAP_HAS_CACHE);
>>  }
>>  
>>  struct swap_info_struct *swp_swap_info(swp_entry_t entry)
>> @@ -3590,51 +3695,13 @@ pgoff_t __page_file_index(struct page *page)
>>  }
>>  EXPORT_SYMBOL_GPL(__page_file_index);
>>  
>> -/*
>> - * add_swap_count_continuation - called when a swap count is duplicated
>> - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the 
>> entry's
>> - * page of the original vmalloc'ed swap_map, to hold the continuation count
>> - * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
>> - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
>
> This closes out with a lot of refactoring noise.  Any chance that can be
> isolated into another patch?

Sure.  Will do that.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-09 Thread Huang, Ying
Dave Hansen  writes:

> On 07/09/2018 06:19 PM, Huang, Ying wrote:
>> Dave Hansen  writes:
>> 
>>>>  config THP_SWAP
>>>>def_bool y
>>>> -  depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
>>>> +  depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
>>>>help
>>>
>>> This seems like a bug-fix.  Is there a reason this didn't cause problems
>>> up to now?
>> Yes.  The original code has some problem in theory, but not in practice
>> because all code enclosed by
>> 
>> #ifdef CONFIG_THP_SWAP
>> #endif
>> 
>> are in swapfile.c.  But that will be not true in this patchset.
>
> That's great info for the changelog.

Sure.  Will add it.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-09 Thread Huang, Ying
Dave Hansen  writes:

>>  config THP_SWAP
>>  def_bool y
>> -depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
>> +depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
>>  help
>
>
> This seems like a bug-fix.  Is there a reason this didn't cause problems
> up to now?

Yes.  The original code has a problem in theory, but not in practice,
because all the code enclosed by

#ifdef CONFIG_THP_SWAP
#endif

is in swapfile.c.  But that will no longer be true in this patchset.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 01/21] mm, THP, swap: Enable PMD swap operations for CONFIG_THP_SWAP

2018-07-09 Thread Huang, Ying
Dave Hansen  writes:

> On 06/21/2018 08:51 PM, Huang, Ying wrote:
>> From: Huang Ying 
>> 
>> Previously, the PMD swap operations are only enabled for
>> CONFIG_ARCH_ENABLE_THP_MIGRATION.  Because they are only used by the
>> THP migration support.  We will support PMD swap mapping to the huge
>> swap cluster and swapin the THP as a whole.  That will be enabled via
>> CONFIG_THP_SWAP and needs these PMD swap operations.  So enable the
>> PMD swap operations for CONFIG_THP_SWAP too.
>
> This commit message kinda skirts around the real reasons for this patch.
>  Shouldn't we just say something like:
>
>   Currently, "swap entries" in the page tables are used for a
>   number of things outside of actual swap, like page migration.
>   We support THP/PMD "swap entries" for page migration currently
>   and the functions behind this are tied to page migration's
>   config option (CONFIG_ARCH_ENABLE_THP_MIGRATION).
>
>   But, we also need them for THP swap.
>   ...
>
> It would also be nice to explain a bit why you are moving code around.

This looks much better than my original wording.  Thanks for the help!

> Would this look any better if we made a Kconfig option:
>
>   config HAVE_THP_SWAP_ENTRIES
>   def_bool n
>   # "Swap entries" in the page tables are used
>   # both for migration and actual swap.
>   depends on THP_SWAP || ARCH_ENABLE_THP_MIGRATION
>
> You logically talked about this need for PMD swap operations in your
> commit message, so I think it makes sense to codify that in a single
> place where it can be coherently explained.

Because both you and Dan think it's better to add a new Kconfig option,
I will do that.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 03/21] mm, THP, swap: Support PMD swap mapping in swap_duplicate()

2018-07-09 Thread Huang, Ying
Dan Williams  writes:

> On Thu, Jun 21, 2018 at 8:55 PM Huang, Ying  wrote:
>>
>> From: Huang Ying 
>>
>> To support to swapin the THP as a whole, we need to create PMD swap
>> mapping during swapout, and maintain PMD swap mapping count.  This
>> patch implements the support to increase the PMD swap mapping
>> count (for swapout, fork, etc.)  and set SWAP_HAS_CACHE flag (for
>> swapin, etc.) for a huge swap cluster in swap_duplicate() function
>> family.  Although it only implements a part of the design of the swap
>> reference count with PMD swap mapping, the whole design is described
>> as follow to make it easy to understand the patch and the whole
>> picture.
>>
>> A huge swap cluster is used to hold the contents of a swapouted THP.
>> After swapout, a PMD page mapping to the THP will become a PMD
>> swap mapping to the huge swap cluster via a swap entry in PMD.  While
>> a PTE page mapping to a subpage of the THP will become the PTE swap
>> mapping to a swap slot in the huge swap cluster via a swap entry in
>> PTE.
>>
>> If there is no PMD swap mapping and the corresponding THP is removed
>> from the page cache (reclaimed), the huge swap cluster will be split
>> and become a normal swap cluster.
>>
>> The count (cluster_count()) of the huge swap cluster is
>> SWAPFILE_CLUSTER (= HPAGE_PMD_NR) + PMD swap mapping count.  Because
>> all swap slots in the huge swap cluster are mapped by PTE or PMD, or
>> has SWAP_HAS_CACHE bit set, the usage count of the swap cluster is
>> HPAGE_PMD_NR.  And the PMD swap mapping count is recorded too to make
>> it easy to determine whether there are remaining PMD swap mappings.
>>
>> The count in swap_map[offset] is the sum of PTE and PMD swap mapping
>> count.  This means when we increase the PMD swap mapping count, we
>> need to increase swap_map[offset] for all swap slots inside the swap
>> cluster.  An alternative choice is to make swap_map[offset] to record
>> PTE swap map count only, given we have recorded PMD swap mapping count
>> in the count of the huge swap cluster.  But this need to increase
>> swap_map[offset] when splitting the PMD swap mapping, that may fail
>> because of memory allocation for swap count continuation.  That is
>> hard to dealt with.  So we choose current solution.
>>
>> The PMD swap mapping to a huge swap cluster may be split when unmap a
>> part of PMD mapping etc.  That is easy because only the count of the
>> huge swap cluster need to be changed.  When the last PMD swap mapping
>> is gone and SWAP_HAS_CACHE is unset, we will split the huge swap
>> cluster (clear the huge flag).  This makes it easy to reason the
>> cluster state.
>>
>> A huge swap cluster will be split when splitting the THP in swap
>> cache, or failing to allocate THP during swapin, etc.  But when
>> splitting the huge swap cluster, we will not try to split all PMD swap
>> mappings, because we haven't enough information available for that
>> sometimes.  Later, when the PMD swap mapping is duplicated or swapin,
>> etc, the PMD swap mapping will be split and fallback to the PTE
>> operation.
>>
>> When a THP is added into swap cache, the SWAP_HAS_CACHE flag will be
>> set in the swap_map[offset] of all swap slots inside the huge swap
>> cluster backing the THP.  This huge swap cluster will not be split
>> unless the THP is split even if its PMD swap mapping count dropped to
>> 0.  Later, when the THP is removed from swap cache, the SWAP_HAS_CACHE
>> flag will be cleared in the swap_map[offset] of all swap slots inside
>> the huge swap cluster.  And this huge swap cluster will be split if
>> its PMD swap mapping count is 0.
>>
>> Signed-off-by: "Huang, Ying" 
>> Cc: "Kirill A. Shutemov" 
>> Cc: Andrea Arcangeli 
>> Cc: Michal Hocko 
>> Cc: Johannes Weiner 
>> Cc: Shaohua Li 
>> Cc: Hugh Dickins 
>> Cc: Minchan Kim 
>> Cc: Rik van Riel 
>> Cc: Dave Hansen 
>> Cc: Naoya Horiguchi 
>> Cc: Zi Yan 
>> Cc: Daniel Jordan 
>> ---
>>  include/linux/huge_mm.h |   5 +
>>  include/linux/swap.h|   9 +-
>>  mm/memory.c |   2 +-
>>  mm/rmap.c   |   2 +-
>>  mm/swap_state.c |   2 +-
>>  mm/swapfile.c   | 287 
>> +---
>>  6 files changed, 214 insertions(+), 93 deletions(-)
>
> I'm probably missing some background, but I find the patch hard to
> read. Can you disseminate some of this patch changelog into kernel-doc
> commentary so it's easier 

Re: [PATCH -mm -v4 01/21] mm, THP, swap: Enable PMD swap operations for CONFIG_THP_SWAP

2018-07-08 Thread Huang, Ying
Dan Williams  writes:

> On Sun, Jul 8, 2018 at 10:40 PM, Huang, Ying  wrote:
>> Dan Williams  writes:

>>> Would that also allow us to clean up the usage of
>>> CONFIG_ARCH_ENABLE_THP_MIGRATION in fs/proc/task_mmu.c? In other
>>> words, what's the point of having nice ifdef'd alternatives in header
>>> files when ifdefs are still showing up in C files, all of it should be
>>> optionally determined by header files.
>>
>> Unfortunately, I think it is not a easy task to wrap all C code via
>> #ifdef in header files.  And it may be over-engineering to wrap them
>> all.  I guess this is why there are still some #ifdef in C files.
>
> That's the entire point. Yes, over-engineer the header files so the
> actual C code is more readable.

Take pagemap_pmd_range() in fs/proc/task_mmu.c as an example: to avoid
the #ifdef, we could wrap all the code between #ifdef/#endif into a
separate function, put the new function into another C file (which is
compiled only when the option is enabled), and then change the header
files accordingly.
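
A rough sketch of the shape of that change (all names here are
hypothetical, just for illustration):

/* in some header, with the real body in its own .c file */
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
extern void pagemap_handle_pmd_migration(void);
#else
static inline void pagemap_handle_pmd_migration(void)
{
}
#endif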

In this way, we avoid #ifdef/#endif, but the code becomes more complex,
and tightly related code may end up in different files.  Readability may
be hurt too.

Maybe you have a smarter way to change the code to avoid the #ifdef and
improve code readability?

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-08 Thread Huang, Ying
Dan Williams  writes:

> On Thu, Jun 21, 2018 at 8:55 PM Huang, Ying  wrote:
>>
>> From: Huang Ying 
>>
>> It's unreasonable to optimize swapping for THP without basic swapping
>> support.  And this will cause build errors when THP_SWAP functions are
>> defined in swapfile.c and called elsewhere.
>>
>> The comments are fixed too to reflect the latest progress.
>
> Looks good to me:
>
> Reviewed-by: Dan Williams 

Thanks!

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 01/21] mm, THP, swap: Enable PMD swap operations for CONFIG_THP_SWAP

2018-07-08 Thread Huang, Ying
Dan Williams  writes:

> On Thu, Jun 21, 2018 at 8:55 PM Huang, Ying  wrote:
>>
>> From: Huang Ying 
>>
>> Previously, the PMD swap operations are only enabled for
>> CONFIG_ARCH_ENABLE_THP_MIGRATION.  Because they are only used by the
>> THP migration support.  We will support PMD swap mapping to the huge
>> swap cluster and swapin the THP as a whole.  That will be enabled via
>> CONFIG_THP_SWAP and needs these PMD swap operations.  So enable the
>> PMD swap operations for CONFIG_THP_SWAP too.
>>
>> Signed-off-by: "Huang, Ying" 
>> Cc: "Kirill A. Shutemov" 
>> Cc: Andrea Arcangeli 
>> Cc: Michal Hocko 
>> Cc: Johannes Weiner 
>> Cc: Shaohua Li 
>> Cc: Hugh Dickins 
>> Cc: Minchan Kim 
>> Cc: Rik van Riel 
>> Cc: Dave Hansen 
>> Cc: Naoya Horiguchi 
>> Cc: Zi Yan 
>> Cc: Daniel Jordan 
>> ---
>>  arch/x86/include/asm/pgtable.h |  2 +-
>>  include/asm-generic/pgtable.h  |  2 +-
>>  include/linux/swapops.h| 44 
>> ++
>>  3 files changed, 25 insertions(+), 23 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>> index 99ecde23c3ec..13bf58838daf 100644
>> --- a/arch/x86/include/asm/pgtable.h
>> +++ b/arch/x86/include/asm/pgtable.h
>> @@ -1224,7 +1224,7 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
>> return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
>>  }
>>
>> -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>> +#if defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) || defined(CONFIG_THP_SWAP)
>
> How about introducing a new config symbol representing the common
> infrastructure between the two and have them select that symbol.

The common infrastructure shared by the two mechanisms is the PMD swap
entry.  But I didn't find many places where that common infrastructure
is used, so I think it may be over-engineering to introduce a new
config symbol that would be used in so few places.
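
(For reference, the suggestion amounts to something like the sketch below.
CONFIG_HAVE_PMD_SWAP_ENTRY is a hypothetical symbol that both
CONFIG_ARCH_ENABLE_THP_MIGRATION and CONFIG_THP_SWAP would select; it does
not exist in this patchset.)

	/* include/linux/swapops.h, sketch only */
	#ifdef CONFIG_HAVE_PMD_SWAP_ENTRY	/* hypothetical common symbol */
	static inline int is_swap_pmd(pmd_t pmd)
	{
		return !pmd_none(pmd) && !pmd_present(pmd);
	}
	#else
	static inline int is_swap_pmd(pmd_t pmd)
	{
		return 0;
	}
	#endif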

> Would that also allow us to clean up the usage of
> CONFIG_ARCH_ENABLE_THP_MIGRATION in fs/proc/task_mmu.c? In other
> words, what's the point of having nice ifdef'd alternatives in header
> files when ifdefs are still showing up in C files, all of it should be
> optionally determined by header files.

Unfortunately, I think it is not an easy task to wrap all the C code via
#ifdef in header files.  And it may be over-engineering to wrap it
all.  I guess this is why there are still some #ifdefs in C files.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 05/21] mm, THP, swap: Support PMD swap mapping in free_swap_and_cache()/swap_free()

2018-07-06 Thread Huang, Ying
Daniel Jordan  writes:

> On Fri, Jun 22, 2018 at 11:51:35AM +0800, Huang, Ying wrote:
>> +static unsigned char swap_free_cluster(struct swap_info_struct *si,
>> +   swp_entry_t entry)
> ...
>> +/* Cluster has been split, free each swap entries in cluster */
>> +if (!cluster_is_huge(ci)) {
>> +unlock_cluster(ci);
>> +for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
>> +if (!__swap_entry_free(si, entry, 1)) {
>> +free_entries++;
>> +free_swap_slot(entry);
>> +}
>> +}
>
> Is it better on average to use __swap_entry_free_locked instead of
> __swap_entry_free here?  I'm not sure myself, just asking.
>
> As it's written, if the cluster's been split, we always take and drop the
> cluster lock 512 times, but if we don't expect to call free_swap_slot that
> often, then we could just drop and retake the cluster lock inside the 
> innermost
> 'if' against the possibility that free_swap_slot eventually makes us take the
> cluster lock again.

Yes.  This is a good idea.  Thanks for your suggestion!  I will change
this in the next version.
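
Roughly, the loop would then become something like the sketch below (just
a sketch, not the final code; it assumes __swap_entry_free_locked() ends up
taking (si, ci, offset, usage), which is an assumption here):

	ci = lock_cluster(si, offset);
	for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
		if (!__swap_entry_free_locked(si, ci, offset + i, 1)) {
			free_entries++;
			/*
			 * Only drop the cluster lock when a slot is really
			 * freed, since free_swap_slot() may take it again.
			 */
			unlock_cluster(ci);
			free_swap_slot(entry);
			ci = lock_cluster(si, offset);
		}
	}
	unlock_cluster(ci);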

Best Regards,
Huang, Ying

> ...
>> +return !(free_entries == SWAPFILE_CLUSTER);
>
> return free_entries != SWAPFILE_CLUSTER;


Re: [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece

2018-07-03 Thread Huang, Ying
Sergey Senozhatsky  writes:

> On (07/04/18 10:20), Huang, Ying wrote:
>> > On (06/27/18 21:51), Andrew Morton wrote:
>> >> On Fri, 22 Jun 2018 11:51:30 +0800 "Huang, Ying"  
>> >> wrote:
>> >> 
>> >> > This is the final step of THP (Transparent Huge Page) swap
>> >> > optimization.  After the first and second step, the splitting huge
>> >> > page is delayed from almost the first step of swapout to after swapout
>> >> > has been finished.  In this step, we avoid splitting THP for swapout
>> >> > and swapout/swapin the THP in one piece.
>> >> 
>> >> It's a tremendously good performance improvement.  It's also a
>> >> tremendously large patchset :(
>> >
>> > Will zswap gain a THP swap out/in support at some point?
>> >
>> >
>> > mm/zswap.c: static int zswap_frontswap_store(...)
>> > ...
>> >/* THP isn't supported */
>> >if (PageTransHuge(page)) {
>> >ret = -EINVAL;
>> >goto reject;
>> >}
>> 
>> That's not on my TODO list.  Are you interested in working on this?
>
> I'd say I'm interested. Can't promise that I'll have enough spare time
> any time soon, tho.

Thanks!

> The numbers you posted do look fantastic indeed, embedded devices
> [which normally use zswap/zram quite heavily] _probably_ should see
> some performance improvement as well once zswap [and may be zram] can
> handle THP.

Yes.  I think so too.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 08/21] mm, THP, swap: Support to read a huge swap cluster for swapin a THP

2018-07-03 Thread Huang, Ying
Daniel Jordan  writes:

> On Fri, Jun 22, 2018 at 11:51:38AM +0800, Huang, Ying wrote:
>> @@ -411,14 +414,32 @@ struct page *__read_swap_cache_async(swp_entry_t 
>> entry, gfp_t gfp_mask,
> ...
>> +if (thp_swap_supported() && huge_cluster) {
>> +gfp_t gfp = alloc_hugepage_direct_gfpmask(vma);
>> +
>> +new_page = alloc_hugepage_vma(gfp, vma,
>> +addr, HPAGE_PMD_ORDER);
>
> When allocating a huge page, we ignore the gfp_mask argument.
>
> That doesn't matter right now since AFAICT we're not losing any flags: 
> gfp_mask
> from existing callers of __read_swap_cache_async seems to always be a subset 
> of
> GFP_HIGHUSER_MOVABLE and alloc_hugepage_direct_gfpmask always returns a
> superset of that.
>
> But maybe we should warn here in case we end up violating a restriction from a
> future caller.  Something like this?:
>
>> +gfp_t gfp = alloc_hugepage_direct_gfpmask(vma);
> VM_WARN_ONCE((gfp | gfp_mask) != gfp,
>    "ignoring gfp_mask bits");

This looks good!  Thanks!  Will add this.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece

2018-07-03 Thread Huang, Ying
Sergey Senozhatsky  writes:

> On (06/27/18 21:51), Andrew Morton wrote:
>> On Fri, 22 Jun 2018 11:51:30 +0800 "Huang, Ying"  
>> wrote:
>> 
>> > This is the final step of THP (Transparent Huge Page) swap
>> > optimization.  After the first and second step, the splitting huge
>> > page is delayed from almost the first step of swapout to after swapout
>> > has been finished.  In this step, we avoid splitting THP for swapout
>> > and swapout/swapin the THP in one piece.
>> 
>> It's a tremendously good performance improvement.  It's also a
>> tremendously large patchset :(
>
> Will zswap gain a THP swap out/in support at some point?
>
>
> mm/zswap.c: static int zswap_frontswap_store(...)
> ...
>   /* THP isn't supported */
>   if (PageTransHuge(page)) {
>   ret = -EINVAL;
>   goto reject;
>   }

That's not on my TODO list.  Are you interested in working on this?

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 08/21] mm, THP, swap: Support to read a huge swap cluster for swapin a THP

2018-07-02 Thread Huang, Ying
Matthew Wilcox  writes:

> On Fri, Jun 22, 2018 at 11:51:38AM +0800, Huang, Ying wrote:
>> +++ b/mm/swap_state.c
>> @@ -426,33 +447,37 @@ struct page *__read_swap_cache_async(swp_entry_t 
>> entry, gfp_t gfp_mask,
>>  /*
>>   * call radix_tree_preload() while we can wait.
>>   */
>> -err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
>> +err = radix_tree_maybe_preload_order(gfp_mask & GFP_KERNEL,
>> + compound_order(new_page));
>>  if (err)
>>  break;
>
> There's no more preloading in the XArray world, so this can just be dropped.

Sure.

>>  /*
>>   * Swap entry may have been freed since our caller observed it.
>>   */
>> +err = swapcache_prepare(hentry, huge_cluster);
>> +if (err) {
>>  radix_tree_preload_end();
>> -break;
>> +if (err == -EEXIST) {
>> +/*
>> + * We might race against get_swap_page() and
>> + * stumble across a SWAP_HAS_CACHE swap_map
>> + * entry whose page has not been brought into
>> + * the swapcache yet.
>> + */
>> +cond_resched();
>> +continue;
>> +} else if (err == -ENOTDIR) {
>> +/* huge swap cluster is split under us */
>> +continue;
>> +} else  /* swp entry is obsolete ? */
>> +break;
>
> I'm not entirely happy about -ENOTDIR being overloaded to mean this.
> Maybe we can return a new enum rather than an errno?

Can we use -ESTALE instead?  The "huge swap cluster is split under us"
case means the swap entry has become kind of "stale".

> Also, I'm not sure that a true/false parameter is the right approach for
> "is this a huge page".  I think we'll have usecases for swap entries which
> are both larger and smaller than PMD_SIZE.

OK.  I can change the interface to a number-of-swap-entries style to make
it more flexible.

> I was hoping to encode the swap entry size into the entry; we only need one
> extra bit to do that (no matter the size of the entry).  I detailed the
> encoding scheme here:
>
> https://plus.google.com/117536210417097546339/posts/hvctn17WUZu
>
> (let me know if that doesn't work for you; I'm not very experienced with
> this G+ thing)

The encoding method looks good.  To use it, we need to

- Encode swap entry and size into swap_entry_size
- Call function with swap_entry_size
- Decode swap_entry_size to swap entry and size

It appears that there is no real benefit?
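
(Purely for illustration, the round trip above would look something like
the sketch below.  The bit position and helper names are made up and are
not Matthew's actual encoding, which is described in the linked post.)

	#define SWP_PMD_SIZE_BIT	(1UL << (BITS_PER_LONG - 1))	/* made-up spare bit */

	static inline unsigned long swp_entry_size_encode(swp_entry_t entry, bool huge)
	{
		return entry.val | (huge ? SWP_PMD_SIZE_BIT : 0);
	}

	static inline swp_entry_t swp_entry_size_decode(unsigned long val, bool *huge)
	{
		swp_entry_t entry;

		*huge = !!(val & SWP_PMD_SIZE_BIT);
		entry.val = val & ~SWP_PMD_SIZE_BIT;
		return entry;
	}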

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 03/21] mm, THP, swap: Support PMD swap mapping in swap_duplicate()

2018-07-01 Thread Huang, Ying
Matthew Wilcox  writes:

> On Fri, Jun 22, 2018 at 11:51:33AM +0800, Huang, Ying wrote:
>> +++ b/mm/swap_state.c
>> @@ -433,7 +433,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, 
>> gfp_t gfp_mask,
>>  /*
>>   * Swap entry may have been freed since our caller observed it.
>>   */
>> -err = swapcache_prepare(entry);
>> +err = swapcache_prepare(entry, false);
>>  if (err == -EEXIST) {
>>  radix_tree_preload_end();
>>  /*
>
> This commit should be just a textual conflict.

Yes.  Will check it.

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece

2018-07-01 Thread Huang, Ying
Matthew Wilcox  writes:

> On Fri, Jun 29, 2018 at 09:17:16AM +0800, Huang, Ying wrote:
>> Matthew Wilcox  writes:
>> > I'll take a look.  Honestly, my biggest problem with this patch set is
>> > overuse of tagging:
>> >
>> > 59832 Jun 22 Huang, Ying ( 131) [PATCH -mm -v4 00/21] mm, THP, 
>> > swap: Swa
>> > There's literally zero useful information displayed in the patch subjects.
>> 
>> Thanks!  What's your suggestion on tagging?  Only keep "mm" or "swap"?
>
> Subject: [PATCH v14 10/74] xarray: Add XArray tags
>
> I'm not sure where the extra '-' in front of '-v4' comes from.  I also
> wouldn't put the '-mm' in front of it -- that information can live in
> the cover letter's body rather than any patch's subject.
>
> I think 'swap:' implies "mm:", so yeah I'd just go with that.
>
> Subject: [PATCH v4 00/21] swap: Useful information here
>
> I'd see that as:
>
> 59832 Jun 22 Huang, Ying ( 131) [PATCH v4 00/21] swap: Useful 
> informatio

Looks good!  I will use this naming convention in the future.

> I had a quick look at your patches.  I think only two are affected by
> the XArray, and I'll make some general comments about them soon.

Thanks!

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece

2018-06-28 Thread Huang, Ying
Matthew Wilcox  writes:

> On Wed, Jun 27, 2018 at 11:18:39PM -0700, Andrew Morton wrote:
>> On Thu, 28 Jun 2018 13:35:15 +0800 "Huang\, Ying"  
>> wrote:
>> > No problem.  Should I rebase the patchset on your latest -mm tree, or on
>> > the next version to be released?
>> 
>> We need to figure that out with Matthew.
>> 
>> Probably the xarray conversions are simpler and more mature so yes,
>> probably they should be staged first.
>
> I'll take a look.  Honestly, my biggest problem with this patch set is
> overuse of tagging:
>
> 59832 Jun 22 Huang, Ying ( 131) [PATCH -mm -v4 00/21] mm, THP, swap: 
> Swa
> 59833 N   Jun 22 Huang, Ying ( 126) ├─>[PATCH -mm -v4 01/21] mm, THP, 
> swap:
> 59834 N   Jun 22 Huang, Ying (  44) ├─>[PATCH -mm -v4 02/21] mm, THP, 
> swap:
> 59835 N   Jun 22 Huang, Ying ( 583) ├─>[PATCH -mm -v4 03/21] mm, THP, 
> swap:
> 59836 N   Jun 22 Huang, Ying ( 104) ├─>[PATCH -mm -v4 04/21] mm, THP, 
> swap:
> 59837 N   Jun 22 Huang, Ying ( 394) ├─>[PATCH -mm -v4 05/21] mm, THP, 
> swap:
> 59838 N   Jun 22 Huang, Ying ( 198) ├─>[PATCH -mm -v4 06/21] mm, THP, 
> swap:
> 59839 N   Jun 22 Huang, Ying ( 161) ├─>[PATCH -mm -v4 07/21] mm, THP, 
> swap:
> 59840 N   Jun 22 Huang, Ying     ( 351) ├─>[PATCH -mm -v4 08/21] mm, THP, 
> swap:
> 59841 N   Jun 22 Huang, Ying ( 293) ├─>[PATCH -mm -v4 09/21] mm, THP, 
> swap:
> 59842 N   Jun 22 Huang, Ying ( 138) ├─>[PATCH -mm -v4 10/21] mm, THP, 
> swap:
> 59843 N   Jun 22 Huang, Ying ( 264) ├─>[PATCH -mm -v4 11/21] mm, THP, 
> swap:
> 59844 N   Jun 22 Huang, Ying ( 251) ├─>[PATCH -mm -v4 12/21] mm, THP, 
> swap:
> 59845 N   Jun 22 Huang, Ying ( 121) ├─>[PATCH -mm -v4 13/21] mm, THP, 
> swap:
> 59846 N   Jun 22 Huang, Ying ( 517) ├─>[PATCH -mm -v4 14/21] mm, cgroup, 
> THP
> 59847 N   Jun 22 Huang, Ying ( 128) ├─>[PATCH -mm -v4 15/21] mm, THP, 
> swap:
> 59848 N   Jun 22 Huang, Ying (  85) ├─>[PATCH -mm -v4 16/21] mm, THP, 
> swap:
> 59849 N   Jun 22 Huang, Ying (  70) ├─>[PATCH -mm -v4 17/21] mm, THP, 
> swap:
>
> There's literally zero useful information displayed in the patch subjects.

Thanks!  What's your suggestion on tagging?  Only keep "mm" or "swap"?

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece

2018-06-27 Thread Huang, Ying
Andrew Morton  writes:

> On Thu, 28 Jun 2018 13:29:09 +0800 "Huang\, Ying"  
> wrote:
>
>> Andrew Morton  writes:
>> 
>> > On Fri, 22 Jun 2018 11:51:30 +0800 "Huang, Ying"  
>> > wrote:
>> >
>> >> This is the final step of THP (Transparent Huge Page) swap
>> >> optimization.  After the first and second step, the splitting huge
>> >> page is delayed from almost the first step of swapout to after swapout
>> >> has been finished.  In this step, we avoid splitting THP for swapout
>> >> and swapout/swapin the THP in one piece.
>> >
>> > It's a tremendously good performance improvement.  It's also a
>> > tremendously large patchset :(
>> >
>> > And it depends upon your
>> > mm-swap-fix-race-between-swapoff-and-some-swap-operations.patch and
>> > mm-fix-race-between-swapoff-and-mincore.patch, the first of which has
>> > been floating about since February without adequate review.
>> >
>> > I'll give this patchset a spin in -mm to see what happens and will come
>> > back later to take a closer look.  But the best I can do at this time
>> > is to hopefully cc some possible reviewers :)
>> 
>> Thanks a lot for your help!  Hope more people can review it!
>
> I took it out of -mm again, temporarily.  Due to a huge tangle with the
> xarray conversions in linux-next.

No problem.  Should I rebase the patchset on your latest -mm tree, or on
the next version to be released?

Best Regards,
Huang, Ying


Re: [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece

2018-06-27 Thread Huang, Ying
Andrew Morton  writes:

> On Fri, 22 Jun 2018 11:51:30 +0800 "Huang, Ying"  wrote:
>
>> This is the final step of THP (Transparent Huge Page) swap
>> optimization.  After the first and second step, the splitting huge
>> page is delayed from almost the first step of swapout to after swapout
>> has been finished.  In this step, we avoid splitting THP for swapout
>> and swapout/swapin the THP in one piece.
>
> It's a tremendously good performance improvement.  It's also a
> tremendously large patchset :(
>
> And it depends upon your
> mm-swap-fix-race-between-swapoff-and-some-swap-operations.patch and
> mm-fix-race-between-swapoff-and-mincore.patch, the first of which has
> been floating about since February without adequate review.
>
> I'll give this patchset a spin in -mm to see what happens and will come
> back later to take a closer look.  But the best I can do at this time
> is to hopefully cc some possible reviewers :)

Thanks a lot for your help!  Hope more people can review it!

Best Regards,
Huang, Ying


[PATCH -mm -v4 03/21] mm, THP, swap: Support PMD swap mapping in swap_duplicate()

2018-06-21 Thread Huang, Ying
From: Huang Ying 

To support swapping in the THP as a whole, we need to create a PMD swap
mapping during swapout, and maintain the PMD swap mapping count.  This
patch implements the support to increase the PMD swap mapping
count (for swapout, fork, etc.) and to set the SWAP_HAS_CACHE flag (for
swapin, etc.) for a huge swap cluster in the swap_duplicate() function
family.  Although it only implements a part of the design of the swap
reference count with PMD swap mapping, the whole design is described
below to make it easy to understand the patch and the whole picture.

A huge swap cluster is used to hold the contents of a swapped-out THP.
After swapout, a PMD page mapping to the THP becomes a PMD swap mapping
to the huge swap cluster via a swap entry in the PMD, while a PTE page
mapping to a subpage of the THP becomes a PTE swap mapping to a swap
slot in the huge swap cluster via a swap entry in the PTE.

If there is no PMD swap mapping and the corresponding THP is removed
from the swap cache (reclaimed), the huge swap cluster will be split
and become a normal swap cluster.

The count (cluster_count()) of the huge swap cluster is
SWAPFILE_CLUSTER (= HPAGE_PMD_NR) + the PMD swap mapping count.  Because
all swap slots in the huge swap cluster are mapped by a PTE or PMD, or
have the SWAP_HAS_CACHE bit set, the usage count of the swap cluster is
HPAGE_PMD_NR.  The PMD swap mapping count is recorded too, to make it
easy to determine whether there are remaining PMD swap mappings.

The count in swap_map[offset] is the sum of the PTE and PMD swap mapping
counts.  This means that when we increase the PMD swap mapping count, we
need to increase swap_map[offset] for all swap slots inside the swap
cluster.  An alternative choice is to make swap_map[offset] record the
PTE swap map count only, given that we have recorded the PMD swap
mapping count in the count of the huge swap cluster.  But that would
require increasing swap_map[offset] when splitting the PMD swap mapping,
which may fail because of the memory allocation for swap count
continuation.  That is hard to deal with.  So we chose the current
solution.

The PMD swap mapping to a huge swap cluster may be split when unmapping
a part of the PMD mapping, etc.  That is easy, because only the count of
the huge swap cluster needs to be changed.  When the last PMD swap
mapping is gone and SWAP_HAS_CACHE is unset, we will split the huge swap
cluster (clear the huge flag).  This makes it easy to reason about the
cluster state.

A huge swap cluster will be split when splitting the THP in the swap
cache, when failing to allocate a THP during swapin, etc.  But when
splitting the huge swap cluster, we will not try to split all PMD swap
mappings, because sometimes we don't have enough information available
for that.  Later, when the PMD swap mapping is duplicated or swapped in,
etc., the PMD swap mapping will be split and fall back to the PTE
operation.

When a THP is added into the swap cache, the SWAP_HAS_CACHE flag will be
set in swap_map[offset] of all swap slots inside the huge swap cluster
backing the THP.  This huge swap cluster will not be split unless the
THP is split, even if its PMD swap mapping count drops to 0.  Later,
when the THP is removed from the swap cache, the SWAP_HAS_CACHE flag
will be cleared in swap_map[offset] of all swap slots inside the huge
swap cluster.  And this huge swap cluster will be split if its PMD swap
mapping count is 0.
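
To make the counting rules above concrete, here is a small user-space
model (not kernel code; SWAPFILE_CLUSTER is shrunk to 4 and all helpers
are made up purely for illustration):

	#include <assert.h>
	#include <stdio.h>

	#define SWAPFILE_CLUSTER 4

	static int cluster_count;		/* cluster_count() of the huge cluster */
	static int swap_map[SWAPFILE_CLUSTER];	/* per-slot PTE + PMD mapping counts */

	/* One more PMD swap mapping to the whole cluster (swapout, fork, ...). */
	static void pmd_swap_duplicate(void)
	{
		cluster_count++;		/* PMD mapping count lives here ... */
		for (int i = 0; i < SWAPFILE_CLUSTER; i++)
			swap_map[i]++;		/* ... and is mirrored into every slot */
	}

	/* One PMD swap mapping goes away (the whole PMD mapping is unmapped). */
	static void pmd_swap_free(void)
	{
		cluster_count--;
		for (int i = 0; i < SWAPFILE_CLUSTER; i++)
			swap_map[i]--;
	}

	int main(void)
	{
		cluster_count = SWAPFILE_CLUSTER;	/* base usage count of the cluster */

		pmd_swap_duplicate();		/* swapout installs the PMD swap mapping */
		pmd_swap_duplicate();		/* fork duplicates it */
		pmd_swap_free();		/* one process unmaps its PMD */

		/* Invariant from the description above. */
		assert(cluster_count == SWAPFILE_CLUSTER + 1);
		printf("remaining PMD swap mappings: %d\n",
		       cluster_count - SWAPFILE_CLUSTER);
		return 0;
	}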

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |   5 +
 include/linux/swap.h|   9 +-
 mm/memory.c |   2 +-
 mm/rmap.c   |   2 +-
 mm/swap_state.c |   2 +-
 mm/swapfile.c   | 287 +---
 6 files changed, 214 insertions(+), 93 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d3bbf6bea9e9..213d32e57c39 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -80,6 +80,11 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
[...]
 }
 
+static inline bool is_cluster_offset(unsigned long offset)
+{
+   return !(offset % SWAPFILE_CLUSTER);
+}
+
 static inline bool cluster_list_empty(struct swap_cluster_list *list)
 {
return cluster_is_null(&list->head);
@@ -1166,16 +1174,14 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
return NULL;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-  swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ struct swap_cluster_info *ci,
+  

[PATCH -mm -v4 06/21] mm, THP, swap: Support PMD swap mapping when splitting huge PMD

2018-06-21 Thread Huang, Ying
From: Huang Ying 

A huge PMD needs to be split when zapping a part of the PMD mapping, etc.
If the PMD mapping is a swap mapping, we need to split it too.  This
patch implements the support for this.  It is similar to splitting
the PMD page mapping, except that we need to decrease the PMD swap
mapping count for the huge swap cluster too.  If the PMD swap mapping
count becomes 0, the huge swap cluster will be split.

Notice: is_huge_zero_pmd() and pmd_page() don't work well with a swap
PMD, so a pmd_present() check is done before calling them.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/swap.h |  6 ++
 mm/huge_memory.c | 58 +++-
 mm/swapfile.c| 28 +
 3 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7ed2c727c9b6..bb9de2cb952a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -618,11 +618,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index feba371169ca..2d615328d77f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1602,6 +1602,47 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t 
pmd)
return 0;
 }
 
+#ifdef CONFIG_THP_SWAP
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, haddr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+#else
+static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
+unsigned long haddr,
+pmd_t *pmd)
+{
+}
+#endif
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2068,7 +2109,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2090,8 +2131,11 @@ static void __split_huge_pmd_locked(struct 
vm_area_struct *vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
+* is_huge_zero_pmd() may return true for PMD swap
+* entry, so checking pmd_present() firstly.
+*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
 * __split_huge_pmd() ?
@@ -2134,6 +2178,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
 #endif
+   if (thp_swap_supported() && is_swap_pmd(old_pmd))
+   return __split_huge_swap_pmd(vma, haddr, pmd);
+   else
page = pmd_page(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2225,14 +2272,15 @@ void __s

[PATCH -mm -v4 10/21] mm, THP, swap: Support to count THP swapin and its fallback

2018-06-21 Thread Huang, Ying
From: Huang Ying 

Two new /proc/vmstat fields are added, "thp_swpin" and
"thp_swpin_fallback", to count swapping in a THP from the swap device
as a whole and falling back to normal page swapin.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 Documentation/admin-guide/mm/transhuge.rst |  8 
 include/linux/vm_event_item.h  |  2 ++
 mm/huge_memory.c   |  4 +++-
 mm/page_io.c   | 15 ---
 mm/vmstat.c|  2 ++
 5 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst 
b/Documentation/admin-guide/mm/transhuge.rst
index 7ab93a8404b9..85e33f785fd7 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -364,6 +364,14 @@ thp_swpout_fallback
Usually because failed to allocate some continuous swap space
for the huge page.
 
+thp_swpin
+   is incremented every time a huge page is swapped in in one piece
+   without splitting.
+
+thp_swpin_fallback
+   is incremented if a huge page has to be split during swapin.
+   Usually this is because a huge page could not be allocated.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 5c7f010676a7..7b438548a78e 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -88,6 +88,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_ZERO_PAGE_ALLOC_FAILED,
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
+   THP_SWPIN,
+   THP_SWPIN_FALLBACK,
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
BALLOON_INFLATE,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ac79ae2ab257..8d49a11e83ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1664,8 +1664,10 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
/* swapoff occurs under us */
} else if (ret == -EINVAL)
ret = 0;
-   else
+   else {
+   count_vm_event(THP_SWPIN_FALLBACK);
goto fallback;
+   }
}
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto out;
diff --git a/mm/page_io.c b/mm/page_io.c
index b41cf9644585..96277058681e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -347,6 +347,15 @@ int __swap_writepage(struct page *page, struct 
writeback_control *wbc,
return ret;
 }
 
+static inline void count_swpin_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   if (unlikely(PageTransHuge(page)))
+   count_vm_event(THP_SWPIN);
+#endif
+   count_vm_events(PSWPIN, hpage_nr_pages(page));
+}
+
 int swap_readpage(struct page *page, bool synchronous)
 {
struct bio *bio;
@@ -370,7 +379,7 @@ int swap_readpage(struct page *page, bool synchronous)
 
ret = mapping->a_ops->readpage(swap_file, page);
if (!ret)
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
return ret;
}
 
@@ -381,7 +390,7 @@ int swap_readpage(struct page *page, bool synchronous)
unlock_page(page);
}
 
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
return 0;
}
 
@@ -400,7 +409,7 @@ int swap_readpage(struct page *page, bool synchronous)
get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
bio_get(bio);
qc = submit_bio(bio);
while (synchronous) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 75eda9c2b260..259c7bddbb6e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1263,6 +1263,8 @@ const char * const vmstat_text[] = {
"thp_zero_page_alloc_failed",
"thp_swpout",
"thp_swpout_fallback",
+   "thp_swpin",
+   "thp_swpin_fallback",
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
"balloon_inflate",
-- 
2.16.4



[PATCH -mm -v4 09/21] mm, THP, swap: Swapin a THP as a whole

2018-06-21 Thread Huang, Ying
From: Huang Ying 

With this patch, when the page fault handler finds a PMD swap mapping, it
will swap in a THP as a whole.  This avoids the overhead of
splitting/collapsing before/after the THP swapping, and improves the
swap performance greatly due to the reduced page fault count, etc.

do_huge_pmd_swap_page() is added in the patch to implement this.  It
is similar to do_swap_page() for normal page swapin.

If allocating a THP fails, the huge swap cluster and the PMD swap
mapping will be split to fall back to normal page swapin.

If the huge swap cluster has been split already, the PMD swap mapping
will be split to fall back to normal page swapin.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |   9 +++
 include/linux/swap.h|   9 +++
 mm/huge_memory.c| 170 
 mm/memory.c |  16 +++--
 4 files changed, 198 insertions(+), 6 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c5b8af173f67..42117b75de2d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -403,4 +403,13 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifdef CONFIG_THP_SWAP
+extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+#else /* CONFIG_THP_SWAP */
+static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+   return 0;
+}
+#endif /* CONFIG_THP_SWAP */
+
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d2e017dd7bbd..5832a750baed 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -560,6 +560,15 @@ static inline struct page *lookup_swap_cache(swp_entry_t 
swp,
return NULL;
 }
 
+static inline struct page *read_swap_cache_async(swp_entry_t swp,
+gfp_t gft_mask,
+struct vm_area_struct *vma,
+unsigned long addr,
+bool do_poll)
+{
+   return NULL;
+}
+
 static inline int add_to_swap(struct page *page)
 {
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 275a4e616ec9..ac79ae2ab257 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -1609,6 +1611,174 @@ static void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
 }
+
+static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+  unsigned long address, pmd_t orig_pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   spinlock_t *ptl;
+   int ret = 0;
+
+   ptl = pmd_lock(mm, pmd);
+   if (pmd_same(*pmd, orig_pmd))
+   __split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd);
+   else
+   ret = -ENOENT;
+   spin_unlock(ptl);
+
+   return ret;
+}
+
+int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+   struct page *page;
+   struct mem_cgroup *memcg;
+   struct vm_area_struct *vma = vmf->vma;
+   unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+   swp_entry_t entry;
+   pmd_t pmd;
+   int i, locked, exclusive = 0, ret = 0;
+
+   entry = pmd_to_swp_entry(orig_pmd);
+   VM_BUG_ON(non_swap_entry(entry));
+   delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+retry:
+   page = lookup_swap_cache(entry, NULL, vmf->address);
+   if (!page) {
+   page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma,
+haddr, false);
+   if (!page) {
+   /*
+* Back out if somebody else faulted in this pmd
+* while we released the pmd lock.
+*/
+   if (likely(pmd_same(*vmf->pmd, orig_pmd))) {
+   ret = split_swap_cluster(entry, false);
+   /*
+* Retry if somebody else swap in the swap
+* entry
+*/
+   if (ret == -EEXIST) {
+   ret = 0;
+   goto retry;
+   /* swapoff occurs under us */
+   } else if (ret == -EINVAL)
+   ret = 0;
+   else
+

[PATCH -mm -v4 08/21] mm, THP, swap: Support to read a huge swap cluster for swapin a THP

2018-06-21 Thread Huang, Ying
From: Huang Ying 

To swap in a THP as a whole, we need to read a huge swap cluster from
the swap device.  This patch revises __read_swap_cache_async() and
its callers and callees to support this.  If __read_swap_cache_async()
finds that the swap cluster of the specified swap entry is huge, it will
try to allocate a THP and add it into the swap cache, so that later the
contents of the huge swap cluster can be read into the THP.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h | 38 
 include/linux/swap.h|  4 +--
 mm/huge_memory.c| 26 -
 mm/swap_state.c | 77 ++---
 mm/swapfile.c   | 11 ---
 5 files changed, 100 insertions(+), 56 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 213d32e57c39..c5b8af173f67 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -251,6 +251,39 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
 }
 
+/*
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ *   fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ * available
+ * never: never stall for any thp allocation
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+   bool vma_madvised;
+
+   if (!vma)
+   return GFP_TRANSHUGE_LIGHT;
+   vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE_LIGHT |
+   (vma_madvised ? __GFP_DIRECT_RECLAIM :
+   __GFP_KSWAPD_RECLAIM);
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+&transparent_hugepage_flags))
+   return GFP_TRANSHUGE_LIGHT |
+   (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+   return GFP_TRANSHUGE_LIGHT;
+}
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -363,6 +396,11 @@ static inline bool thp_migration_supported(void)
 {
return false;
 }
+
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+   return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 878f132dabc0..d2e017dd7bbd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -462,7 +462,7 @@ extern sector_t map_swap_page(struct page *, struct 
block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int __swap_count(swp_entry_t entry);
-extern int __swp_swapcount(swp_entry_t entry);
+extern int __swp_swapcount(swp_entry_t entry, bool *huge_cluster);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
@@ -589,7 +589,7 @@ static inline int __swap_count(swp_entry_t entry)
return 0;
 }
 
-static inline int __swp_swapcount(swp_entry_t entry)
+static inline int __swp_swapcount(swp_entry_t entry, bool *huge_cluster)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 586d8693b8af..275a4e616ec9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -620,32 +620,6 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault 
*vmf, struct page *page,
 
 }
 
-/*
- * always: directly stall for all thp allocations
- * defer: wake kswapd and fail if not immediately available
- * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
- *   fail if not immediately available
- * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
- * available
- * never: never stall for any thp allocation
- */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
-{
-   const bool vma_madvised = !!(vma->v

[PATCH -mm -v4 07/21] mm, THP, swap: Support PMD swap mapping in split_swap_cluster()

2018-06-21 Thread Huang, Ying
From: Huang Ying 

When splitting a THP in the swap cache or failing to allocate a THP when
swapping in a huge swap cluster, the huge swap cluster will be split.  In
addition to clearing the huge flag of the swap cluster, the PMD swap
mapping count recorded in cluster_count() will be set to 0.  But we
will not touch the PMD swap mappings themselves, because it is sometimes
hard to find them all.  When the PMD swap mappings are operated on
later, it will be found that the huge swap cluster has been split, and
the PMD swap mappings will be split at that time.

Unless splitting a THP in the swap cache (specified via the "force"
parameter), split_swap_cluster() will return -EEXIST if the
SWAP_HAS_CACHE flag is set in swap_map[offset], because this indicates
there is a THP corresponding to this huge swap cluster, and it isn't
desirable to split the THP.

When splitting a THP in the swap cache, the call to
split_swap_cluster() is moved to before unlocking the sub-pages, so
that all sub-pages are kept locked from the time the THP is split until
the huge swap cluster is split.  This makes the code much easier to
reason about.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/swap.h |  4 ++--
 mm/huge_memory.c | 18 --
 mm/swapfile.c| 45 ++---
 3 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bb9de2cb952a..878f132dabc0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -617,10 +617,10 @@ static inline swp_entry_t get_swap_page(struct page *page)
 #endif /* CONFIG_SWAP */
 
 #ifdef CONFIG_THP_SWAP
-extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster(swp_entry_t entry, bool force);
 extern int split_swap_cluster_map(swp_entry_t entry);
 #else
-static inline int split_swap_cluster(swp_entry_t entry)
+static inline int split_swap_cluster(swp_entry_t entry, bool force)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2d615328d77f..586d8693b8af 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2502,6 +2502,17 @@ static void __split_huge_page(struct page *page, struct 
list_head *list,
 
unfreeze_page(head);
 
+   /*
+* Split swap cluster before unlocking sub-pages.  So all
+* sub-pages will be kept locked from THP has been split to
+* swap cluster is split.
+*/
+   if (PageSwapCache(head)) {
+   swp_entry_t entry = { .val = page_private(head) };
+
+   split_swap_cluster(entry, true);
+   }
+
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
if (subpage == page)
@@ -2728,12 +2739,7 @@ int split_huge_page_to_list(struct page *page, struct 
list_head *list)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
-   if (PageSwapCache(head)) {
-   swp_entry_t entry = { .val = page_private(head) };
-
-   ret = split_swap_cluster(entry);
-   } else
-   ret = 0;
+   ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a0141307f3ac..5ff2da89b77c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1410,21 +1410,6 @@ static void swapcache_free_cluster(swp_entry_t entry)
}
}
 }
-
-int split_swap_cluster(swp_entry_t entry)
-{
-   struct swap_info_struct *si;
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
-
-   si = _swap_info_get(entry);
-   if (!si)
-   return -EBUSY;
-   ci = lock_cluster(si, offset);
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   return 0;
-}
 #else
 static inline void swapcache_free_cluster(swp_entry_t entry)
 {
@@ -4069,6 +4054,36 @@ int split_swap_cluster_map(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
+
+int split_swap_cluster(swp_entry_t entry, bool force)
+{
+   struct swap_info_struct *si;
+   struct swap_cluster_info *ci;
+   unsigned long offset = swp_offset(entry);
+   int ret = 0;
+
+   si = get_swap_device(entry);
+   if (!si)
+   return -EINVAL;
+   ci = lock_cluster(si, offset);
+   /* The swap cluster has been split by someone else */
+   if (!cluster_is_huge(ci))
+   goto out;
+   VM_BUG_ON(!is_cluster_offset(offset));
+   VM_

[PATCH -mm -v4 17/21] mm, THP, swap: Support PMD swap mapping for MADV_WILLNEED

2018-06-21 Thread Huang, Ying
From: Huang Ying 

During MADV_WILLNEED, for a PMD swap mapping, if THP swapin is enabled
for the VMA, the whole swap cluster will be swapped in.  Otherwise, the
huge swap cluster and the PMD swap mapping will be split to fall back
to PTE swap mappings.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/madvise.c | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index fc423d96d4d6..6c7a96357626 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -196,14 +196,36 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned 
long start,
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
+   swp_entry_t entry;
+   struct page *page;
+   pmd_t pmdval;
+
+   pmdval = *pmd;
+   if (thp_swap_supported() && is_swap_pmd(pmdval) &&
+   !is_pmd_migration_entry(pmdval)) {
+   entry = pmd_to_swp_entry(pmdval);
+   if (!transparent_hugepage_swapin_enabled(vma)) {
+   if (!split_swap_cluster(entry, false))
+   split_huge_swap_pmd(vma, pmd, start, pmdval);
+   } else {
+   page = read_swap_cache_async(entry,
+GFP_HIGHUSER_MOVABLE,
+vma, start, false);
+   /* The swap cluster has been split under us */
+   if (page) {
+   if (!PageTransHuge(page))
+   split_huge_swap_pmd(vma, pmd, start,
+   pmdval);
+   put_page(page);
+   }
+   }
+   }
 
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
 
for (index = start; index != end; index += PAGE_SIZE) {
pte_t pte;
-   swp_entry_t entry;
-   struct page *page;
spinlock_t *ptl;
 
orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
-- 
2.16.4



[PATCH -mm -v4 14/21] mm, cgroup, THP, swap: Support to move swap account for PMD swap mapping

2018-06-21 Thread Huang, Ying
From: Huang Ying 

Previously, the huge swap cluster was split after the THP was swapped
out.  Now, to support swapping in the THP as a whole, the huge swap
cluster will not be split after the THP is reclaimed.  So in memcg, we
need to move the swap account for PMD swap mappings in the process's
page table.

When the page table is scanned while moving the memcg charge, PMD swap
mappings will be identified, and mem_cgroup_move_swap_account() and its
callees are revised to move the account for the whole huge swap
cluster.  If the swap cluster mapped by the PMD has been split, the PMD
swap mapping will be split and fall back to PTE processing, because
the swap slots of the swap cluster may have been swapped in or moved to
another cgroup already.

Because there is no way to prevent a huge swap cluster from being split
except when it has the SWAP_HAS_CACHE flag set, it is possible for the
huge swap cluster to be split and the charge for the swap slots inside
to be changed after we check the PMD swap mapping and the huge swap
cluster but before we commit the charge moving.  But the race window is
so small that we will just ignore the race.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |   9 +++
 include/linux/swap.h|   6 ++
 include/linux/swap_cgroup.h |   3 +-
 mm/huge_memory.c|  12 +---
 mm/memcontrol.c | 138 +++-
 mm/swap_cgroup.c|  45 ---
 mm/swapfile.c   |  12 
 7 files changed, 180 insertions(+), 45 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index bc92c2944756..8e706590fbc1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -406,6 +406,9 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_THP_SWAP
+extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd);
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
@@ -433,6 +436,12 @@ static inline bool transparent_hugepage_swapin_enabled(
return false;
 }
 #else /* CONFIG_THP_SWAP */
+static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
+unsigned long haddr,
+pmd_t *pmd)
+{
+}
+
 static inline int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  unsigned long address, pmd_t orig_pmd)
 {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5832a750baed..88677acdcff6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -628,6 +628,7 @@ static inline swp_entry_t get_swap_page(struct page *page)
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry, bool force);
 extern int split_swap_cluster_map(swp_entry_t entry);
+extern bool in_huge_swap_cluster(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry, bool force)
 {
@@ -638,6 +639,11 @@ static inline int split_swap_cluster_map(swp_entry_t entry)
 {
return 0;
 }
+
+static inline bool in_huge_swap_cluster(swp_entry_t entry)
+{
+   return false;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index a12dd1c3966c..c40fb52b0563 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -7,7 +7,8 @@
 #ifdef CONFIG_MEMCG_SWAP
 
 extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-   unsigned short old, unsigned short new);
+   unsigned short old, unsigned short new,
+   unsigned int nr_ents);
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 unsigned int nr_ents);
 extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fe946ca09b28..adad32d54ff2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1630,9 +1630,9 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 }
 
 #ifdef CONFIG_THP_SWAP
-static void __split_huge_swap_pmd(struct vm_area_struct *vma,
- unsigned long haddr,
- pmd_t *pmd)
+void __split_huge_swap_pmd(struct vm_area_struct *vma,
+  unsigned long haddr,
+  pmd_t *pmd)
 {
s

[PATCH -mm -v4 12/21] mm, THP, swap: Support PMD swap mapping in swapoff

2018-06-21 Thread Huang, Ying
From: Huang Ying 

During swapoff, for a huge swap cluster, we need to allocate a THP,
read its contents into the THP and unuse the PMD and PTE swap mappings
to it.  If allocating a THP fails, the huge swap cluster will be
split.

During unuse, if it is found that the swap cluster mapped by a PMD
swap mapping has already been split, we will split the PMD swap mapping
and unuse the PTEs.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/asm-generic/pgtable.h | 15 ++--
 include/linux/huge_mm.h   |  8 
 mm/huge_memory.c  |  4 +-
 mm/swapfile.c | 86 ++-
 4 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bb8354981a36..caa381962cd2 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -931,22 +931,13 @@ static inline int 
pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
barrier();
 #endif
/*
-* !pmd_present() checks for pmd migration entries
-*
-* The complete check uses is_pmd_migration_entry() in linux/swapops.h
-* But using that requires moving current function and 
pmd_trans_unstable()
-* to linux/swapops.h to resovle dependency, which is too much code 
move.
-*
-* !pmd_present() is equivalent to is_pmd_migration_entry() currently,
-* because !pmd_present() pages can only be under migration not swapped
-* out.
-*
-* pmd_none() is preseved for future condition checks on pmd migration
+* pmd_none() is preseved for future condition checks on pmd swap
 * entries and not confusing with this function name, although it is
 * redundant with !pmd_present().
 */
if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
-   (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && 
!pmd_present(pmdval)))
+   ((IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) ||
+ IS_ENABLED(CONFIG_THP_SWAP)) && !pmd_present(pmdval)))
return 1;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7931fa888f11..bc92c2944756 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -406,6 +406,8 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_THP_SWAP
+extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+  unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 static inline bool transparent_hugepage_swapin_enabled(
@@ -431,6 +433,12 @@ static inline bool transparent_hugepage_swapin_enabled(
return false;
 }
 #else /* CONFIG_THP_SWAP */
+static inline int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address, pmd_t orig_pmd)
+{
+   return 0;
+}
+
 static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index da42d1cdc26a..73fc77633642 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1663,8 +1663,8 @@ static void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
pmd_populate(mm, pmd, pgtable);
 }
 
-static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-  unsigned long address, pmd_t orig_pmd)
+int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+   unsigned long address, pmd_t orig_pmd)
 {
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e1e43654407c..34e64f3570c3 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1933,6 +1933,11 @@ static inline int pte_same_as_swp(pte_t pte, pte_t 
swp_pte)
return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
 }
 
+static inline int pmd_same_as_swp(pmd_t pmd, pmd_t swp_pmd)
+{
+   return pmd_same(pmd_swp_clear_soft_dirty(pmd), swp_pmd);
+}
+
 /*
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
@@ -1994,6 +1999,57 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t 
*pmd,
return ret;
 }
 
+#ifdef CONFIG_THP_SWAP
+static int unuse_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+unsigned long addr, swp_entry_t entry, struct page *page)
+{
+   struct mem_cgroup *memcg;
+   struct swap_info_struct *si;
+   spinlock_t 

[PATCH -mm -v4 21/21] mm, THP: Avoid to split THP when reclaim MADV_FREE THP

2018-06-21 Thread Huang, Ying
From: Huang Ying 

Previously, to reclaim a MADV_FREE THP, the THP was split first, then
each sub-page was reclaimed.  This wastes cycles splitting the THP and
unmapping and freeing each sub-page, and splits the THP even if it has
been written to since MADV_FREE.  We had to do this because MADV_FREE
THP reclaim shares the same try_to_unmap() call with swap, while swap
needed to split the PMD page mapping at that time.  Now that swap can
process the PMD mapping, it is easy to avoid splitting the THP when a
MADV_FREE THP is reclaimed.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 41 -
 mm/vmscan.c  |  3 ++-
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 195f24040b41..7c6edd897f15 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1671,6 +1671,15 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t 
pmd)
return 0;
 }
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+   pgtable_t pgtable;
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pte_free(mm, pgtable);
+   mm_dec_nr_ptes(mm);
+}
+
 #ifdef CONFIG_THP_SWAP
 void __split_huge_swap_pmd(struct vm_area_struct *vma,
   unsigned long haddr,
@@ -1885,6 +1894,28 @@ bool set_pmd_swap_entry(struct page_vma_mapped_walk 
*pvmw, struct page *page,
pmd_t swp_pmd;
swp_entry_t entry = { .val = page_private(page) };
 
+   if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+   WARN_ON_ONCE(1);
+   return false;
+   }
+
+   /* MADV_FREE page check */
+   if (!PageSwapBacked(page)) {
+   if (!PageDirty(page)) {
+   zap_deposited_table(mm, pvmw->pmd);
+   add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+   goto out_remove_rmap;
+   }
+
+   /*
+* If the page was redirtied, it cannot be
+* discarded. Remap the page to page table.
+*/
+   set_pmd_at(mm, address, pvmw->pmd, pmdval);
+   SetPageSwapBacked(page);
+   return false;
+   }
+
if (swap_duplicate(&entry, true) < 0) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
return false;
@@ -1902,21 +1933,13 @@ bool set_pmd_swap_entry(struct page_vma_mapped_walk 
*pvmw, struct page *page,
swp_pmd = pmd_swp_mksoft_dirty(swp_pmd);
set_pmd_at(mm, address, pvmw->pmd, swp_pmd);
 
+out_remove_rmap:
page_remove_rmap(page, true);
put_page(page);
return true;
 }
 #endif
 
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
-{
-   pgtable_t pgtable;
-
-   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-   pte_free(mm, pgtable);
-   mm_dec_nr_ptes(mm);
-}
-
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 891d3c7b8f21..80d0150dd028 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1137,7 +1137,8 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
/* Adding to swap updated mapping */
mapping = page_mapping(page);
}
-   } else if (unlikely(PageTransHuge(page))) {
+   } else if (unlikely(PageTransHuge(page)) &&
+  (!thp_swap_supported() || !PageAnon(page))) {
/* Split file THP */
if (split_huge_page_to_list(page, page_list))
goto keep_locked;
-- 
2.16.4



[PATCH -mm -v4 19/21] mm, THP, swap: Support PMD swap mapping in common path

2018-06-21 Thread Huang, Ying
From: Huang Ying 

The original code is only for PMD migration entries; it is revised to
support PMD swap mappings as well.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 fs/proc/task_mmu.c |  8 
 mm/gup.c   | 34 ++
 mm/huge_memory.c   |  6 +++---
 mm/mempolicy.c |  2 +-
 4 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e9679016271f..afcf6ac57219 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -978,7 +978,7 @@ static inline void clear_soft_dirty_pmd(struct 
vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
 
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
-   } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+   } else if (is_swap_pmd(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@@ -1309,7 +1309,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long 
addr, unsigned long end,
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
}
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+#if defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) || defined(CONFIG_THP_SWAP)
else if (is_swap_pmd(pmd)) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
unsigned long offset;
@@ -1323,8 +1323,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long 
addr, unsigned long end,
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
-   VM_BUG_ON(!is_pmd_migration_entry(pmd));
-   page = migration_entry_to_page(entry);
+   if (is_pmd_migration_entry(pmd))
+   page = migration_entry_to_page(entry);
}
 #endif
 
diff --git a/mm/gup.c b/mm/gup.c
index b70d7ba7cc13..84ba4ad8120d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -216,6 +216,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
+   swp_entry_t entry;
 
pmd = pmd_offset(pudp, address);
/*
@@ -243,18 +244,21 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
if (!pmd_present(pmdval)) {
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
-   VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(pmdval));
-   if (is_pmd_migration_entry(pmdval))
+   entry = pmd_to_swp_entry(pmdval);
+   if (thp_migration_supported() && is_migration_entry(entry)) {
pmd_migration_entry_wait(mm, pmd);
-   pmdval = READ_ONCE(*pmd);
-   /*
-* MADV_DONTNEED may convert the pmd to null because
-* mmap_sem is held in read mode
-*/
-   if (pmd_none(pmdval))
+   pmdval = READ_ONCE(*pmd);
+   /*
+* MADV_DONTNEED may convert the pmd to null because
+* mmap_sem is held in read mode
+*/
+   if (pmd_none(pmdval))
+   return no_page_table(vma, flags);
+   goto retry;
+   }
+   if (thp_swap_supported() && !non_swap_entry(entry))
return no_page_table(vma, flags);
-   goto retry;
+   VM_BUG_ON(1);
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
@@ -276,11 +280,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
return no_page_table(vma, flags);
}
if (unlikely(!pmd_present(*pmd))) {
+   entry = pmd_to_swp_entry(*pmd);
spin_unlock(ptl);
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
-   pmd_migration_entry_wait(mm, pmd);
-   goto retry_locked;
+   if (thp_migration_supported() && is_migration_entry(entry)) {
+   pmd_migration_entry_wait(mm, pmd);
+   goto retry_locked;
+   }
+   if (thp_swap_supported() && !non_swap_entry(entry))
+   return no_page_table(vma, flags);
+   VM_BUG_ON(1);
 

[PATCH -mm -v4 11/21] mm, THP, swap: Add sysfs interface to configure THP swapin

2018-06-21 Thread Huang, Ying
From: Huang Ying 

Swapping in a THP as a whole isn't desirable in some situations.  For
example, for a random access pattern, swapping in a THP as a whole
greatly inflates the amount of data read.  So a sysfs interface,
/sys/kernel/mm/transparent_hugepage/swapin_enabled, is added to
configure it.  The following three options are provided,

- always: THP swapin will always be enabled

- madvise: THP swapin will be enabled only for VMAs with the
  VM_HUGEPAGE flag set.

- never: THP swapin will always be disabled

The default configuration is: madvise.

During a page fault, if a PMD swap mapping is found and THP swapin is
disabled, the huge swap cluster and the PMD swap mapping will be split
and the fault will fall back to normal page swapin.
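
As an illustration of how the knob would be used from user space (a
minimal sketch assuming a kernel with this series applied; the sysfs
write needs root, and error handling is abbreviated):

/* Illustrative only: select the "madvise" policy, then opt a mapping in
 * to THP (and hence THP swapin) with MADV_HUGEPAGE. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define SWAPIN_CTL "/sys/kernel/mm/transparent_hugepage/swapin_enabled"
#define SZ (2UL << 20)	/* one PMD-sized region (2MB on x86-64) */

int main(void)
{
	int fd = open(SWAPIN_CTL, O_WRONLY);

	if (fd < 0 || write(fd, "madvise", 7) != 7)
		perror("swapin_enabled");	/* knob absent or not root */
	if (fd >= 0)
		close(fd);

	void *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* VM_HUGEPAGE on this VMA makes it eligible for THP swapin under
	 * the "madvise" policy (PMD alignment is not forced here). */
	if (madvise(p, SZ, MADV_HUGEPAGE))
		perror("madvise");

	memset(p, 0x5a, SZ);	/* touch the region */
	munmap(p, SZ);
	return 0;
}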

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 Documentation/admin-guide/mm/transhuge.rst | 21 +++
 include/linux/huge_mm.h| 31 +++
 mm/huge_memory.c   | 89 --
 3 files changed, 123 insertions(+), 18 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst 
b/Documentation/admin-guide/mm/transhuge.rst
index 85e33f785fd7..23aefb17101c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -160,6 +160,27 @@ Some userspace (such as a test program, or an optimized 
memory allocation
 
cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 
+Transparent hugepages may be swapped out and swapped in in one piece,
+without splitting.  This improves the utilization of transparent
+hugepages but may also inflate reads and writes.  Whether to swap in a
+transparent hugepage in one piece can be configured as follows:
+
+   echo always >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+   echo madvise >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+   echo never >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+
+always
+   Attempt to allocate a transparent huge page and read it from
+   swap space in one piece every time.
+
+never
+   Always split the huge swap cluster and the PMD swap mapping and
+   swap in only the faulting normal page.
+
+madvise
+   Only swap in the transparent huge page in one piece for
+   MADV_HUGEPAGE madvise regions.
+
 khugepaged will be automatically started when
transparent_hugepage/enabled is set to "always" or "madvise", and it'll
 be automatically shutdown if it's set to "never".
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 42117b75de2d..7931fa888f11 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -63,6 +63,8 @@ enum transparent_hugepage_flag {
 #ifdef CONFIG_DEBUG_VM
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
 #endif
+   TRANSPARENT_HUGEPAGE_SWAPIN_FLAG,
+   TRANSPARENT_HUGEPAGE_SWAPIN_REQ_MADV_FLAG,
 };
 
 struct kobject;
@@ -405,11 +407,40 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 
 #ifdef CONFIG_THP_SWAP
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+
+static inline bool transparent_hugepage_swapin_enabled(
+   struct vm_area_struct *vma)
+{
+   if (vma->vm_flags & VM_NOHUGEPAGE)
+   return false;
+
+   if (is_vma_temporary_stack(vma))
+   return false;
+
+   if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+   return false;
+
+   if (transparent_hugepage_flags &
+   (1 << TRANSPARENT_HUGEPAGE_SWAPIN_FLAG))
+   return true;
+
+   if (transparent_hugepage_flags &
+   (1 << TRANSPARENT_HUGEPAGE_SWAPIN_REQ_MADV_FLAG))
+   return !!(vma->vm_flags & VM_HUGEPAGE);
+
+   return false;
+}
 #else /* CONFIG_THP_SWAP */
 static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
return 0;
 }
+
+static inline bool transparent_hugepage_swapin_enabled(
+   struct vm_area_struct *vma)
+{
+   return false;
+}
 #endif /* CONFIG_THP_SWAP */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8d49a11e83ee..da42d1cdc26a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -57,7 +57,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 #endif
(1<address);
if (!page) {
+   if (!transparent_hugepage_swapin_enabled(vma))
+   goto split;
+
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma,
 haddr, false);
if (!page) {
@@ -1652,23 +1706,8 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
  

[PATCH -mm -v4 18/21] mm, THP, swap: Support PMD swap mapping in mincore()

2018-06-21 Thread Huang, Ying
From: Huang Ying 

During mincore(), for a PMD swap mapping, the swap cache is looked up.
If the resulting page isn't a compound page, the PMD swap mapping will
be split and processing falls back to the PTE swap mapping path.
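
From user space nothing changes: mincore() keeps reporting per-page
residency, now answered from the swap cache lookup described above for
a PMD swap mapping.  A minimal sketch of such a query (illustrative
only, not specific to this patch):

/* Illustrative only: query per-page residency of a PMD-sized anonymous
 * region.  With this patch a PMD swap mapping is answered from the swap
 * cache without splitting the mapping. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = 2UL << 20;			/* one PMD-sized region */
	size_t npages = len / page;
	size_t resident = 0;
	unsigned char *vec = malloc(npages);

	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (!vec || p == MAP_FAILED) {
		perror("setup");
		return 1;
	}
	memset(p, 1, len);			/* fault the pages in */

	/* Each byte of vec[] has bit 0 set if the page is resident. */
	if (mincore(p, len, vec)) {
		perror("mincore");
		return 1;
	}
	for (size_t i = 0; i < npages; i++)
		resident += vec[i] & 1;
	printf("%zu of %zu pages resident\n", resident, npages);

	free(vec);
	munmap(p, len);
	return 0;
}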

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/mincore.c | 37 +++--
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index a66f2052c7b1..897dd2c187e8 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -48,7 +48,8 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, 
unsigned long addr,
  * and is up to date; i.e. that no page-in operation would be required
  * at this time if an application were to map and access this page.
  */
-static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff,
+ bool *compound)
 {
unsigned char present = 0;
struct page *page;
@@ -86,6 +87,8 @@ static unsigned char mincore_page(struct address_space 
*mapping, pgoff_t pgoff)
 #endif
if (page) {
present = PageUptodate(page);
+   if (compound)
+   *compound = PageCompound(page);
put_page(page);
}
 
@@ -103,7 +106,8 @@ static int __mincore_unmapped_range(unsigned long addr, 
unsigned long end,
 
pgoff = linear_page_index(vma, addr);
for (i = 0; i < nr; i++, pgoff++)
-   vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+   vec[i] = mincore_page(vma->vm_file->f_mapping,
+ pgoff, NULL);
} else {
for (i = 0; i < nr; i++)
vec[i] = 0;
@@ -127,14 +131,36 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+   swp_entry_t entry;
 
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
-   memset(vec, 1, nr);
+   unsigned char val = 1;
+   bool compound;
+
+   if (thp_swap_supported() && is_swap_pmd(*pmd)) {
+   entry = pmd_to_swp_entry(*pmd);
+   if (!non_swap_entry(entry)) {
+   val = mincore_page(swap_address_space(entry),
+  swp_offset(entry),
+  &compound);
+   /*
+* The huge swap cluster has been
+* split under us
+*/
+   if (!compound) {
+   __split_huge_swap_pmd(vma, addr, pmd);
+   spin_unlock(ptl);
+   goto fallback;
+   }
+   }
+   }
+   memset(vec, val, nr);
spin_unlock(ptl);
goto out;
}
 
+fallback:
if (pmd_trans_unstable(pmd)) {
__mincore_unmapped_range(addr, end, vma, vec);
goto out;
@@ -150,8 +176,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
else if (pte_present(pte))
*vec = 1;
else { /* pte is a swap entry */
-   swp_entry_t entry = pte_to_swp_entry(pte);
-
+   entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
/*
 * migration or hwpoison entries are always
@@ -161,7 +186,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
} else {
 #ifdef CONFIG_SWAP
*vec = mincore_page(swap_address_space(entry),
-   swp_offset(entry));
+   swp_offset(entry), NULL);
 #else
WARN_ON(1);
*vec = 1;
-- 
2.16.4



[PATCH -mm -v4 20/21] mm, THP, swap: create PMD swap mapping when unmap the THP

2018-06-21 Thread Huang, Ying
From: Huang Ying 

This is the final step of the THP swapin support.  When reclaiming a
anonymous THP, after allocating the huge swap cluster and add the THP
into swap cache, the PMD page mapping will be changed to the mapping
to the swap space.  Previously, the PMD page mapping will be split
before being changed.  In this patch, the unmap code is enhanced not
to split the PMD mapping, but create a PMD swap mapping to replace it
instead.  So later when clear the SWAP_HAS_CACHE flag in the last step
of swapout, the huge swap cluster will be kept instead of being split,
and when swapin, the huge swap cluster will be read as a whole into a
THP.  That is, the THP will not be split during swapout/swapin.  This
can eliminate the overhead of splitting/collapsing, and reduce the
page fault count, etc.  But more important, the utilization of THP is
improved greatly, that is, much more THP will be kept when swapping is
used, so that we can take full advantage of THP including its high
performance for swapout/swapin.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h | 11 +++
 mm/huge_memory.c| 30 ++
 mm/rmap.c   | 43 +--
 mm/vmscan.c |  6 +-
 4 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8e706590fbc1..28e46f078e73 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -405,6 +405,8 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+struct page_vma_mapped_walk;
+
 #ifdef CONFIG_THP_SWAP
 extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
  unsigned long haddr,
@@ -412,6 +414,8 @@ extern void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+extern bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+   struct page *page, unsigned long address, pmd_t pmdval);
 
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
@@ -453,6 +457,13 @@ static inline int do_huge_pmd_swap_page(struct vm_fault 
*vmf, pmd_t orig_pmd)
return 0;
 }
 
+static inline bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page, unsigned long address,
+ pmd_t pmdval)
+{
+   return false;
+}
+
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e50adc6b59b2..195f24040b41 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1876,6 +1876,36 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
count_vm_event(THP_SWPIN_FALLBACK);
goto fallback;
 }
+
+bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw, struct page *page,
+   unsigned long address, pmd_t pmdval)
+{
+   struct vm_area_struct *vma = pvmw->vma;
+   struct mm_struct *mm = vma->vm_mm;
+   pmd_t swp_pmd;
+   swp_entry_t entry = { .val = page_private(page) };
+
+   if (swap_duplicate(&entry, true) < 0) {
+   set_pmd_at(mm, address, pvmw->pmd, pmdval);
+   return false;
+   }
+   if (list_empty(&mm->mmlist)) {
+   spin_lock(&mmlist_lock);
+   if (list_empty(&mm->mmlist))
+   list_add(&mm->mmlist, &init_mm.mmlist);
+   spin_unlock(&mmlist_lock);
+   }
+   add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+   add_mm_counter(mm, MM_SWAPENTS, HPAGE_PMD_NR);
+   swp_pmd = swp_entry_to_pmd(entry);
+   if (pmd_soft_dirty(pmdval))
+   swp_pmd = pmd_swp_mksoft_dirty(swp_pmd);
+   set_pmd_at(mm, address, pvmw->pmd, swp_pmd);
+
+   page_remove_rmap(page, true);
+   put_page(page);
+   return true;
+}
 #endif
 
 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
diff --git a/mm/rmap.c b/mm/rmap.c
index 5f45d6325c40..4861b1a86e2a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1402,12 +1402,51 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
continue;
}
 
+   address = pvmw.address;
+
+#ifdef CONFIG_THP_SWAP
+   /* PMD-mapped THP swap entry */
+   if (thp_swap_supported(

[PATCH -mm -v4 16/21] mm, THP, swap: Free PMD swap mapping when zap_huge_pmd()

2018-06-21 Thread Huang, Ying
From: Huang Ying 

For a PMD swap mapping, zap_huge_pmd() will clear the PMD and call
free_swap_and_cache() to decrease the swap reference count and maybe
free or split the huge swap cluster and the THP in swap cache.
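
The new branch only needs to classify the non-present PMD: a migration
entry still yields a page to account against, a real swap entry
releases the reference on the whole huge swap cluster, and anything
else warns.  A stubbed user-space sketch of that decision structure
(the helpers below are stand-ins, not the kernel API):

/* Stubbed user-space model of the decision added to zap_huge_pmd() for
 * a non-present PMD.  The helpers below are stand-ins for the kernel
 * predicates; only the control flow is illustrated. */
#include <stdbool.h>
#include <stdio.h>

enum entry_kind { ENTRY_MIGRATION, ENTRY_SWAP, ENTRY_UNKNOWN };

static bool thp_migration_supported(void) { return true; }	/* stub */
static bool thp_swap_supported(void)      { return true; }	/* stub */

static const char *zap_non_present_pmd(enum entry_kind kind)
{
	if (thp_migration_supported() && kind == ENTRY_MIGRATION)
		return "look up the page behind the migration entry";
	if (thp_swap_supported() && kind == ENTRY_SWAP)
		return "free_swap_and_cache() on the whole huge swap cluster";
	return "WARN once: non-present huge pmd without migration or swap";
}

int main(void)
{
	printf("%s\n", zap_non_present_pmd(ENTRY_SWAP));
	return 0;
}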

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 32 +---
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 38c247a38f67..6b9ca1c14500 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2007,7 +2007,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), 
HPAGE_PMD_SIZE);
-   } else if (is_huge_zero_pmd(orig_pmd)) {
+   } else if (pmd_present(orig_pmd) && is_huge_zero_pmd(orig_pmd)) {
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
@@ -2020,17 +2020,27 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
-   } else if (thp_migration_supported()) {
-   swp_entry_t entry;
-
-   VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
-   entry = pmd_to_swp_entry(orig_pmd);
-   page = pfn_to_page(swp_offset(entry));
+   } else {
+   swp_entry_t entry = pmd_to_swp_entry(orig_pmd);
+
+   if (thp_migration_supported() &&
+   is_migration_entry(entry))
+   page = pfn_to_page(swp_offset(entry));
+   else if (thp_swap_supported() &&
+!non_swap_entry(entry))
+   free_swap_and_cache(entry, true);
+   else {
+   WARN_ONCE(1,
+"Non present huge pmd without pmd migration or swap enabled!");
+   goto unlock;
+   }
flush_needed = 0;
-   } else
-   WARN_ONCE(1, "Non present huge pmd without pmd 
migration enabled!");
+   }
 
-   if (PageAnon(page)) {
+   if (!page) {
+   zap_deposited_table(tlb->mm, pmd);
+   add_mm_counter(tlb->mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+   } else if (PageAnon(page)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
@@ -2038,7 +2048,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
-
+unlock:
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
-- 
2.16.4


