[PATCH -V9 14/21] swap: Support PMD swap mapping in madvise_free()

2018-12-13 Thread Huang Ying
When madvise_free() finds a PMD swap mapping and only part of the huge
swap cluster is operated on, the PMD swap mapping is split and the
operation falls back to PTE swap mapping processing.  Otherwise, if the
whole huge swap cluster is operated on, free_swap_and_cache() is called
to decrease the PMD swap mapping count and possibly free the swap space
and the THP in the swap cache as well.
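
For reference, whether the region passed to madvise() covers the whole
PMD range or only part of it is what decides between the two paths
above.  A minimal userspace sketch (illustrative only; a 2MB PMD huge
page size on x86-64 is assumed, and whether a THP or a PMD swap mapping
actually backs the range depends on the system state):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)  /* assumed PMD huge page size */

int main(void)
{
    /* Over-allocate so a 2MB-aligned huge page fits inside. */
    size_t len = 4 * HPAGE_SIZE;
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    char *huge;

    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    huge = (char *)(((unsigned long)buf + HPAGE_SIZE - 1) &
                    ~(HPAGE_SIZE - 1));

    madvise(huge, HPAGE_SIZE, MADV_HUGEPAGE);
    memset(huge, 1, HPAGE_SIZE);            /* fault the range in */

    /* Whole PMD range: the PMD (page or swap) mapping is handled as a unit. */
    madvise(huge, HPAGE_SIZE, MADV_FREE);

    /* Partial range: the kernel has to split and fall back to PTEs. */
    madvise(huge, HPAGE_SIZE / 2, MADV_FREE);

    munmap(buf, len);
    return 0;
}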

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 52 ++--
 mm/madvise.c |  2 +-
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fdffa07bff98..c895c2a2db6e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1883,6 +1883,15 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
 }
 #endif
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+   pgtable_t pgtable;
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pte_free(mm, pgtable);
+   mm_dec_nr_ptes(mm);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -1903,15 +1912,37 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, 
struct vm_area_struct *vma,
goto out_unlocked;
 
orig_pmd = *pmd;
-   if (is_huge_zero_pmd(orig_pmd))
-   goto out;
-
if (unlikely(!pmd_present(orig_pmd))) {
-   VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
-   goto out;
+   swp_entry_t entry = pmd_to_swp_entry(orig_pmd);
+
+   if (is_migration_entry(entry)) {
+   VM_BUG_ON(!thp_migration_supported());
+   goto out;
+   } else if (IS_ENABLED(CONFIG_THP_SWAP) &&
+  !non_swap_entry(entry)) {
+   /*
+* If part of THP is discarded, split the PMD
+* swap mapping and operate on the PTEs
+*/
+   if (next - addr != HPAGE_PMD_SIZE) {
+   __split_huge_swap_pmd(vma, addr, pmd);
+   goto out;
+   }
+   free_swap_and_cache(entry, HPAGE_PMD_NR);
+   pmd_clear(pmd);
+   zap_deposited_table(mm, pmd);
+   if (current->mm == mm)
+   sync_mm_rss(mm);
+   add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+   ret = true;
+   goto out;
+   } else
+   VM_BUG_ON(1);
}
 
+   if (is_huge_zero_pmd(orig_pmd))
+   goto out;
+
page = pmd_page(orig_pmd);
/*
 * If other processes are mapping this page, we couldn't discard
@@ -1957,15 +1988,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, 
struct vm_area_struct *vma,
return ret;
 }
 
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
-{
-   pgtable_t pgtable;
-
-   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-   pte_free(mm, pgtable);
-   mm_dec_nr_ptes(mm);
-}
-
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 pmd_t *pmd, unsigned long addr)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index fac48161b015..c1845dab2dd4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,7 +321,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long 
addr,
unsigned long next;
 
next = pmd_addr_end(addr, end);
-   if (pmd_trans_huge(*pmd))
+   if (pmd_trans_huge(*pmd) || is_swap_pmd(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
goto next;
 
-- 
2.18.1



[PATCH -V9 02/21] swap: Enable PMD swap operations for CONFIG_THP_SWAP

2018-12-13 Thread Huang Ying
Currently, "the swap entry" in the page tables is used for a number of
things outside of actual swap, like page migration, etc.  We support
the THP/PMD "swap entry" for page migration currently and the
functions behind this are tied to page migration's config
option (CONFIG_ARCH_ENABLE_THP_MIGRATION).

But we also need these functions for the THP swap optimization, so a
new config option (CONFIG_HAVE_PMD_SWAP_ENTRY) is added.  It is enabled
when either CONFIG_ARCH_ENABLE_THP_MIGRATION or CONFIG_THP_SWAP is
enabled, and the PMD swap entry functions are tied to this new config
option instead.  Functions that are needed by page migration only
remain enabled only by CONFIG_ARCH_ENABLE_THP_MIGRATION.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 arch/x86/include/asm/pgtable.h |  2 +-
 include/asm-generic/pgtable.h  |  2 +-
 include/linux/swapops.h| 44 ++
 mm/Kconfig |  8 +++
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e805292..e830ab345551 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1333,7 +1333,7 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+#ifdef CONFIG_HAVE_PMD_SWAP_ENTRY
 static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
 {
return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e0381a4ce7d4..2a619f378297 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -675,7 +675,7 @@ static inline void ptep_modify_prot_commit(struct mm_struct 
*mm,
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
-#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
+#ifndef CONFIG_HAVE_PMD_SWAP_ENTRY
 static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
 {
return pmd;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4d961668e5fc..905ddc65caa3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -254,17 +254,7 @@ static inline int is_write_migration_entry(swp_entry_t 
entry)
 
 #endif
 
-struct page_vma_mapped_walk;
-
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
-   struct page *page);
-
-extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
-   struct page *new);
-
-extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
-
+#ifdef CONFIG_HAVE_PMD_SWAP_ENTRY
 static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
 {
swp_entry_t arch_entry;
@@ -282,6 +272,28 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
return __swp_entry_to_pmd(arch_entry);
 }
+#else
+static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
+{
+   return swp_entry(0, 0);
+}
+
+static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
+{
+   return __pmd(0);
+}
+#endif
+
+struct page_vma_mapped_walk;
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+   struct page *page);
+
+extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
+   struct page *new);
+
+extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
 
 static inline int is_pmd_migration_entry(pmd_t pmd)
 {
@@ -302,16 +314,6 @@ static inline void remove_migration_pmd(struct 
page_vma_mapped_walk *pvmw,
 
 static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
 
-static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
-{
-   return swp_entry(0, 0);
-}
-
-static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
-{
-   return __pmd(0);
-}
-
 static inline int is_pmd_migration_entry(pmd_t pmd)
 {
return 0;
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..d7c5299c5b7d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -422,6 +422,14 @@ config THP_SWAP
 
  For selection by architectures with reasonable THP sizes.
 
+#
+# "PMD swap entry" in the page table is used both for migration and
+# actual swap.
+#
+config HAVE_PMD_SWAP_ENTRY
+   def_bool y
+   depends on THP_SWAP || ARCH_ENABLE_THP_MIGRATION
+
 config TRANSPARENT_HUGE_PAGECACHE
def_bool y
depends on TRANSPARENT_HUGEPAGE
-- 
2.18.1



[PATCH -V9 08/21] swap: Support PMD swap mapping in split_swap_cluster()

2018-12-13 Thread Huang Ying
When splitting a THP in the swap cache, or when failing to allocate a
THP while swapping in a huge swap cluster, the huge swap cluster is
split.  In addition to clearing the huge flag of the swap cluster, the
PMD swap mapping count recorded in cluster_count() is set to 0.  But
the PMD swap mappings themselves are not touched, because it can be
hard to find them all.  When a PMD swap mapping is operated on later,
it will be found that the huge swap cluster has already been split, and
the PMD swap mapping will be split at that time.

Unless splitting a THP in the swap cache (specified via the "force"
parameter), split_swap_cluster() will return -EEXIST if the
SWAP_HAS_CACHE flag is set in swap_map[offset], because this indicates
that a THP corresponds to this huge swap cluster and splitting that THP
is not desired.

When splitting a THP in the swap cache, the call to
split_swap_cluster() is moved to before unlocking the sub-pages, so
that all sub-pages are kept locked from the time the THP is split until
the huge swap cluster is split.  This makes the code much easier to
reason about.
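
The intended behavior can be summarized with the standalone toy model
below.  It is not the kernel code; SWAP_HAS_CACHE and SSC_SPLIT_CACHED
are stand-ins for the real flag values, and locking is ignored:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE   0x40  /* stand-in for the swap_map flag */
#define SSC_SPLIT_CACHED 0x1   /* caller is splitting a THP in swap cache */

/* Toy model of the split decision described above. */
static int split_swap_cluster_model(unsigned char swap_map_entry,
                                    bool cluster_is_huge,
                                    unsigned long flags)
{
    if (!cluster_is_huge)
        return 0;                       /* nothing to do, already split */

    /*
     * A THP still sits in the swap cache for this cluster.  Unless the
     * caller is the THP split path itself (SSC_SPLIT_CACHED), refuse
     * to split the huge swap cluster.
     */
    if ((swap_map_entry & SWAP_HAS_CACHE) && !(flags & SSC_SPLIT_CACHED))
        return -EEXIST;

    /* Clear the huge flag; PMD swap mappings are only split lazily later. */
    return 0;
}

int main(void)
{
    printf("%d\n", split_swap_cluster_model(SWAP_HAS_CACHE, true, 0));
    printf("%d\n", split_swap_cluster_model(SWAP_HAS_CACHE, true,
                                            SSC_SPLIT_CACHED));
    return 0;
}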

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/swap.h |  6 +++--
 mm/huge_memory.c | 18 +-
 mm/swapfile.c| 58 +++-
 3 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a24d101b131d..441da4a832a6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -617,11 +617,13 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #endif /* CONFIG_SWAP */
 
+#define SSC_SPLIT_CACHED   0x1
+
 #ifdef CONFIG_THP_SWAP
-extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster(swp_entry_t entry, unsigned long flags);
 extern int split_swap_cluster_map(swp_entry_t entry);
 #else
-static inline int split_swap_cluster(swp_entry_t entry)
+static inline int split_swap_cluster(swp_entry_t entry, unsigned long flags)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 49df3e7c96c7..fc31fc1ae0b3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2507,6 +2507,17 @@ static void __split_huge_page(struct page *page, struct 
list_head *list,
 
remap_page(head);
 
+   /*
+* Split swap cluster before unlocking sub-pages.  So all
+* sub-pages will be kept locked from THP has been split to
+* swap cluster is split.
+*/
+   if (PageSwapCache(head)) {
+   swp_entry_t entry = { .val = page_private(head) };
+
+   split_swap_cluster(entry, SSC_SPLIT_CACHED);
+   }
+
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
if (subpage == page)
@@ -2741,12 +2752,7 @@ int split_huge_page_to_list(struct page *page, struct 
list_head *list)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, end, flags);
-   if (PageSwapCache(head)) {
-   swp_entry_t entry = { .val = page_private(head) };
-
-   ret = split_swap_cluster(entry);
-   } else
-   ret = 0;
+   ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d38760b6d495..c59cc2ca7c2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1469,23 +1469,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
unlock_cluster_or_swap_info(si, ci);
 }
 
-#ifdef CONFIG_THP_SWAP
-int split_swap_cluster(swp_entry_t entry)
-{
-   struct swap_info_struct *si;
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
-
-   si = _swap_info_get(entry);
-   if (!si)
-   return -EBUSY;
-   ci = lock_cluster(si, offset);
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   return 0;
-}
-#endif
-
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
const swp_entry_t *e1 = ent1, *e2 = ent2;
@@ -3972,6 +3955,47 @@ int split_swap_cluster_map(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
+
+/*
+ * We will not try to split all PMD swap mappings to the swap cluster,
+ * because we haven't enough information available for that.  Later,
+ * when the PMD swap mapping is duplicated or swapin, etc, the PMD
+ * swap mapping will be split and fallback to the PTE operations.
+ */
+int split_swap_cluster(swp_entry_t entry, unsigned long flags)
+{
+

[PATCH -V9 09/21] swap: Support to read a huge swap cluster for swapin a THP

2018-12-13 Thread Huang Ying
To swap in a THP in one piece, we need to read a huge swap cluster from
the swap device.  This patch revises __read_swap_cache_async() and its
callers and callees to support this.  If __read_swap_cache_async()
finds that the swap cluster of the specified swap entry is huge, it
will try to allocate a THP and add it into the swap cache, so that the
contents of the huge swap cluster can later be read into the THP.
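
In outline, the allocation decision added to __read_swap_cache_async()
looks like the standalone sketch below (a toy model only; HPAGE_PMD_NR
assumes 4KB pages with 2MB huge pages, and the real code also handles
the swap cache lookup, charging and fallback):

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512  /* assumes 4KB pages and 2MB huge pages */

/*
 * Toy model: how many pages to allocate for a swapin, given the size
 * of the swap entry's cluster reported by __swp_swapcount().
 */
static int swapin_alloc_nr_pages(int entry_size, bool thp_swap_enabled)
{
    if (thp_swap_enabled && entry_size == HPAGE_PMD_NR)
        return HPAGE_PMD_NR;    /* try to allocate a whole THP */
    return 1;                   /* fall back to a normal page */
}

int main(void)
{
    printf("%d\n", swapin_alloc_nr_pages(HPAGE_PMD_NR, true));
    printf("%d\n", swapin_alloc_nr_pages(1, true));
    return 0;
}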

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  6 +
 include/linux/swap.h|  4 +--
 mm/huge_memory.c|  4 +--
 mm/swap_state.c | 60 -
 mm/swapfile.c   |  9 ---
 5 files changed, 64 insertions(+), 19 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1c0fda003d6a..72f2617d336b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -250,6 +250,7 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
 }
 
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma);
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -363,6 +364,11 @@ static inline bool thp_migration_supported(void)
 {
return false;
 }
+
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+   return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 441da4a832a6..4bd532c9315e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -462,7 +462,7 @@ extern sector_t map_swap_page(struct page *, struct 
block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int __swap_count(swp_entry_t entry);
-extern int __swp_swapcount(swp_entry_t entry);
+extern int __swp_swapcount(swp_entry_t entry, int *entry_size);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
@@ -590,7 +590,7 @@ static inline int __swap_count(swp_entry_t entry)
return 0;
 }
 
-static inline int __swp_swapcount(swp_entry_t entry)
+static inline int __swp_swapcount(swp_entry_t entry, int *entry_size)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fc31fc1ae0b3..1cec1eec340e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,9 +629,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
-   const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+   const bool vma_madvised = vma ? !!(vma->vm_flags & VM_HUGEPAGE) : false;
 
/* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, 
&transparent_hugepage_flags))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 97831166994a..5e761bb6e354 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -361,7 +361,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, 
gfp_t gfp_mask,
 {
struct page *found_page = NULL, *new_page = NULL;
struct swap_info_struct *si;
-   int err;
+   int err, entry_size = 1;
+   swp_entry_t hentry;
+
*new_page_allocated = false;
 
do {
@@ -387,14 +389,41 @@ struct page *__read_swap_cache_async(swp_entry_t entry, 
gfp_t gfp_mask,
 * as SWAP_HAS_CACHE.  That's done in later part of code or
 * else swap_off will be aborted if we return NULL.
 */
-   if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
+   if (!__swp_swapcount(entry, &entry_size) &&
+   swap_slot_cache_enabled)
break;
 
/*
 * Get a new page to read into from swap.
 */
-   if (!new_page) {
-   new_page = alloc_page_vma(gfp_mask, vma, addr);
+   if (!new_page ||
+   (IS_ENABLED(CONFIG_THP_SWAP) &&
+hpage_nr_pages(new_page) != entry_size)) {
+   if (new_page)
+   put_page(new_page);
+   if (IS_ENABLED(CONFIG_THP_SWAP) &&
+   entry_size == HPAGE_PMD_NR) {
+   gfp_t

[PATCH -V9 11/21] swap: Support to count THP swapin and its fallback

2018-12-13 Thread Huang Ying
Two new /proc/vmstat fields, "thp_swpin" and "thp_swpin_fallback", are
added to count swapping in a THP from the swap device in one piece and
falling back to normal page swapin, respectively.
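
The new counters can be read like any other /proc/vmstat field, e.g.
with the minimal sketch below (the prefix match also picks up
thp_swpin_fallback):

#include <stdio.h>
#include <string.h>

int main(void)
{
    char line[128];
    FILE *f = fopen("/proc/vmstat", "r");

    if (!f) {
        perror("/proc/vmstat");
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        if (!strncmp(line, "thp_swpin", 9))
            fputs(line, stdout);  /* thp_swpin and thp_swpin_fallback */
    fclose(f);
    return 0;
}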

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 Documentation/admin-guide/mm/transhuge.rst |  8 
 include/linux/vm_event_item.h  |  2 ++
 mm/huge_memory.c   |  4 +++-
 mm/page_io.c   | 15 ---
 mm/vmstat.c|  2 ++
 5 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst 
b/Documentation/admin-guide/mm/transhuge.rst
index 7ab93a8404b9..85e33f785fd7 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -364,6 +364,14 @@ thp_swpout_fallback
Usually because failed to allocate some continuous swap space
for the huge page.
 
+thp_swpin
+   is incremented every time a huge page is swapin in one piece
+   without splitting.
+
+thp_swpin_fallback
+   is incremented if a huge page has to be split during swapin.
+   Usually because failed to allocate a huge page.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 47a3441cf4c4..c20b655cfdcc 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -88,6 +88,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_ZERO_PAGE_ALLOC_FAILED,
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
+   THP_SWPIN,
+   THP_SWPIN_FALLBACK,
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
BALLOON_INFLATE,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 644cb5d6b056..e1e95e6c86e3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1708,8 +1708,10 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
/* swapoff occurs under us */
} else if (ret == -EINVAL)
ret = 0;
-   else
+   else {
+   count_vm_event(THP_SWPIN_FALLBACK);
goto fallback;
+   }
}
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto out;
diff --git a/mm/page_io.c b/mm/page_io.c
index 67a7f64d6c1a..00774b453dca 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -348,6 +348,15 @@ int __swap_writepage(struct page *page, struct 
writeback_control *wbc,
return ret;
 }
 
+static inline void count_swpin_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   if (unlikely(PageTransHuge(page)))
+   count_vm_event(THP_SWPIN);
+#endif
+   count_vm_events(PSWPIN, hpage_nr_pages(page));
+}
+
 int swap_readpage(struct page *page, bool synchronous)
 {
struct bio *bio;
@@ -371,7 +380,7 @@ int swap_readpage(struct page *page, bool synchronous)
 
ret = mapping->a_ops->readpage(swap_file, page);
if (!ret)
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
return ret;
}
 
@@ -382,7 +391,7 @@ int swap_readpage(struct page *page, bool synchronous)
unlock_page(page);
}
 
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
return 0;
}
 
@@ -403,7 +412,7 @@ int swap_readpage(struct page *page, bool synchronous)
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (synchronous)
bio->bi_opf |= REQ_HIPRI;
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
bio_get(bio);
qc = submit_bio(bio);
while (synchronous) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 83b30edc2f7f..80a731e9a5e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1265,6 +1265,8 @@ const char * const vmstat_text[] = {
"thp_zero_page_alloc_failed",
"thp_swpout",
"thp_swpout_fallback",
+   "thp_swpin",
+   "thp_swpin_fallback",
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
"balloon_inflate",
-- 
2.18.1



[PATCH -V9 21/21] swap: create PMD swap mapping when unmap the THP

2018-12-13 Thread Huang Ying
This is the final step of the THP swapin support.  When reclaiming an
anonymous THP, after allocating the huge swap cluster and adding the
THP into the swap cache, the PMD page mapping is changed to a mapping
to the swap space.  Previously, the PMD page mapping was split before
being changed.  In this patch, the unmap code is enhanced not to split
the PMD mapping, but to create a PMD swap mapping to replace it
instead.  So later, when the SWAP_HAS_CACHE flag is cleared in the last
step of swapout, the huge swap cluster is kept instead of being split,
and during swapin the huge swap cluster is read in one piece into a
THP.  That is, the THP is not split during swapout/swapin.  This
eliminates the splitting/collapsing overhead, reduces the page fault
count, etc.  More importantly, THP utilization is improved greatly;
that is, many more THPs are kept when swapping is used, so that we can
take full advantage of THP, including its high performance for
swapout/swapin.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h | 11 +++
 mm/huge_memory.c| 30 ++
 mm/rmap.c   | 41 -
 mm/vmscan.c |  6 +-
 4 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 3c05294689c1..fef5d27c2083 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -373,12 +373,16 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+struct page_vma_mapped_walk;
+
 #ifdef CONFIG_THP_SWAP
 extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
  unsigned long addr, pmd_t *pmd);
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+extern bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+   struct page *page, unsigned long address, pmd_t pmdval);
 
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
@@ -419,6 +423,13 @@ static inline int do_huge_pmd_swap_page(struct vm_fault 
*vmf, pmd_t orig_pmd)
return 0;
 }
 
+static inline bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page, unsigned long address,
+ pmd_t pmdval)
+{
+   return false;
+}
+
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 38904d673339..e0205fceb84c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1922,6 +1922,36 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
put_page(page);
return ret;
 }
+
+bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw, struct page *page,
+   unsigned long address, pmd_t pmdval)
+{
+   struct vm_area_struct *vma = pvmw->vma;
+   struct mm_struct *mm = vma->vm_mm;
+   pmd_t swp_pmd;
+   swp_entry_t entry = { .val = page_private(page) };
+
+   if (swap_duplicate(&entry, HPAGE_PMD_NR) < 0) {
+   set_pmd_at(mm, address, pvmw->pmd, pmdval);
+   return false;
+   }
+   if (list_empty(&mm->mmlist)) {
+   spin_lock(&mmlist_lock);
+   if (list_empty(&mm->mmlist))
+   list_add(&mm->mmlist, &init_mm.mmlist);
+   spin_unlock(&mmlist_lock);
+   }
+   add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+   add_mm_counter(mm, MM_SWAPENTS, HPAGE_PMD_NR);
+   swp_pmd = swp_entry_to_pmd(entry);
+   if (pmd_soft_dirty(pmdval))
+   swp_pmd = pmd_swp_mksoft_dirty(swp_pmd);
+   set_pmd_at(mm, address, pvmw->pmd, swp_pmd);
+
+   page_remove_rmap(page, true);
+   put_page(page);
+   return true;
+}
 #endif
 
 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
diff --git a/mm/rmap.c b/mm/rmap.c
index e9b07016f587..a957af84ec12 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1423,11 +1423,50 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
continue;
}
 
+   address = pvmw.address;
+
+#ifdef CONFIG_THP_SWAP
+   /* PMD-mapped THP swap entry */
+   if (IS_ENABLED(CONFIG_THP_SWAP) &&
+   !pvmw.pte && PageAnon(page)) {
+   

[PATCH -V9 15/21] swap: Support to move swap account for PMD swap mapping

2018-12-13 Thread Huang Ying
Previously the huge swap cluster was split after the THP was swapped
out.  Now, to support swapping in the THP in one piece, the huge swap
cluster is not split after the THP is reclaimed.  So in memcg, we need
to move the swap accounting for PMD swap mappings in the process's page
table.

When the page table is scanned while moving the memcg charge, PMD swap
mappings are identified, and mem_cgroup_move_swap_account() and its
callees are revised to move the accounting for the whole huge swap
cluster.  If the swap cluster mapped by the PMD has already been split,
the PMD swap mapping is split and the operation falls back to PTE
processing.
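
Conceptually, the change just extends the per-entry cgroup id update
from one swap entry to HPAGE_PMD_NR consecutive entries, roughly as in
the standalone model below (a plain array stands in for the real
swap_cgroup map, and the locking done by the real code is ignored):

#include <stdio.h>

#define HPAGE_PMD_NR 512

/* Toy map of swap cgroup ids, indexed by swap offset. */
static unsigned short swap_cgroup_ids[HPAGE_PMD_NR * 4];

/*
 * Model of swap_cgroup_cmpxchg() extended with nr_ents: replace "old"
 * with "new" for nr_ents consecutive entries starting at "offset".
 * Returns the old id on success, 0 if any entry did not match.
 */
static unsigned short swap_cgroup_cmpxchg_model(unsigned long offset,
                                                unsigned short old,
                                                unsigned short new,
                                                unsigned int nr_ents)
{
    unsigned int i;

    for (i = 0; i < nr_ents; i++)
        if (swap_cgroup_ids[offset + i] != old)
            return 0;
    for (i = 0; i < nr_ents; i++)
        swap_cgroup_ids[offset + i] = new;
    return old;
}

int main(void)
{
    for (int i = 0; i < HPAGE_PMD_NR; i++)
        swap_cgroup_ids[i] = 7;
    /* Move the whole huge swap cluster from memcg id 7 to id 9. */
    printf("%u\n", swap_cgroup_cmpxchg_model(0, 7, 9, HPAGE_PMD_NR));
    return 0;
}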

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |   7 ++
 include/linux/swap.h|   6 ++
 include/linux/swap_cgroup.h |   3 +-
 mm/huge_memory.c|   7 +-
 mm/memcontrol.c | 131 
 mm/swap_cgroup.c|  45 ++---
 mm/swapfile.c   |  14 
 7 files changed, 173 insertions(+), 40 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7c72e63757af..3c05294689c1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -374,6 +374,8 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_THP_SWAP
+extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd);
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
@@ -401,6 +403,11 @@ static inline bool transparent_hugepage_swapin_enabled(
return false;
 }
 #else /* CONFIG_THP_SWAP */
+static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
+unsigned long addr, pmd_t *pmd)
+{
+}
+
 static inline int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  unsigned long address, pmd_t orig_pmd)
 {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4bd532c9315e..6463784fd5e8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -622,6 +622,7 @@ static inline swp_entry_t get_swap_page(struct page *page)
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry, unsigned long flags);
 extern int split_swap_cluster_map(swp_entry_t entry);
+extern int get_swap_entry_size(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry, unsigned long flags)
 {
@@ -632,6 +633,11 @@ static inline int split_swap_cluster_map(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int get_swap_entry_size(swp_entry_t entry)
+{
+   return 1;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index a12dd1c3966c..c40fb52b0563 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -7,7 +7,8 @@
 #ifdef CONFIG_MEMCG_SWAP
 
 extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-   unsigned short old, unsigned short new);
+   unsigned short old, unsigned short new,
+   unsigned int nr_ents);
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 unsigned int nr_ents);
 extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c895c2a2db6e..e460241ea761 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1670,10 +1670,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+#ifdef CONFIG_THP_SWAP
 /* Convert a PMD swap mapping to a set of PTE swap mappings */
-static void __split_huge_swap_pmd(struct vm_area_struct *vma,
- unsigned long addr,
- pmd_t *pmd)
+void __split_huge_swap_pmd(struct vm_area_struct *vma,
+  unsigned long addr, pmd_t *pmd)
 {
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
@@ -1705,7 +1705,6 @@ static void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
pmd_populate(mm, pmd, pgtable);
 }
 
-#ifdef CONFIG_THP_SWAP
 int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, pmd_t orig_pmd)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b860dd4f75f2..ac1abfcfab88 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2667,9 +2667,10 @@ void mem_cgroup_split_

[PATCH -V9 17/21] swap: Free PMD swap mapping when zap_huge_pmd()

2018-12-13 Thread Huang Ying
For a PMD swap mapping, zap_huge_pmd() will clear the PMD and call
free_swap_and_cache() to decrease the swap reference count and possibly
free or split the huge swap cluster and the THP in the swap cache.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 32 +---
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b083c66a9d09..6d144d687e69 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2055,7 +2055,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), 
HPAGE_PMD_SIZE);
-   } else if (is_huge_zero_pmd(orig_pmd)) {
+   } else if (pmd_present(orig_pmd) && is_huge_zero_pmd(orig_pmd)) {
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
@@ -2068,17 +2068,27 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
-   } else if (thp_migration_supported()) {
-   swp_entry_t entry;
-
-   VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
-   entry = pmd_to_swp_entry(orig_pmd);
-   page = pfn_to_page(swp_offset(entry));
+   } else {
+   swp_entry_t entry = pmd_to_swp_entry(orig_pmd);
+
+   if (thp_migration_supported() &&
+   is_migration_entry(entry))
+   page = pfn_to_page(swp_offset(entry));
+   else if (IS_ENABLED(CONFIG_THP_SWAP) &&
+!non_swap_entry(entry))
+   free_swap_and_cache(entry, HPAGE_PMD_NR);
+   else {
+   WARN_ONCE(1,
+"Non present huge pmd without pmd migration or swap enabled!");
+   goto unlock;
+   }
flush_needed = 0;
-   } else
-   WARN_ONCE(1, "Non present huge pmd without pmd 
migration enabled!");
+   }
 
-   if (PageAnon(page)) {
+   if (!page) {
+   zap_deposited_table(tlb->mm, pmd);
+   add_mm_counter(tlb->mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+   } else if (PageAnon(page)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
@@ -2086,7 +2096,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(page), 
-HPAGE_PMD_NR);
}
-
+unlock:
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
-- 
2.18.1



[PATCH -V9 03/21] swap: Add __swap_duplicate_locked()

2018-12-13 Thread Huang Ying
The part of __swap_duplicate() that runs with the lock held is
separated out into a new function, __swap_duplicate_locked(), because
we will add more logic for PMD swap mappings to __swap_duplicate() and
keep most of the PTE swap mapping logic in __swap_duplicate_locked().

This is just mechanical code refactoring; there is no functional change
in this patch.
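
For orientation, each swap_map[] entry packs a reference count together
with the SWAP_HAS_CACHE flag into a single byte; the per-entry logic
that stays in __swap_duplicate_locked() is roughly modeled by the
standalone sketch below (toy constants and simplified error handling,
not the kernel code):

#include <errno.h>
#include <stdio.h>

/* Model constants standing in for the kernel's swap_map encoding. */
#define SWAP_HAS_CACHE 0x40
#define SWAP_MAP_MAX   0x3e
#define SWAP_MAP_BAD   0x3f

/*
 * Toy version of __swap_duplicate_locked(): take either a cache
 * reference or a map reference on one swap entry.
 */
static int swap_duplicate_locked_model(unsigned char *entry,
                                       unsigned char usage)
{
    unsigned char count = *entry;
    unsigned char has_cache;

    if ((count & ~SWAP_HAS_CACHE) == SWAP_MAP_BAD)
        return -ENOENT;

    has_cache = count & SWAP_HAS_CACHE;
    count &= ~SWAP_HAS_CACHE;

    if (usage == SWAP_HAS_CACHE) {
        if (has_cache)
            return -EEXIST;     /* swap cache reference already taken */
        if (!count)
            return -ENOENT;     /* entry not in use */
        has_cache = SWAP_HAS_CACHE;
    } else if (!count && !has_cache) {
        return -ENOENT;
    } else if (count >= SWAP_MAP_MAX) {
        return -ENOMEM;         /* would need swap count continuation */
    } else {
        count += usage;
    }

    *entry = count | has_cache;
    return 0;
}

int main(void)
{
    unsigned char e = 1;

    printf("%d %#x\n", swap_duplicate_locked_model(&e, 1), e);
    printf("%d %#x\n", swap_duplicate_locked_model(&e, SWAP_HAS_CACHE), e);
    return 0;
}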

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/swapfile.c | 63 ---
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9e6da494781f..5adc0787343f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3343,32 +3343,12 @@ void si_swapinfo(struct sysinfo *val)
spin_unlock(&swap_lock);
 }
 
-/*
- * Verify that a swap entry is valid and increment its swap map count.
- *
- * Returns error code in following case.
- * - success -> 0
- * - swp_entry is invalid -> EINVAL
- * - swp_entry is migration entry -> EINVAL
- * - swap-cache reference is requested but there is already one. -> EEXIST
- * - swap-cache reference is requested but the entry is not used. -> ENOENT
- * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
- */
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+static int __swap_duplicate_locked(struct swap_info_struct *p,
+  unsigned long offset, unsigned char usage)
 {
-   struct swap_info_struct *p;
-   struct swap_cluster_info *ci;
-   unsigned long offset;
unsigned char count;
unsigned char has_cache;
-   int err = -EINVAL;
-
-   p = get_swap_device(entry);
-   if (!p)
-   goto out;
-
-   offset = swp_offset(entry);
-   ci = lock_cluster_or_swap_info(p, offset);
+   int err = 0;
 
count = p->swap_map[offset];
 
@@ -3378,12 +3358,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned 
char usage)
 */
if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
err = -ENOENT;
-   goto unlock_out;
+   goto out;
}
 
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
-   err = 0;
 
if (usage == SWAP_HAS_CACHE) {
 
@@ -3410,11 +3389,39 @@ static int __swap_duplicate(swp_entry_t entry, unsigned 
char usage)
 
p->swap_map[offset] = count | has_cache;
 
-unlock_out:
+out:
+   return err;
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Returns error code in following case.
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
+ */
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+{
+   struct swap_info_struct *p;
+   struct swap_cluster_info *ci;
+   unsigned long offset;
+   int err = -EINVAL;
+
+   p = get_swap_device(entry);
+   if (!p)
+   goto out;
+
+   offset = swp_offset(entry);
+   ci = lock_cluster_or_swap_info(p, offset);
+   err = __swap_duplicate_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+
+   put_swap_device(p);
 out:
-   if (p)
-   put_swap_device(p);
return err;
 }
 
-- 
2.18.1



[PATCH -V9 16/21] swap: Support to copy PMD swap mapping when fork()

2018-12-13 Thread Huang Ying
During fork, the page table needs to be copied from parent to child.  A
PMD swap mapping needs to be copied too, and the swap reference count
needs to be increased.

When the huge swap cluster has already been split, we need to split the
PMD swap mapping and fall back to PTE copying.

When swap count continuation fails to allocate a page with GFP_ATOMIC,
we need to unlock the spinlock and try again with GFP_KERNEL.
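
The retry pattern described in the last paragraph can be sketched
independently of the kernel code as below (a pthread mutex stands in
for the page table spinlocks, and malloc() for
add_swap_count_continuation() with GFP_KERNEL):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *spare;  /* stands in for a swap count continuation page */

/* Under the lock only already-allocated memory may be used. */
static int dup_swap_count_locked(void)
{
    return spare ? 0 : -ENOMEM;
}

static int copy_pmd_swap_mapping(void)
{
    int ret;

retry:
    pthread_mutex_lock(&table_lock);
    ret = dup_swap_count_locked();
    pthread_mutex_unlock(&table_lock);
    if (ret == -ENOMEM) {
        /* A sleeping allocation is only legal with the lock dropped. */
        spare = malloc(4096);
        if (!spare)
            return -ENOMEM;
        goto retry;
    }
    return ret;
}

int main(void)
{
    printf("%d\n", copy_pmd_swap_mapping());
    free(spare);
    return 0;
}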

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 72 ++--
 1 file changed, 57 insertions(+), 15 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e460241ea761..b083c66a9d09 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -974,6 +974,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct 
mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
 
+retry:
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -981,26 +982,67 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct 
mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
 
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
 
-   VM_BUG_ON(!is_pmd_migration_entry(pmd));
-   if (is_write_migration_entry(entry)) {
-   make_migration_entry_read(&entry);
-   pmd = swp_entry_to_pmd(entry);
-   if (pmd_swp_soft_dirty(*src_pmd))
-   pmd = pmd_swp_mksoft_dirty(pmd);
-   set_pmd_at(src_mm, addr, src_pmd, pmd);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+   if (is_migration_entry(entry)) {
+   if (is_write_migration_entry(entry)) {
+   make_migration_entry_read(&entry);
+   pmd = swp_entry_to_pmd(entry);
+   if (pmd_swp_soft_dirty(*src_pmd))
+   pmd = pmd_swp_mksoft_dirty(pmd);
+   set_pmd_at(src_mm, addr, src_pmd, pmd);
+   }
+   add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+   mm_inc_nr_ptes(dst_mm);
+   pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+   set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+   ret = 0;
+   goto out_unlock;
}
-   add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-   mm_inc_nr_ptes(dst_mm);
-   pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-   set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-   ret = 0;
-   goto out_unlock;
-   }
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && !non_swap_entry(entry)) {
+   ret = swap_duplicate(&entry, HPAGE_PMD_NR);
+   if (!ret) {
+   add_mm_counter(dst_mm, MM_SWAPENTS,
+  HPAGE_PMD_NR);
+   mm_inc_nr_ptes(dst_mm);
+   pgtable_trans_huge_deposit(dst_mm, dst_pmd,
+  pgtable);
+   set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+   /* make sure dst_mm is on swapoff's mmlist. */
+   if (unlikely(list_empty(&dst_mm->mmlist))) {
+   spin_lock(&mmlist_lock);
+   if (list_empty(&dst_mm->mmlist))
+   list_add(&dst_mm->mmlist,
+&src_mm->mmlist);
+   spin_unlock(&mmlist_lock);
+   }
+   } else if (ret == -ENOTDIR) {
+   /*
+* The huge swap cluster has been split, split
+* the PMD swap mapping and fallback to PTE
+*/
+   __split_huge_swap_pmd(vma, addr, src_pmd);
+   pte_free(dst_mm, pgtable);
+   } else if (ret == -ENOMEM) {
+   spin_unlock(src_ptl);
+   spin_unlock(dst_ptl);
+   ret = add_swap_count_continuation(en

[PATCH -V9 18/21] swap: Support PMD swap mapping for MADV_WILLNEED

2018-12-13 Thread Huang Ying
During MADV_WILLNEED, for a PMD swap mapping, if THP swapin is enabled
for the VMA, the whole swap cluster is swapped in.  Otherwise, the huge
swap cluster and the PMD swap mapping are split and the operation falls
back to PTE swap mappings.
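
From userspace the behavior is triggered in the usual way, for example
(minimal sketch; a 2MB PMD huge page size is assumed, and the region
must of course have been swapped out first for the PMD swap mapping
path to be exercised):

#include <stdio.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)  /* assumed PMD huge page size */

int main(void)
{
    char *buf = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);  /* opt in to THP swapin */

    /*
     * ... fault the region in, let it be swapped out under memory
     * pressure, then ask for it back ahead of the next access.  With
     * THP swapin enabled for the VMA the whole cluster is read in one
     * piece; otherwise the PMD swap mapping is split first.
     */
    madvise(buf, HPAGE_SIZE, MADV_WILLNEED);

    munmap(buf, HPAGE_SIZE);
    return 0;
}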

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/madvise.c | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index c1845dab2dd4..84d055c19dd4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -196,14 +196,36 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned 
long start,
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
+   swp_entry_t entry;
+   struct page *page;
+   pmd_t pmdval;
+
+   pmdval = *pmd;
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(pmdval) &&
+   !is_pmd_migration_entry(pmdval)) {
+   entry = pmd_to_swp_entry(pmdval);
+   if (!transparent_hugepage_swapin_enabled(vma)) {
+   if (!split_swap_cluster(entry, 0))
+   split_huge_swap_pmd(vma, pmd, start, pmdval);
+   } else {
+   page = read_swap_cache_async(entry,
+GFP_HIGHUSER_MOVABLE,
+vma, start, false);
+   if (page) {
+   /* The swap cluster has been split under us */
+   if (!PageTransHuge(page))
+   split_huge_swap_pmd(vma, pmd, start,
+   pmdval);
+   put_page(page);
+   }
+   }
+   }
 
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
 
for (index = start; index != end; index += PAGE_SIZE) {
pte_t pte;
-   swp_entry_t entry;
-   struct page *page;
spinlock_t *ptl;
 
orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
-- 
2.18.1



[PATCH -V9 19/21] swap: Support PMD swap mapping in mincore()

2018-12-13 Thread Huang Ying
During mincore(), for a PMD swap mapping, the swap cache is looked up.
If the resulting page isn't a compound page, the PMD swap mapping is
split and the operation falls back to PTE swap mapping processing.
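
For example, a userspace residency check over a PMD-sized region looks
like the sketch below (4KB pages and 2MB huge pages assumed; whether
the PMD path above is hit depends on the region having been swapped
out):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)  /* assumed PMD huge page size */

int main(void)
{
    long page_size = sysconf(_SC_PAGESIZE);
    size_t nr_pages = HPAGE_SIZE / page_size;
    unsigned char *vec = malloc(nr_pages);
    char *buf = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    size_t i, resident = 0;

    if (!vec || buf == MAP_FAILED)
        return 1;
    madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);
    memset(buf, 1, HPAGE_SIZE);

    /* Bit 0 of vec[i] reports whether page i is resident or in cache. */
    if (mincore(buf, HPAGE_SIZE, vec) == 0) {
        for (i = 0; i < nr_pages; i++)
            resident += vec[i] & 1;
        printf("%zu of %zu pages resident\n", resident, nr_pages);
    }

    munmap(buf, HPAGE_SIZE);
    free(vec);
    return 0;
}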

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/mincore.c | 37 +++--
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index aa0e542569f9..1d861fac82ee 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -48,7 +48,8 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, 
unsigned long addr,
  * and is up to date; i.e. that no page-in operation would be required
  * at this time if an application were to map and access this page.
  */
-static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff,
+ bool *compound)
 {
unsigned char present = 0;
struct page *page;
@@ -86,6 +87,8 @@ static unsigned char mincore_page(struct address_space 
*mapping, pgoff_t pgoff)
 #endif
if (page) {
present = PageUptodate(page);
+   if (compound)
+   *compound = PageCompound(page);
put_page(page);
}
 
@@ -103,7 +106,8 @@ static int __mincore_unmapped_range(unsigned long addr, 
unsigned long end,
 
pgoff = linear_page_index(vma, addr);
for (i = 0; i < nr; i++, pgoff++)
-   vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+   vec[i] = mincore_page(vma->vm_file->f_mapping,
+ pgoff, NULL);
} else {
for (i = 0; i < nr; i++)
vec[i] = 0;
@@ -127,14 +131,36 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+   swp_entry_t entry;
 
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
-   memset(vec, 1, nr);
+   unsigned char val = 1;
+   bool compound;
+
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(*pmd)) {
+   entry = pmd_to_swp_entry(*pmd);
+   if (!non_swap_entry(entry)) {
+   val = mincore_page(swap_address_space(entry),
+  swp_offset(entry),
+  &compound);
+   /*
+* The huge swap cluster has been
+* split under us
+*/
+   if (!compound) {
+   __split_huge_swap_pmd(vma, addr, pmd);
+   spin_unlock(ptl);
+   goto fallback;
+   }
+   }
+   }
+   memset(vec, val, nr);
spin_unlock(ptl);
goto out;
}
 
+fallback:
if (pmd_trans_unstable(pmd)) {
__mincore_unmapped_range(addr, end, vma, vec);
goto out;
@@ -150,8 +176,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
else if (pte_present(pte))
*vec = 1;
else { /* pte is a swap entry */
-   swp_entry_t entry = pte_to_swp_entry(pte);
-
+   entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
/*
 * migration or hwpoison entries are always
@@ -161,7 +186,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
} else {
 #ifdef CONFIG_SWAP
*vec = mincore_page(swap_address_space(entry),
-   swp_offset(entry));
+   swp_offset(entry), NULL);
 #else
WARN_ON(1);
*vec = 1;
-- 
2.18.1



[PATCH -V9 07/21] swap: Support PMD swap mapping when splitting huge PMD

2018-12-13 Thread Huang Ying
A huge PMD needs to be split when zapping part of the PMD mapping, etc.
If the PMD mapping is a swap mapping, we need to split it too.  This
patch implements support for this.  It is similar to splitting a PMD
page mapping, except that we also need to decrease the PMD swap mapping
count of the huge swap cluster.  If the PMD swap mapping count becomes
0, the huge swap cluster is split.

Notice: is_huge_zero_pmd() and pmd_page() don't work well with a swap
PMD, so the pmd_present() check is called before them.
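
The key observation is that the swap entries backing a huge swap
cluster are consecutive, so splitting only has to install per-page swap
entries with increasing offsets, as in this standalone model (a toy
swap entry type, not the kernel's swp_entry_t; soft dirty handling and
the page table plumbing are omitted):

#include <stdio.h>

#define HPAGE_PMD_NR 512  /* assumes 4KB pages and 2MB huge pages */

/* Toy swap entry: an offset-carrying value, like swp_entry_t.val. */
struct toy_swp_entry {
    unsigned long val;
};

/*
 * Model of __split_huge_swap_pmd(): replace one PMD-level entry with
 * HPAGE_PMD_NR PTE-level entries for consecutive swap offsets.
 */
static void split_pmd_swap_entry_model(struct toy_swp_entry pmd_entry,
                                       struct toy_swp_entry *pte_entries)
{
    int i;

    for (i = 0; i < HPAGE_PMD_NR; i++)
        pte_entries[i].val = pmd_entry.val + i;
}

int main(void)
{
    struct toy_swp_entry ptes[HPAGE_PMD_NR];
    struct toy_swp_entry pmd = { .val = 4096 };

    split_pmd_swap_entry_model(pmd, ptes);
    printf("%lu %lu\n", ptes[0].val, ptes[HPAGE_PMD_NR - 1].val);
    return 0;
}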

Thanks to Daniel Jordan for testing and reporting a data corruption bug
caused by a misaligned address processing issue in __split_huge_swap_pmd().

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  4 
 include/linux/swap.h|  6 +
 mm/huge_memory.c| 49 -
 mm/swapfile.c   | 32 +++
 4 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4663ee96cf59..1c0fda003d6a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,10 @@ static inline bool is_huge_zero_page(struct page *page)
return READ_ONCE(huge_zero_page) == page;
 }
 
+/*
+ * is_huge_zero_pmd() must be called after checking pmd_present(),
+ * otherwise, it may report false positive for PMD swap entry.
+ */
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
return is_huge_zero_page(pmd_page(pmd));
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 24c3014894dd..a24d101b131d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -619,11 +619,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bd2543e10938..49df3e7c96c7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1617,6 +1617,41 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+/* Convert a PMD swap mapping to a set of PTE swap mappings */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   addr &= HPAGE_PMD_MASK;
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, addr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, addr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2082,7 +2117,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2106,7 +2141,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
@@ -2150,6 +2185,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pm

[PATCH -V9 12/21] swap: Add sysfs interface to configure THP swapin

2018-12-13 Thread Huang Ying
Swapping in a THP as a whole isn't desirable in some situations.  For
example, for a completely random access pattern, swapping in a THP in
one piece will greatly inflate the amount of data read.  So a sysfs
interface, /sys/kernel/mm/transparent_hugepage/swapin_enabled, is added
to configure it.  The following three options are provided,

- always: THP swapin will be enabled always

- madvise: THP swapin will be enabled only for VMA with VM_HUGEPAGE
  flag set.

- never: THP swapin will be disabled always

The default configuration is: madvise.

During page fault, if a PMD swap mapping is found and THP swapin is
disabled, the huge swap cluster and the PMD swap mapping are split and
the fault falls back to normal page swapin.
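
The knob can also be set from a program rather than the shell, for
example (minimal sketch; the sysfs path is the one introduced by this
patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char *path =
        "/sys/kernel/mm/transparent_hugepage/swapin_enabled";
    const char *mode = "madvise\n";
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror(path);
        return 1;
    }
    if (write(fd, mode, strlen(mode)) < 0)
        perror("write");
    close(fd);
    return 0;
}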

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 Documentation/admin-guide/mm/transhuge.rst | 21 +
 include/linux/huge_mm.h| 31 
 mm/huge_memory.c   | 93 +-
 3 files changed, 126 insertions(+), 19 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst 
b/Documentation/admin-guide/mm/transhuge.rst
index 85e33f785fd7..23aefb17101c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -160,6 +160,27 @@ Some userspace (such as a test program, or an optimized 
memory allocation
 
cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 
+Transparent hugepage may be swapout and swapin in one piece without
+splitting.  This will improve the utility of transparent hugepage but
+may inflate the read/write too.  So whether to enable swapin
+transparent hugepage in one piece can be configured as follow.
+
+   echo always >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+   echo madvise >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+   echo never >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+
+always
+   Attempt to allocate a transparent huge page and read it from
+   swap space in one piece every time.
+
+never
+   Always split the swap space and PMD swap mapping and swapin
+   the fault normal page during swapin.
+
+madvise
+   Only swapin the transparent huge page in one piece for
+   MADV_HUGEPAGE madvise regions.
+
 khugepaged will be automatically started when
 transparent_hugepage/enabled is set to "always" or "madvise, and it'll
 be automatically shutdown if it's set to "never".
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index debe3760e894..06dbbcf6a6dd 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -63,6 +63,8 @@ enum transparent_hugepage_flag {
 #ifdef CONFIG_DEBUG_VM
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
 #endif
+   TRANSPARENT_HUGEPAGE_SWAPIN_FLAG,
+   TRANSPARENT_HUGEPAGE_SWAPIN_REQ_MADV_FLAG,
 };
 
 struct kobject;
@@ -373,11 +375,40 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma)
 
 #ifdef CONFIG_THP_SWAP
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+
+static inline bool transparent_hugepage_swapin_enabled(
+   struct vm_area_struct *vma)
+{
+   if (vma->vm_flags & VM_NOHUGEPAGE)
+   return false;
+
+   if (is_vma_temporary_stack(vma))
+   return false;
+
+   if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+   return false;
+
+   if (transparent_hugepage_flags &
+   (1 << TRANSPARENT_HUGEPAGE_SWAPIN_FLAG))
+   return true;
+
+   if (transparent_hugepage_flags &
+   (1 << TRANSPARENT_HUGEPAGE_SWAPIN_REQ_MADV_FLAG))
+   return !!(vma->vm_flags & VM_HUGEPAGE);
+
+   return false;
+}
 #else /* CONFIG_THP_SWAP */
 static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
return 0;
 }
+
+static inline bool transparent_hugepage_swapin_enabled(
+   struct vm_area_struct *vma)
+{
+   return false;
+}
 #endif /* CONFIG_THP_SWAP */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e1e95e6c86e3..8e8952938c25 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -57,7 +57,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 #endif
(1<address);
if (!page) {
+   if (!transparent_hugepage_swapin_enabled(vma))
+   goto split;
+
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma,
 haddr, false);
if (!page) {
@@ -1695,24 +1749,8 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
  

[PATCH -V9 20/21] swap: Support PMD swap mapping in common path

2018-12-13 Thread Huang Ying
The original code handled only PMD migration entries; it is revised
here to support PMD swap mappings as well.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 fs/proc/task_mmu.c | 12 +---
 mm/gup.c   | 36 
 mm/huge_memory.c   |  7 ---
 mm/mempolicy.c |  2 +-
 4 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dc36909a73c6..fa41822574e1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -986,7 +986,7 @@ static inline void clear_soft_dirty_pmd(struct 
vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
 
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
-   } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+   } else if (is_swap_pmd(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@@ -1320,9 +1320,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long 
addr, unsigned long end,
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
-   }
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-   else if (is_swap_pmd(pmd)) {
+   } else if (IS_ENABLED(CONFIG_HAVE_PMD_SWAP_ENTRY) &&
+  is_swap_pmd(pmd)) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
unsigned long offset;
 
@@ -1335,10 +1334,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long 
addr, unsigned long end,
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
-   VM_BUG_ON(!is_pmd_migration_entry(pmd));
-   page = migration_entry_to_page(entry);
+   if (is_pmd_migration_entry(pmd))
+   page = migration_entry_to_page(entry);
}
-#endif
 
if (page && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
diff --git a/mm/gup.c b/mm/gup.c
index 6dd33e16a806..460565825ef0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -215,6 +215,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
+   swp_entry_t entry;
 
pmd = pmd_offset(pudp, address);
/*
@@ -242,18 +243,22 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
if (!pmd_present(pmdval)) {
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
-   VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(pmdval));
-   if (is_pmd_migration_entry(pmdval))
+   entry = pmd_to_swp_entry(pmdval);
+   if (thp_migration_supported() && is_migration_entry(entry)) {
pmd_migration_entry_wait(mm, pmd);
-   pmdval = READ_ONCE(*pmd);
-   /*
-* MADV_DONTNEED may convert the pmd to null because
-* mmap_sem is held in read mode
-*/
-   if (pmd_none(pmdval))
+   pmdval = READ_ONCE(*pmd);
+   /*
+* MADV_DONTNEED may convert the pmd to null because
+* mmap_sem is held in read mode
+*/
+   if (pmd_none(pmdval))
+   return no_page_table(vma, flags);
+   goto retry;
+   }
+   if (IS_ENABLED(CONFIG_THP_SWAP) && !non_swap_entry(entry))
return no_page_table(vma, flags);
-   goto retry;
+   WARN_ON(1);
+   return no_page_table(vma, flags);
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
@@ -275,11 +280,18 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
return no_page_table(vma, flags);
}
if (unlikely(!pmd_present(*pmd))) {
+   entry = pmd_to_swp_entry(*pmd);
spin_unlock(ptl);
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
-   pmd_migration_entry_wait(mm, pmd);
-   goto retry_locked;
+   if (thp_migration_supported() && is_migration_entry(entry)) {
+ 

[PATCH -V9 05/21] swap: Support PMD swap mapping in put_swap_page()

2018-12-13 Thread Huang Ying
Previously, during swapout, all PMD page mappings were split and
replaced with PTE swap mappings.  And when clearing the SWAP_HAS_CACHE
flag for the huge swap cluster in put_swap_page(), the huge swap
cluster was split.  Now, during swapout, the PMD page mappings to the
THP are changed to PMD swap mappings to the corresponding swap
cluster.  So when clearing the SWAP_HAS_CACHE flag, the huge swap
cluster will only be split if its PMD swap mapping count is 0;
otherwise, we keep it as a huge swap cluster, so that we can swap in
the THP in one piece later.
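
To make the split-or-free decision above concrete, here is a minimal
userspace model of this hunk's logic (SWAPFILE_CLUSTER, swap_map[] and
the PMD swap mapping count are reduced to plain C stand-ins; this is
an illustration of the policy, not the kernel code):

/*
 * Called when the THP backed by this cluster is removed from the
 * swap cache.  Keep the huge cluster whenever a PMD swap mapping
 * still references it, so it can later be swapped in as one piece.
 */
#include <stdbool.h>
#include <stdio.h>

#define CLUSTER_NR     512	/* SWAPFILE_CLUSTER for 2MB THP / 4KB pages */
#define SWAP_HAS_CACHE 0x40	/* entry referenced only by the swap cache */

struct cluster {
	unsigned char swap_map[CLUSTER_NR];	/* per-entry count | flags */
	int pmd_map_count;			/* # of PMD swap mappings */
	bool huge;				/* cluster backs a THP */
};

static void put_huge_swap_cluster(struct cluster *c)
{
	int free_entries = 0;

	if (c->pmd_map_count == 0) {
		/* No PMD swap mapping left: see if every entry is free. */
		for (int i = 0; i < CLUSTER_NR; i++)
			if (c->swap_map[i] == SWAP_HAS_CACHE)
				free_entries++;
		if (free_entries != CLUSTER_NR)
			c->huge = false;	/* split the huge cluster */
		else
			printf("free the whole cluster at once\n");
	}
	/* Otherwise a PMD swap mapping remains: keep the huge cluster. */
}

int main(void)
{
	struct cluster c = { .pmd_map_count = 1, .huge = true };

	for (int i = 0; i < CLUSTER_NR; i++)
		c.swap_map[i] = SWAP_HAS_CACHE | 0x01;	/* cache + 1 mapping */
	put_huge_swap_cluster(&c);
	printf("huge cluster kept: %s\n", c.huge ? "yes" : "no");
	return 0;
}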

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/swapfile.c | 31 ---
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd8756ac3bcc..04cf6b95cae0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1314,6 +1314,15 @@ void swap_free(swp_entry_t entry)
 
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
+ *
+ * When a THP is added into swap cache, the SWAP_HAS_CACHE flag will
+ * be set in the swap_map[] of all swap entries in the huge swap
+ * cluster backing the THP.  This huge swap cluster will not be split
+ * unless the THP is split even if its PMD swap mapping count dropped
+ * to 0.  Later, when the THP is removed from swap cache, the
+ * SWAP_HAS_CACHE flag will be cleared in the swap_map[] of all swap
+ * entries in the huge swap cluster.  And this huge swap cluster will
+ * be split if its PMD swap mapping count is 0.
  */
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1332,15 +1341,23 @@ void put_swap_page(struct page *page, swp_entry_t entry)
 
ci = lock_cluster_or_swap_info(si, offset);
if (size == SWAPFILE_CLUSTER) {
-   VM_BUG_ON(!cluster_is_huge(ci));
+   VM_BUG_ON(!IS_ALIGNED(offset, size));
map = si->swap_map + offset;
-   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   val = map[i];
-   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-   if (val == SWAP_HAS_CACHE)
-   free_entries++;
+   /*
+* No PMD swap mapping, the swap cluster will be freed
+* if all swap entries becoming free, otherwise the
+* huge swap cluster will be split.
+*/
+   if (!cluster_swapcount(ci)) {
+   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+   val = map[i];
+   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+   if (val == SWAP_HAS_CACHE)
+   free_entries++;
+   }
+   if (free_entries != SWAPFILE_CLUSTER)
+   cluster_clear_huge(ci);
}
-   cluster_clear_huge(ci);
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
-- 
2.18.1



Re: [PATCH -V9 10/21] swap: Swapin a THP in one piece

2018-12-18 Thread Huang, Ying
Daniel Jordan  writes:

> On Fri, Dec 14, 2018 at 02:27:43PM +0800, Huang Ying wrote:
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 1cec1eec340e..644cb5d6b056 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -33,6 +33,8 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>> +#include 
>
> swap.h is already #included in this file.

Will fix this, Thanks!

Best Regards,
Huang, Ying


Re: [PATCH -V9 07/21] swap: Support PMD swap mapping when splitting huge PMD

2018-12-18 Thread Huang, Ying
Hi, Daniel,

Daniel Jordan  writes:

> +Aneesh
>
> On Fri, Dec 14, 2018 at 02:27:40PM +0800, Huang Ying wrote:
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index bd2543e10938..49df3e7c96c7 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>
>> +int split_swap_cluster_map(swp_entry_t entry)
> ...
>> +VM_BUG_ON(!IS_ALIGNED(offset, SWAPFILE_CLUSTER));
>
> Hi Ying, I crashed on this in v6 as reported and it still dies on me now.

Sorry, I missed the original report for v6.  I should have been more careful.

>> @@ -2150,6 +2185,9 @@ static void __split_huge_pmd_locked(struct 
>> vm_area_struct *vma, pmd_t *pmd,
> ...
>> +if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(old_pmd))
>> +return __split_huge_swap_pmd(vma, haddr, pmd);
>
> Problem is 'pmd' is passed here, which has been pmdp_invalidate()ed under the
> assumption that it is not a swap entry.  pmd's pfn bits get inverted for L1TF,
> so the swap entry gets corrupted and this BUG is the first place that notices.
>
> I don't see a reason to invalidate so soon, so what about just moving the
> invalidation down, past the migration/swap checks?

Yes.  That can fix the issue.  I will fix this in that way.

Best Regards,
Huang, Ying


[PATCH 1/2] swap: Fix general protection fault when swapoff

2018-12-11 Thread Huang Ying
When VMA based swap readahead is used, which is the default if all
swap devices are SSDs, swapoff will trigger a general protection fault
as follows, because vmf->pmd isn't initialized when calling
swapin_readahead().  This fix could be folded into the patch "mm,
swap: rid swapoff of quadratic complexity" in the -mm patchset.

general protection fault:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
CPU: 3 PID: 352 Comm: swapoff Not tainted 4.20.0-rc5-mm1-kvm+ #535
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-1.fc28 
04/01/2014
RIP: 0010:swapin_readahead+0xb7/0x39b
Code: ff 01 00 00 40 f6 c7 80 49 0f 45 d0 48 21 d7 48 ba 00 00 00 00 80 88 ff 
ff 48 8d 14 f2 48 01 d7 48 ba ff ff ff ff ff ff ff ef <48> 39 17 0f 87 5d 01 00 
00 49 8b 95 b0 00 00 00 48 85 d2 75 05 ba
RSP: 0018:c94c3ca0 EFLAGS: 00010207
RAX: 55de5d252000 RBX: 00055de5d252 RCX: 0003
RDX: efff RSI: 0052 RDI: 000f0bc11c600290
RBP: c94c3d30 R08: 000fffe0 R09: c94c3ea0
R10: c94c3d50 R11: 0002 R12: c94c3da8
R13: 88803b71d780 R14: 0001 R15: 88803b71d780
FS:  7f87c20e02c0() GS:88803e80() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 55a439825398 CR3: 3b49a004 CR4: 00360ea0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 ? list_add_tail_rcu+0x19/0x31
 ? __lock_acquire+0xd61/0xe1c
 ? find_held_lock+0x2b/0x6e
 ? unuse_pte_range+0xe9/0x429
 unuse_pte_range+0xe9/0x429
 ? find_held_lock+0x2b/0x6e
 ? __lock_is_held+0x40/0x71
 try_to_unuse+0x311/0x54b
 __do_sys_swapoff+0x254/0x625
 ? lockdep_hardirqs_off+0x29/0x86
 ? do_syscall_64+0x12/0x65
 do_syscall_64+0x57/0x65
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Signed-off-by: "Huang, Ying" 
Cc: Vineeth Remanan Pillai 
Cc: Kelley Nielsen 
Cc: Rik van Riel 
Cc: Matthew Wilcox 
Cc: Hugh Dickins 
---
 mm/swapfile.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9ca162cc45dc..7464d0a92869 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1904,6 +1904,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, 
pmd_t *pmd,
swap_map = &si->swap_map[offset];
vmf.vma = vma;
vmf.address = addr;
+   vmf.pmd = pmd;
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
if (!page) {
if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
-- 
2.18.1



[PATCH 2/2] swap: Deal with PTE mapped THP when unuse PTE

2018-12-11 Thread Huang Ying
A PTE swap entry may map to a normal swap slot inside a huge swap
cluster.  To free the huge swap cluster and the corresponding
THP (transparent huge page), all PTE swap entry mappings need to be
unmapped.  The original implementation checks only the current PTE
swap entry mapping; this is fixed by calling try_to_free_swap()
instead, which checks all PTE swap mappings inside the huge swap
cluster.

This fix could be folded into the patch "mm, swap: rid swapoff of
quadratic complexity" in the -mm patchset.

Signed-off-by: "Huang, Ying" 
Cc: Vineeth Remanan Pillai 
Cc: Kelley Nielsen 
Cc: Rik van Riel 
Cc: Matthew Wilcox 
Cc: Hugh Dickins 
---
 mm/swapfile.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7464d0a92869..9e6da494781f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1921,10 +1921,8 @@ static int unuse_pte_range(struct vm_area_struct *vma, 
pmd_t *pmd,
goto out;
}
 
-   if (PageSwapCache(page) && (swap_count(*swap_map) == 0))
-   delete_from_swap_cache(compound_head(page));
+   try_to_free_swap(page);
 
-   SetPageDirty(page);
unlock_page(page);
put_page(page);
 
-- 
2.18.1



Re: [v4 PATCH 2/2] mm: swap: add comment for swap_vma_readahead

2019-01-02 Thread Huang, Ying
Yang Shi  writes:

> swap_vma_readahead()'s comment is missed, just add it.
>
> Cc: Huang Ying 
> Cc: Tim Chen 
> Cc: Minchan Kim 
> Signed-off-by: Yang Shi 
> ---
>  mm/swap_state.c | 17 +
>  1 file changed, 17 insertions(+)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 78d500e..dd8f698 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -698,6 +698,23 @@ static void swap_ra_info(struct vm_fault *vmf,
>   pte_unmap(orig_pte);
>  }
>  
> +/**
> + * swap_vm_readahead - swap in pages in hope we need them soon

s/swap_vm_readahead/swap_vma_readahead/

> + * @entry: swap entry of this memory
> + * @gfp_mask: memory allocation flags
> + * @vmf: fault information
> + *
> + * Returns the struct page for entry and addr, after queueing swapin.
> + *
> + * Primitive swap readahead code. We simply read in a few pages whoes
> + * virtual addresses are around the fault address in the same vma.
> + *
> + * This has been extended to use the NUMA policies from the mm triggering
> + * the readahead.

What is this?  I know you copied it from swap_cluster_readahead(), but
we have only one mm for vma readahead.

> + * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.

Better to make it explicit that you are talking about mmap_sem?

Best Regards,
Huang, Ying

> + *
> + */
>  static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
>  struct vm_fault *vmf)
>  {


Re: [v5 PATCH 2/2] mm: swap: add comment for swap_vma_readahead

2019-01-03 Thread Huang, Ying
Yang Shi  writes:

> swap_vma_readahead()'s comment is missed, just add it.
>
> Cc: Huang Ying 
> Cc: Tim Chen 
> Cc: Minchan Kim 
> Signed-off-by: Yang Shi 

Thanks!

Reviewed-by: "Huang, Ying" 

Best Regards,
Huang, Ying

> ---
> v5: Fixed the comments per Ying Huang
>
>  mm/swap_state.c | 16 +++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 78d500e..c8730d7 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -523,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
>   * This has been extended to use the NUMA policies from the mm triggering
>   * the readahead.
>   *
> - * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
> + * Caller must hold read mmap_sem if vmf->vma is not NULL.
>   */
>  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>   struct vm_fault *vmf)
> @@ -698,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf,
>   pte_unmap(orig_pte);
>  }
>  
> +/**
> + * swap_vma_readahead - swap in pages in hope we need them soon
> + * @entry: swap entry of this memory
> + * @gfp_mask: memory allocation flags
> + * @vmf: fault information
> + *
> + * Returns the struct page for entry and addr, after queueing swapin.
> + *
> + * Primitive swap readahead code. We simply read in a few pages whoes
> + * virtual addresses are around the fault address in the same vma.
> + *
> + * Caller must hold read mmap_sem if vmf->vma is not NULL.
> + *
> + */
>  static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
>  struct vm_fault *vmf)
>  {


[PATCH] mm, swap: Fix swapoff with KSM pages

2018-12-25 Thread Huang Ying
KSM pages may be mapped to multiple VMAs that cannot be reached from
one anon_vma.  So during swapin, a new copy of the page needs to be
generated if a different anon_vma is needed; please refer to the
comments of ksm_might_need_to_copy() for details.

During swapoff, unuse_vma() uses the anon_vma (if available) to locate
the VMA and virtual address mapped to the page, so not all mappings to
a swapped out KSM page can be found.  So in try_to_unuse(), even if
the swap count of a swap entry isn't zero, the page needs to be
deleted from the swap cache, so that in the next round a new page can
be allocated and swapped in for the other mappings of the swapped out
KSM page.

But this conflicts with the THP swap support, where the THP can be
deleted from the swap cache only after the swap count of every swap
entry in the huge swap cluster backing the THP has reached 0.  So
try_to_unuse() was changed in commit e07098294adf ("mm, THP, swap:
support to reclaim swap space for THP swapped out") to check that
before deleting a page from the swap cache, but this broke swapoff for
KSM pages.

Fortunately, KSM is used for normal pages only, so the original
behavior for KSM pages can be restored easily by checking
PageTransCompound().  That is how this patch works.
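
In other words, the check becomes the following predicate (a hedged,
standalone restatement of the one-line change below; the real logic
lives in try_to_unuse()):

#include <stdbool.h>
#include <stdio.h>

static bool should_delete_from_swap_cache(bool in_swap_cache,
					  bool private_matches_entry,
					  bool is_thp,
					  bool huge_cluster_still_swapped)
{
	if (!in_swap_cache || !private_matches_entry)
		return false;
	/*
	 * Normal page (including KSM pages): push it out of the swap
	 * cache so the next pass can swap in a fresh copy for the
	 * other anon_vma mappings.
	 */
	if (!is_thp)
		return true;
	/* THP: only once no swap entry of its huge cluster is mapped. */
	return !huge_cluster_still_swapped;
}

int main(void)
{
	/* A swapped-out KSM page with nonzero swap count is still deleted. */
	printf("%d\n", should_delete_from_swap_cache(true, true, false, true));
	return 0;
}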

Fixes: e07098294adf ("mm, THP, swap: support to reclaim swap space for THP 
swapped out")
Signed-off-by: "Huang, Ying" 
Reported-and-Tested-and-Acked-by: Hugh Dickins 
Cc: Rik van Riel 
Cc: Johannes Weiner 
Cc: Minchan Kim 
Cc: Shaohua Li 
Cc: Daniel Jordan 
---
 mm/swapfile.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8688ae65ef58..20d3c0f47a5f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2197,7 +2197,8 @@ int try_to_unuse(unsigned int type, bool frontswap,
 */
if (PageSwapCache(page) &&
likely(page_private(page) == entry.val) &&
-   !page_swapped(page))
+   (!PageTransCompound(page) ||
+!swap_page_trans_huge_swapped(si, entry)))
delete_from_swap_cache(compound_head(page));
 
/*
-- 
2.19.2



Re: [PATCH] mm, swap: Fix swapoff with KSM pages

2018-12-25 Thread Huang, Ying
Hi, Andrew,

This patch is based on Linus' tree instead of the head of the mmotm
tree because it fixes a bug there.

The bug was introduced by commit e07098294adf ("mm, THP, swap: support
to reclaim swap space for THP swapped out"), which was merged in
v4.14-rc1.  So I think we should backport the fix to stable kernels
from 4.14 on.  But Hugh thinks it may be rare for KSM pages to be in
the swap device at swapoff time, which is why nobody has reported the
bug so far.

Best Regards,
Huang, Ying

Huang Ying  writes:

> KSM pages may be mapped to the multiple VMAs that cannot be reached
> from one anon_vma.  So during swapin, a new copy of the page need to
> be generated if a different anon_vma is needed, please refer to
> comments of ksm_might_need_to_copy() for details.
>
> During swapoff, unuse_vma() uses anon_vma (if available) to locate VMA
> and virtual address mapped to the page, so not all mappings to a
> swapped out KSM page could be found.  So in try_to_unuse(), even if
> the swap count of a swap entry isn't zero, the page needs to be
> deleted from swap cache, so that, in the next round a new page could
> be allocated and swapin for the other mappings of the swapped out KSM
> page.
>
> But this contradicts with the THP swap support.  Where the THP could
> be deleted from swap cache only after the swap count of every swap
> entry in the huge swap cluster backing the THP has reach 0.  So
> try_to_unuse() is changed in commit e07098294adf ("mm, THP, swap:
> support to reclaim swap space for THP swapped out") to check that
> before delete a page from swap cache, but this has broken KSM swapoff
> too.
>
> Fortunately, KSM is for the normal pages only, so the original
> behavior for KSM pages could be restored easily via checking
> PageTransCompound().  That is how this patch works.
>
> Fixes: e07098294adf ("mm, THP, swap: support to reclaim swap space for THP 
> swapped out")
> Signed-off-by: "Huang, Ying" 
> Reported-and-Tested-and-Acked-by: Hugh Dickins 
> Cc: Rik van Riel 
> Cc: Johannes Weiner 
> Cc: Minchan Kim 
> Cc: Shaohua Li 
> Cc: Daniel Jordan 
> ---
>  mm/swapfile.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 8688ae65ef58..20d3c0f47a5f 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -2197,7 +2197,8 @@ int try_to_unuse(unsigned int type, bool frontswap,
>*/
>   if (PageSwapCache(page) &&
>   likely(page_private(page) == entry.val) &&
> - !page_swapped(page))
> + (!PageTransCompound(page) ||
> +  !swap_page_trans_huge_swapped(si, entry)))
>   delete_from_swap_cache(compound_head(page));
>  
>   /*


[PATCH -V6 10/21] swap: Support to count THP swapin and its fallback

2018-10-10 Thread Huang Ying
Two new /proc/vmstat fields, "thp_swpin" and "thp_swpin_fallback", are
added to count swapping in a THP from the swap device in one piece and
falling back to normal page swapin, respectively.
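
To watch the new counters on a running kernel, a small userspace C
program like the following can be used (only the two field names come
from the patch; everything else is plain stdio):

/* Print the THP swapin counters added by this patch from /proc/vmstat. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* matches thp_swpin and thp_swpin_fallback */
		if (!strncmp(line, "thp_swpin", 9))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}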

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 Documentation/admin-guide/mm/transhuge.rst |  8 
 include/linux/vm_event_item.h  |  2 ++
 mm/huge_memory.c   |  4 +++-
 mm/page_io.c   | 15 ---
 mm/vmstat.c|  2 ++
 5 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst 
b/Documentation/admin-guide/mm/transhuge.rst
index 7ab93a8404b9..85e33f785fd7 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -364,6 +364,14 @@ thp_swpout_fallback
Usually because failed to allocate some continuous swap space
for the huge page.
 
+thp_swpin
+   is incremented every time a huge page is swapin in one piece
+   without splitting.
+
+thp_swpin_fallback
+   is incremented if a huge page has to be split during swapin.
+   Usually because failed to allocate a huge page.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 47a3441cf4c4..c20b655cfdcc 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -88,6 +88,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_ZERO_PAGE_ALLOC_FAILED,
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
+   THP_SWPIN,
+   THP_SWPIN_FALLBACK,
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
BALLOON_INFLATE,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fbc9c9e30992..8efcc84fb4b0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1715,8 +1715,10 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
/* swapoff occurs under us */
} else if (ret == -EINVAL)
ret = 0;
-   else
+   else {
+   count_vm_event(THP_SWPIN_FALLBACK);
goto fallback;
+   }
}
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto out;
diff --git a/mm/page_io.c b/mm/page_io.c
index 573d3663d846..bcc2da750590 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -348,6 +348,15 @@ int __swap_writepage(struct page *page, struct 
writeback_control *wbc,
return ret;
 }
 
+static inline void count_swpin_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   if (unlikely(PageTransHuge(page)))
+   count_vm_event(THP_SWPIN);
+#endif
+   count_vm_events(PSWPIN, hpage_nr_pages(page));
+}
+
 int swap_readpage(struct page *page, bool synchronous)
 {
struct bio *bio;
@@ -371,7 +380,7 @@ int swap_readpage(struct page *page, bool synchronous)
 
ret = mapping->a_ops->readpage(swap_file, page);
if (!ret)
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
return ret;
}
 
@@ -382,7 +391,7 @@ int swap_readpage(struct page *page, bool synchronous)
unlock_page(page);
}
 
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
return 0;
}
 
@@ -401,7 +410,7 @@ int swap_readpage(struct page *page, bool synchronous)
get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
-   count_vm_event(PSWPIN);
+   count_swpin_vm_event(page);
bio_get(bio);
qc = submit_bio(bio);
while (synchronous) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d08ed044759d..823856fae136 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1264,6 +1264,8 @@ const char * const vmstat_text[] = {
"thp_zero_page_alloc_failed",
"thp_swpout",
"thp_swpout_fallback",
+   "thp_swpin",
+   "thp_swpin_fallback",
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
"balloon_inflate",
-- 
2.16.4



[PATCH -V6 04/21] swap: Support PMD swap mapping in put_swap_page()

2018-10-10 Thread Huang Ying
Previously, during swapout, all PMD page mappings were split and
replaced with PTE swap mappings.  And when clearing the SWAP_HAS_CACHE
flag for the huge swap cluster in put_swap_page(), the huge swap
cluster was split.  Now, during swapout, the PMD page mappings to the
THP are changed to PMD swap mappings to the corresponding swap
cluster.  So when clearing the SWAP_HAS_CACHE flag, the huge swap
cluster will only be split if its PMD swap mapping count is 0;
otherwise, we keep it as a huge swap cluster, so that we can swap in
the THP in one piece later.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/swapfile.c | 31 ---
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index a5a1ab46dab7..45c12abcb467 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1314,6 +1314,15 @@ void swap_free(swp_entry_t entry)
 
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
+ *
+ * When a THP is added into swap cache, the SWAP_HAS_CACHE flag will
+ * be set in the swap_map[] of all swap entries in the huge swap
+ * cluster backing the THP.  This huge swap cluster will not be split
+ * unless the THP is split even if its PMD swap mapping count dropped
+ * to 0.  Later, when the THP is removed from swap cache, the
+ * SWAP_HAS_CACHE flag will be cleared in the swap_map[] of all swap
+ * entries in the huge swap cluster.  And this huge swap cluster will
+ * be split if its PMD swap mapping count is 0.
  */
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1332,15 +1341,23 @@ void put_swap_page(struct page *page, swp_entry_t entry)
 
ci = lock_cluster_or_swap_info(si, offset);
if (size == SWAPFILE_CLUSTER) {
-   VM_BUG_ON(!cluster_is_huge(ci));
+   VM_BUG_ON(!IS_ALIGNED(offset, size));
map = si->swap_map + offset;
-   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   val = map[i];
-   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-   if (val == SWAP_HAS_CACHE)
-   free_entries++;
+   /*
+* No PMD swap mapping, the swap cluster will be freed
+* if all swap entries becoming free, otherwise the
+* huge swap cluster will be split.
+*/
+   if (!cluster_swapcount(ci)) {
+   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+   val = map[i];
+   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+   if (val == SWAP_HAS_CACHE)
+   free_entries++;
+   }
+   if (free_entries != SWAPFILE_CLUSTER)
+   cluster_clear_huge(ci);
}
-   cluster_clear_huge(ci);
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
-- 
2.16.4



[PATCH -V6 08/21] swap: Support to read a huge swap cluster for swapin a THP

2018-10-10 Thread Huang Ying
To swap in a THP in one piece, we need to read a huge swap cluster
from the swap device.  This patch revises __read_swap_cache_async()
and its callers and callees to support this.  If
__read_swap_cache_async() finds that the swap cluster of the specified
swap entry is huge, it will try to allocate a THP and add it into the
swap cache, so that later the contents of the huge swap cluster can be
read into the THP.
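
The page-reuse rule this adds to __read_swap_cache_async() can be
sketched in isolation: reuse a page left over from a previous loop
iteration only if its size matches the swap entry's cluster size,
otherwise drop it and allocate one of the right size.  A minimal
userspace sketch with dummy types (not the kernel API):

#include <stdio.h>
#include <stdlib.h>

#define HPAGE_NR 512			/* pages per THP */

struct page { int nr_pages; };		/* dummy stand-in */

static struct page *alloc_dummy_page(int nr)
{
	struct page *p = malloc(sizeof(*p));

	if (p)
		p->nr_pages = nr;
	return p;
}

static struct page *page_for_swapin(struct page *new_page, int entry_size)
{
	/* Drop a leftover page whose order does not match the entry. */
	if (new_page && new_page->nr_pages != entry_size) {
		free(new_page);
		new_page = NULL;
	}
	if (!new_page)
		new_page = alloc_dummy_page(entry_size);	/* THP or base page */
	return new_page;
}

int main(void)
{
	struct page *p = alloc_dummy_page(1);

	p = page_for_swapin(p, HPAGE_NR);	/* huge cluster: reallocate */
	printf("allocated %d page(s)\n", p ? p->nr_pages : 0);
	free(p);
	return 0;
}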

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  8 +++
 include/linux/swap.h|  4 ++--
 mm/huge_memory.c|  3 ++-
 mm/swap_state.c | 59 -
 mm/swapfile.c   |  9 +---
 5 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 0f3e1739986f..a0e7f4f9c12b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -250,6 +250,8 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
 }
 
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma,
+   unsigned long addr);
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -363,6 +365,12 @@ static inline bool thp_migration_supported(void)
 {
return false;
 }
+
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+   return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 60fd5189fde9..f2daf3fbdd4b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -457,7 +457,7 @@ extern sector_t map_swap_page(struct page *, struct 
block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int __swap_count(swp_entry_t entry);
-extern int __swp_swapcount(swp_entry_t entry);
+extern int __swp_swapcount(swp_entry_t entry, int *entry_size);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
@@ -585,7 +585,7 @@ static inline int __swap_count(swp_entry_t entry)
return 0;
 }
 
-static inline int __swp_swapcount(swp_entry_t entry)
+static inline int __swp_swapcount(swp_entry_t entry, int *entry_size)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 92e0cdb99c5a..a025494dd828 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,7 +629,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, 
unsigned long addr)
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma,
+   unsigned long addr)
 {
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
gfp_t this_node = 0;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bca34fc7a5e5..784ad6388da0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -361,7 +361,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, 
gfp_t gfp_mask,
 {
struct page *found_page = NULL, *new_page = NULL;
struct swap_info_struct *si;
-   int err;
+   int err, entry_size = 1;
+   swp_entry_t hentry;
+
*new_page_allocated = false;
 
do {
@@ -387,14 +389,42 @@ struct page *__read_swap_cache_async(swp_entry_t entry, 
gfp_t gfp_mask,
 * as SWAP_HAS_CACHE.  That's done in later part of code or
 * else swap_off will be aborted if we return NULL.
 */
-   if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
+   if (!__swp_swapcount(entry, &entry_size) &&
+   swap_slot_cache_enabled)
break;
 
/*
 * Get a new page to read into from swap.
 */
-   if (!new_page) {
-   new_page = alloc_page_vma(gfp_mask, vma, addr);
+   if (!new_page ||
+   (IS_ENABLED(CONFIG_THP_SWAP) &&
+hpage_nr_pages(new_page) != entry_size)) {
+   if (new_page)
+   put_page(new_page);
+   if (IS_ENABLED(CONFIG_THP_SWAP) &&
+   entry_size == HPAGE_PMD_NR) {
+   gfp_t gfp;
+
+

[PATCH -V6 07/21] swap: Support PMD swap mapping in split_swap_cluster()

2018-10-10 Thread Huang Ying
When splitting a THP in the swap cache, or when failing to allocate a
THP while swapping in a huge swap cluster, the huge swap cluster will
be split.  In addition to clearing the huge flag of the swap cluster,
the PMD swap mapping count recorded in cluster_count() will be set to
0.  But the PMD swap mappings themselves are not touched, because it
can be hard to find them all.  When a PMD swap mapping is operated on
later, it will be found that the huge swap cluster has been split, and
the PMD swap mapping will be split at that time.

Unless splitting a THP in the swap cache (specified via the "force"
parameter), split_swap_cluster() will return -EEXIST if the
SWAP_HAS_CACHE flag is set in swap_map[offset], because this indicates
that a THP corresponds to this huge swap cluster and it isn't
desirable to split that THP.

When splitting a THP in the swap cache, the call to
split_swap_cluster() is moved to before unlocking the sub-pages, so
that all sub-pages are kept locked from the time the THP is split
until the huge swap cluster is split.  This makes the code much easier
to reason about.
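
A compact userspace model of the policy described above (the cluster
state is reduced to three fields; the flag name and the -EEXIST return
code follow the patch, everything else is illustrative):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define SSC_SPLIT_CACHED 0x1

struct cluster {
	bool has_cache;		/* SWAP_HAS_CACHE set: a THP owns the cluster */
	bool huge;		/* cluster_is_huge() */
	int pmd_map_count;	/* PMD swap mapping count */
};

static int split_swap_cluster_model(struct cluster *c, unsigned long flags)
{
	/*
	 * Only the THP-split path passes SSC_SPLIT_CACHED and may split
	 * a huge cluster that still backs a THP in the swap cache.
	 */
	if (c->has_cache && !(flags & SSC_SPLIT_CACHED))
		return -EEXIST;
	c->huge = false;
	c->pmd_map_count = 0;	/* PMD swap mappings are split lazily later */
	return 0;
}

int main(void)
{
	struct cluster c = { .has_cache = true, .huge = true, .pmd_map_count = 1 };

	printf("swapin path:    %d\n", split_swap_cluster_model(&c, 0));
	printf("THP split path: %d\n", split_swap_cluster_model(&c, SSC_SPLIT_CACHED));
	return 0;
}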

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/swap.h |  6 --
 mm/huge_memory.c | 18 ++--
 mm/swapfile.c| 58 +---
 3 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 9bb3f73b5d68..60fd5189fde9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -612,11 +612,13 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #endif /* CONFIG_SWAP */
 
+#define SSC_SPLIT_CACHED   0x1
+
 #ifdef CONFIG_THP_SWAP
-extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster(swp_entry_t entry, unsigned long flags);
 extern int split_swap_cluster_map(swp_entry_t entry);
 #else
-static inline int split_swap_cluster(swp_entry_t entry)
+static inline int split_swap_cluster(swp_entry_t entry, unsigned long flags)
 {
return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f1c74487576..92e0cdb99c5a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2517,6 +2517,17 @@ static void __split_huge_page(struct page *page, struct 
list_head *list,
 
unfreeze_page(head);
 
+   /*
+* Split swap cluster before unlocking sub-pages.  So all
+* sub-pages will be kept locked from THP has been split to
+* swap cluster is split.
+*/
+   if (PageSwapCache(head)) {
+   swp_entry_t entry = { .val = page_private(head) };
+
+   split_swap_cluster(entry, SSC_SPLIT_CACHED);
+   }
+
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
if (subpage == page)
@@ -2740,12 +2751,7 @@ int split_huge_page_to_list(struct page *page, struct 
list_head *list)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
-   if (PageSwapCache(head)) {
-   swp_entry_t entry = { .val = page_private(head) };
-
-   ret = split_swap_cluster(entry);
-   } else
-   ret = 0;
+   ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fa6b81b4e185..2020bd494419 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1469,23 +1469,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
unlock_cluster_or_swap_info(si, ci);
 }
 
-#ifdef CONFIG_THP_SWAP
-int split_swap_cluster(swp_entry_t entry)
-{
-   struct swap_info_struct *si;
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
-
-   si = _swap_info_get(entry);
-   if (!si)
-   return -EBUSY;
-   ci = lock_cluster(si, offset);
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   return 0;
-}
-#endif
-
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
const swp_entry_t *e1 = ent1, *e2 = ent2;
@@ -4066,6 +4049,47 @@ int split_swap_cluster_map(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
+
+/*
+ * We will not try to split all PMD swap mappings to the swap cluster,
+ * because we haven't enough information available for that.  Later,
+ * when the PMD swap mapping is duplicated or swapin, etc, the PMD
+ * swap mapping will be split and fallback to the PTE operations.
+ */
+int split_swap_cluster(swp_entry_t entry, unsigned long flags)
+{
+

[PATCH -V6 09/21] swap: Swapin a THP in one piece

2018-10-10 Thread Huang Ying
With this patch, when the page fault handler finds a PMD swap mapping,
it will swap in a THP in one piece.  This avoids the overhead of
splitting/collapsing before/after the THP swapping, and it improves
swap performance greatly due to the reduced page fault count, etc.

do_huge_pmd_swap_page() is added in the patch to implement this.  It
is similar to do_swap_page() for normal page swapin.

If a THP cannot be allocated, the huge swap cluster and the PMD swap
mapping will be split to fall back to normal page swapin.

If the huge swap cluster has already been split, the PMD swap mapping
will be split to fall back to normal page swapin.
Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |   9 +++
 mm/huge_memory.c| 174 
 mm/memory.c |  16 +++--
 3 files changed, 193 insertions(+), 6 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a0e7f4f9c12b..d88579cb059a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -373,4 +373,13 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifdef CONFIG_THP_SWAP
+extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+#else /* CONFIG_THP_SWAP */
+static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+   return 0;
+}
+#endif /* CONFIG_THP_SWAP */
+
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a025494dd828..fbc9c9e30992 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -1659,6 +1661,178 @@ static void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
pmd_populate(mm, pmd, pgtable);
 }
 
+#ifdef CONFIG_THP_SWAP
+static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+  unsigned long address, pmd_t orig_pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   spinlock_t *ptl;
+   int ret = 0;
+
+   ptl = pmd_lock(mm, pmd);
+   if (pmd_same(*pmd, orig_pmd))
+   __split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd);
+   else
+   ret = -ENOENT;
+   spin_unlock(ptl);
+
+   return ret;
+}
+
+int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+   struct page *page;
+   struct mem_cgroup *memcg;
+   struct vm_area_struct *vma = vmf->vma;
+   unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+   swp_entry_t entry;
+   pmd_t pmd;
+   int i, locked, exclusive = 0, ret = 0;
+
+   entry = pmd_to_swp_entry(orig_pmd);
+   VM_BUG_ON(non_swap_entry(entry));
+   delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+retry:
+   page = lookup_swap_cache(entry, NULL, vmf->address);
+   if (!page) {
+   page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma,
+haddr, false);
+   if (!page) {
+   /*
+* Back out if somebody else faulted in this pmd
+* while we released the pmd lock.
+*/
+   if (likely(pmd_same(*vmf->pmd, orig_pmd))) {
+   /*
+* Failed to allocate huge page, split huge swap
+* cluster, and fallback to swapin normal page
+*/
+   ret = split_swap_cluster(entry, 0);
+   /* Somebody else swapin the swap entry, retry */
+   if (ret == -EEXIST) {
+   ret = 0;
+   goto retry;
+   /* swapoff occurs under us */
+   } else if (ret == -EINVAL)
+   ret = 0;
+   else
+   goto fallback;
+   }
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   goto out;
+   }
+
+   /* Had to read the page from swap area: Major fault */
+   ret = VM_FAULT_MAJOR;
+   count_vm_event(PGMAJFAULT);
+   count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+   } else if (!PageTransCompound(page))
+   goto fallback;
+
+   locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   if (!locked) {
+   ret

[PATCH -V6 19/21] swap: Support PMD swap mapping in common path

2018-10-10 Thread Huang Ying
The original code handles only PMD migration entries; it is revised
here to support PMD swap mappings as well.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 fs/proc/task_mmu.c | 12 +---
 mm/gup.c   | 36 
 mm/huge_memory.c   |  7 ---
 mm/mempolicy.c |  2 +-
 4 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0995a84c78dc..befac96b42d9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -984,7 +984,7 @@ static inline void clear_soft_dirty_pmd(struct 
vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
 
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
-   } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+   } else if (is_swap_pmd(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@@ -1314,9 +1314,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long 
addr, unsigned long end,
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
-   }
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-   else if (is_swap_pmd(pmd)) {
+   } else if (IS_ENABLED(CONFIG_HAVE_PMD_SWAP_ENTRY) &&
+  is_swap_pmd(pmd)) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
unsigned long offset;
 
@@ -1329,10 +1328,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long 
addr, unsigned long end,
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
-   VM_BUG_ON(!is_pmd_migration_entry(pmd));
-   page = migration_entry_to_page(entry);
+   if (is_pmd_migration_entry(pmd))
+   page = migration_entry_to_page(entry);
}
-#endif
 
if (page && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
diff --git a/mm/gup.c b/mm/gup.c
index 08eb350e0f35..17fd850aa2cc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -216,6 +216,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
+   swp_entry_t entry;
 
pmd = pmd_offset(pudp, address);
/*
@@ -243,18 +244,22 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
if (!pmd_present(pmdval)) {
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
-   VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(pmdval));
-   if (is_pmd_migration_entry(pmdval))
+   entry = pmd_to_swp_entry(pmdval);
+   if (thp_migration_supported() && is_migration_entry(entry)) {
pmd_migration_entry_wait(mm, pmd);
-   pmdval = READ_ONCE(*pmd);
-   /*
-* MADV_DONTNEED may convert the pmd to null because
-* mmap_sem is held in read mode
-*/
-   if (pmd_none(pmdval))
+   pmdval = READ_ONCE(*pmd);
+   /*
+* MADV_DONTNEED may convert the pmd to null because
+* mmap_sem is held in read mode
+*/
+   if (pmd_none(pmdval))
+   return no_page_table(vma, flags);
+   goto retry;
+   }
+   if (IS_ENABLED(CONFIG_THP_SWAP) && !non_swap_entry(entry))
return no_page_table(vma, flags);
-   goto retry;
+   WARN_ON(1);
+   return no_page_table(vma, flags);
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
@@ -276,11 +281,18 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
return no_page_table(vma, flags);
}
if (unlikely(!pmd_present(*pmd))) {
+   entry = pmd_to_swp_entry(*pmd);
spin_unlock(ptl);
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
-   pmd_migration_entry_wait(mm, pmd);
-   goto retry_locked;
+   if (thp_migration_supported() && is_migration_entry(entry)) {
+ 

[PATCH -V6 18/21] swap: Support PMD swap mapping in mincore()

2018-10-10 Thread Huang Ying
During mincore(), for a PMD swap mapping, the swap cache is looked up.
If the resulting page isn't a compound page, the PMD swap mapping will
be split and processing falls back to PTE swap mappings.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/mincore.c | 37 +++--
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index aa0e542569f9..1d861fac82ee 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -48,7 +48,8 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, 
unsigned long addr,
  * and is up to date; i.e. that no page-in operation would be required
  * at this time if an application were to map and access this page.
  */
-static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff,
+ bool *compound)
 {
unsigned char present = 0;
struct page *page;
@@ -86,6 +87,8 @@ static unsigned char mincore_page(struct address_space 
*mapping, pgoff_t pgoff)
 #endif
if (page) {
present = PageUptodate(page);
+   if (compound)
+   *compound = PageCompound(page);
put_page(page);
}
 
@@ -103,7 +106,8 @@ static int __mincore_unmapped_range(unsigned long addr, 
unsigned long end,
 
pgoff = linear_page_index(vma, addr);
for (i = 0; i < nr; i++, pgoff++)
-   vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+   vec[i] = mincore_page(vma->vm_file->f_mapping,
+ pgoff, NULL);
} else {
for (i = 0; i < nr; i++)
vec[i] = 0;
@@ -127,14 +131,36 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+   swp_entry_t entry;
 
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
-   memset(vec, 1, nr);
+   unsigned char val = 1;
+   bool compound;
+
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(*pmd)) {
+   entry = pmd_to_swp_entry(*pmd);
+   if (!non_swap_entry(entry)) {
+   val = mincore_page(swap_address_space(entry),
+  swp_offset(entry),
+  &compound);
+   /*
+* The huge swap cluster has been
+* split under us
+*/
+   if (!compound) {
+   __split_huge_swap_pmd(vma, addr, pmd);
+   spin_unlock(ptl);
+   goto fallback;
+   }
+   }
+   }
+   memset(vec, val, nr);
spin_unlock(ptl);
goto out;
}
 
+fallback:
if (pmd_trans_unstable(pmd)) {
__mincore_unmapped_range(addr, end, vma, vec);
goto out;
@@ -150,8 +176,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
else if (pte_present(pte))
*vec = 1;
else { /* pte is a swap entry */
-   swp_entry_t entry = pte_to_swp_entry(pte);
-
+   entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
/*
 * migration or hwpoison entries are always
@@ -161,7 +186,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long 
addr, unsigned long end,
} else {
 #ifdef CONFIG_SWAP
*vec = mincore_page(swap_address_space(entry),
-   swp_offset(entry));
+   swp_offset(entry), NULL);
 #else
WARN_ON(1);
*vec = 1;
-- 
2.16.4



[PATCH -V6 15/21] swap: Support to copy PMD swap mapping when fork()

2018-10-10 Thread Huang Ying
During fork, the page table needs to be copied from parent to child.
A PMD swap mapping needs to be copied too, and the swap reference
count needs to be increased.

When the huge swap cluster has already been split, we need to split
the PMD swap mapping and fall back to PTE copying.

When swap count continuation fails to allocate a page with GFP_ATOMIC,
we need to unlock the spinlock and try again with GFP_KERNEL.
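
The GFP_ATOMIC/GFP_KERNEL retry described above follows the usual
drop-the-lock-and-retry pattern; here is a minimal userspace sketch of
that control flow (the helpers are stand-ins for swap_duplicate() and
add_swap_count_continuation(), not the real API):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool continuation_available;

static int try_dup_locked(void)
{
	/* swap_duplicate(&entry, HPAGE_PMD_NR) under the page table locks */
	return continuation_available ? 0 : -ENOMEM;
}

static void alloc_continuation_blocking(void)
{
	/* add_swap_count_continuation(entry, GFP_KERNEL): may sleep */
	continuation_available = true;
}

int main(void)
{
	int ret;

retry:
	/* take dst_ptl, src_ptl ... */
	ret = try_dup_locked();
	/* ... release src_ptl, dst_ptl */
	if (ret == -ENOMEM) {
		alloc_continuation_blocking();
		goto retry;		/* redo the copy from the top */
	}
	printf("copied PMD swap mapping: %s\n", ret ? "no" : "yes");
	return 0;
}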

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 72 
 1 file changed, 57 insertions(+), 15 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ebd043528309..74c8621619cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -987,6 +987,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct 
mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
 
+retry:
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -994,26 +995,67 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct 
mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
 
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
 
-   VM_BUG_ON(!is_pmd_migration_entry(pmd));
-   if (is_write_migration_entry(entry)) {
-   make_migration_entry_read(&entry);
-   pmd = swp_entry_to_pmd(entry);
-   if (pmd_swp_soft_dirty(*src_pmd))
-   pmd = pmd_swp_mksoft_dirty(pmd);
-   set_pmd_at(src_mm, addr, src_pmd, pmd);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+   if (is_migration_entry(entry)) {
+   if (is_write_migration_entry(entry)) {
+   make_migration_entry_read(&entry);
+   pmd = swp_entry_to_pmd(entry);
+   if (pmd_swp_soft_dirty(*src_pmd))
+   pmd = pmd_swp_mksoft_dirty(pmd);
+   set_pmd_at(src_mm, addr, src_pmd, pmd);
+   }
+   add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+   mm_inc_nr_ptes(dst_mm);
+   pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+   set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+   ret = 0;
+   goto out_unlock;
}
-   add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-   mm_inc_nr_ptes(dst_mm);
-   pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-   set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-   ret = 0;
-   goto out_unlock;
-   }
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && !non_swap_entry(entry)) {
+   ret = swap_duplicate(&entry, HPAGE_PMD_NR);
+   if (!ret) {
+   add_mm_counter(dst_mm, MM_SWAPENTS,
+  HPAGE_PMD_NR);
+   mm_inc_nr_ptes(dst_mm);
+   pgtable_trans_huge_deposit(dst_mm, dst_pmd,
+  pgtable);
+   set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+   /* make sure dst_mm is on swapoff's mmlist. */
+   if (unlikely(list_empty(&dst_mm->mmlist))) {
+   spin_lock(&mmlist_lock);
+   if (list_empty(&dst_mm->mmlist))
+   list_add(&dst_mm->mmlist,
+&src_mm->mmlist);
+   spin_unlock(&mmlist_lock);
+   }
+   } else if (ret == -ENOTDIR) {
+   /*
+* The huge swap cluster has been split, split
+* the PMD swap mapping and fallback to PTE
+*/
+   __split_huge_swap_pmd(vma, addr, src_pmd);
+   pte_free(dst_mm, pgtable);
+   } else if (ret == -ENOMEM) {
+   spin_unlock(src_ptl);
+   spin_unlock(dst_ptl);
+   ret = add_swap_count_continuation(en

[PATCH -V6 17/21] swap: Support PMD swap mapping for MADV_WILLNEED

2018-10-10 Thread Huang Ying
During MADV_WILLNEED, for a PMD swap mapping, if THP swapin is enabled
for the VMA, the whole swap cluster will be swapped in.  Otherwise,
the huge swap cluster and the PMD swap mapping will be split, falling
back to PTE swap mappings.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/madvise.c | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 20101ff125d0..0413659ff6ba 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -196,14 +196,36 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned 
long start,
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
+   swp_entry_t entry;
+   struct page *page;
+   pmd_t pmdval;
+
+   pmdval = *pmd;
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(pmdval) &&
+   !is_pmd_migration_entry(pmdval)) {
+   entry = pmd_to_swp_entry(pmdval);
+   if (!transparent_hugepage_swapin_enabled(vma)) {
+   if (!split_swap_cluster(entry, 0))
+   split_huge_swap_pmd(vma, pmd, start, pmdval);
+   } else {
+   page = read_swap_cache_async(entry,
+GFP_HIGHUSER_MOVABLE,
+vma, start, false);
+   if (page) {
+   /* The swap cluster has been split under us */
+   if (!PageTransHuge(page))
+   split_huge_swap_pmd(vma, pmd, start,
+   pmdval);
+   put_page(page);
+   }
+   }
+   }
 
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
 
for (index = start; index != end; index += PAGE_SIZE) {
pte_t pte;
-   swp_entry_t entry;
-   struct page *page;
spinlock_t *ptl;
 
orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
-- 
2.16.4



[PATCH -V6 13/21] swap: Support PMD swap mapping in madvise_free()

2018-10-10 Thread Huang Ying
When madvise_free() found a PMD swap mapping, if only part of the huge
swap cluster is operated on, the PMD swap mapping will be split and
fallback to PTE swap mapping processing.  Otherwise, if all huge swap
cluster is operated on, free_swap_and_cache() will be called to
decrease the PMD swap mapping count and probably free the swap space
and the THP in swap cache too.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 54 +++---
 mm/madvise.c |  2 +-
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0ec71f907fa5..60b4105734b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1891,6 +1891,15 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
 }
 #endif
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+   pgtable_t pgtable;
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pte_free(mm, pgtable);
+   mm_dec_nr_ptes(mm);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -1911,15 +1920,39 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, 
struct vm_area_struct *vma,
goto out_unlocked;
 
orig_pmd = *pmd;
-   if (is_huge_zero_pmd(orig_pmd))
-   goto out;
-
if (unlikely(!pmd_present(orig_pmd))) {
-   VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
-   goto out;
+   swp_entry_t entry = pmd_to_swp_entry(orig_pmd);
+
+   if (is_migration_entry(entry)) {
+   VM_BUG_ON(!thp_migration_supported());
+   goto out;
+   } else if (IS_ENABLED(CONFIG_THP_SWAP) &&
+  !non_swap_entry(entry)) {
+   /*
+* If part of THP is discarded, split the PMD
+* swap mapping and operate on the PTEs
+*/
+   if (next - addr != HPAGE_PMD_SIZE) {
+   unsigned long haddr = addr & HPAGE_PMD_MASK;
+
+   __split_huge_swap_pmd(vma, haddr, pmd);
+   goto out;
+   }
+   free_swap_and_cache(entry, HPAGE_PMD_NR);
+   pmd_clear(pmd);
+   zap_deposited_table(mm, pmd);
+   if (current->mm == mm)
+   sync_mm_rss(mm);
+   add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+   ret = true;
+   goto out;
+   } else
+   VM_BUG_ON(1);
}
 
+   if (is_huge_zero_pmd(orig_pmd))
+   goto out;
+
page = pmd_page(orig_pmd);
/*
 * If other processes are mapping this page, we couldn't discard
@@ -1965,15 +1998,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, 
struct vm_area_struct *vma,
return ret;
 }
 
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
-{
-   pgtable_t pgtable;
-
-   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-   pte_free(mm, pgtable);
-   mm_dec_nr_ptes(mm);
-}
-
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 pmd_t *pmd, unsigned long addr)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index 50282ba862e2..20101ff125d0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,7 +321,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long 
addr,
unsigned long next;
 
next = pmd_addr_end(addr, end);
-   if (pmd_trans_huge(*pmd))
+   if (pmd_trans_huge(*pmd) || is_swap_pmd(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
goto next;
 
-- 
2.16.4



[PATCH -V6 20/21] swap: create PMD swap mapping when unmap the THP

2018-10-10 Thread Huang Ying
This is the final step of the THP swapin support.  When reclaiming an
anonymous THP, after allocating the huge swap cluster and adding the
THP into the swap cache, the PMD page mapping will be changed into a
mapping to the swap space.  Previously, the PMD page mapping was split
before being changed.  In this patch, the unmap code is enhanced not
to split the PMD mapping, but to create a PMD swap mapping to replace
it instead.  So later, when the SWAP_HAS_CACHE flag is cleared in the
last step of swapout, the huge swap cluster will be kept instead of
being split, and at swapin time the huge swap cluster will be read in
one piece into a THP.  That is, the THP will not be split during
swapout/swapin.  This eliminates the overhead of splitting/collapsing,
reduces the page fault count, etc.  More importantly, THP utilization
is improved greatly; that is, many more THPs will be kept when
swapping is used, so that we can take full advantage of THP, including
its high swapout/swapin performance.
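
As a rough picture of what "create a PMD swap mapping" means here, a
toy userspace model follows (the swp_entry_t/pmd_t typedefs, the bit
layout and the counter handling are simplified assumptions for
illustration, not the kernel's real encoding):

#include <stdint.h>
#include <stdio.h>

#define HPAGE_PMD_NR	512
#define SWP_TYPE_BITS	5

typedef struct { uint64_t val; } swp_entry_t;	/* toy encoding */
typedef struct { uint64_t val; } pmd_t;		/* toy non-present PMD */

static swp_entry_t swp_entry(unsigned type, uint64_t offset)
{
	return (swp_entry_t){ (offset << SWP_TYPE_BITS) | type };
}

static pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
	/* Shift past the "present" bit (bit 0), which stays clear. */
	return (pmd_t){ entry.val << 1 };
}

int main(void)
{
	long mm_anonpages = HPAGE_PMD_NR, mm_swapents = 0;
	swp_entry_t entry = swp_entry(1, 0x1000);	/* cluster start */
	pmd_t swp_pmd = swp_entry_to_pmd(entry);

	/* One THP leaves the page tables, 512 swap references appear. */
	mm_anonpages -= HPAGE_PMD_NR;
	mm_swapents  += HPAGE_PMD_NR;
	printf("swap pmd %#llx, anon RSS %ld, swap entries %ld\n",
	       (unsigned long long)swp_pmd.val, mm_anonpages, mm_swapents);
	return 0;
}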

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h | 11 +++
 mm/huge_memory.c| 30 ++
 mm/rmap.c   | 43 ++-
 mm/vmscan.c |  6 +-
 4 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e573774f9014..f6370e8c7742 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -375,6 +375,8 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+struct page_vma_mapped_walk;
+
 #ifdef CONFIG_THP_SWAP
 extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
  unsigned long haddr,
@@ -382,6 +384,8 @@ extern void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+extern bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+   struct page *page, unsigned long address, pmd_t pmdval);
 
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
@@ -423,6 +427,13 @@ static inline int do_huge_pmd_swap_page(struct vm_fault 
*vmf, pmd_t orig_pmd)
return 0;
 }
 
+static inline bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page, unsigned long address,
+ pmd_t pmdval)
+{
+   return false;
+}
+
 static inline bool transparent_hugepage_swapin_enabled(
struct vm_area_struct *vma)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abefc50b08b7..87795529c547 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1931,6 +1931,36 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
count_vm_event(THP_SWPIN_FALLBACK);
goto fallback;
 }
+
+bool set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw, struct page *page,
+   unsigned long address, pmd_t pmdval)
+{
+   struct vm_area_struct *vma = pvmw->vma;
+   struct mm_struct *mm = vma->vm_mm;
+   pmd_t swp_pmd;
+   swp_entry_t entry = { .val = page_private(page) };
+
+   if (swap_duplicate(&entry, HPAGE_PMD_NR) < 0) {
+   set_pmd_at(mm, address, pvmw->pmd, pmdval);
+   return false;
+   }
+   if (list_empty(&mm->mmlist)) {
+   spin_lock(&mmlist_lock);
+   if (list_empty(&mm->mmlist))
+   list_add(&mm->mmlist, &init_mm.mmlist);
+   spin_unlock(&mmlist_lock);
+   }
+   add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+   add_mm_counter(mm, MM_SWAPENTS, HPAGE_PMD_NR);
+   swp_pmd = swp_entry_to_pmd(entry);
+   if (pmd_soft_dirty(pmdval))
+   swp_pmd = pmd_swp_mksoft_dirty(swp_pmd);
+   set_pmd_at(mm, address, pvmw->pmd, swp_pmd);
+
+   page_remove_rmap(page, true);
+   put_page(page);
+   return true;
+}
 #endif
 
 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
diff --git a/mm/rmap.c b/mm/rmap.c
index 3bb4be720bc0..a180cb1fe2db 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1413,11 +1413,52 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
continue;
}
 
+   address = pvmw.address;
+
+#ifdef CONFIG_THP_SWAP
+   /* PMD-mapped THP swap entry */
+   if (IS_ENABLED(CONFIG_THP_SWAP) &&

[PATCH -V6 21/21] swap: Update help of CONFIG_THP_SWAP

2018-10-10 Thread Huang Ying
The help text of CONFIG_THP_SWAP is updated to reflect the latest
progress of the THP (Transparent Huge Page) swap optimization.

Signed-off-by: "Huang, Ying" 
Reviewed-by: Dan Williams 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/Kconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 44f7d72010fd..63caae29ae2b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -419,8 +419,6 @@ config THP_SWAP
depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
help
  Swap transparent huge pages in one piece, without splitting.
- XXX: For now, swap cluster backing transparent huge page
- will be split after swapout.
 
  For selection by architectures with reasonable THP sizes.
 
-- 
2.16.4



[PATCH -V6 02/21] swap: Add __swap_duplicate_locked()

2018-10-10 Thread Huang Ying
The part of __swap_duplicate() that runs with the lock held is separated
into a new function, __swap_duplicate_locked(), because we will add more
logic about the PMD swap mapping into __swap_duplicate() and keep most of
the PTE swap mapping related logic in __swap_duplicate_locked().

This is just mechanical code refactoring; there is no functional change in
this patch.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/swapfile.c | 63 +--
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 97a1bd1a7c9a..6a570ef00fa7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3436,32 +3436,12 @@ void si_swapinfo(struct sysinfo *val)
spin_unlock(&swap_lock);
 }
 
-/*
- * Verify that a swap entry is valid and increment its swap map count.
- *
- * Returns error code in following case.
- * - success -> 0
- * - swp_entry is invalid -> EINVAL
- * - swp_entry is migration entry -> EINVAL
- * - swap-cache reference is requested but there is already one. -> EEXIST
- * - swap-cache reference is requested but the entry is not used. -> ENOENT
- * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
- */
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+static int __swap_duplicate_locked(struct swap_info_struct *p,
+  unsigned long offset, unsigned char usage)
 {
-   struct swap_info_struct *p;
-   struct swap_cluster_info *ci;
-   unsigned long offset;
unsigned char count;
unsigned char has_cache;
-   int err = -EINVAL;
-
-   p = get_swap_device(entry);
-   if (!p)
-   goto out;
-
-   offset = swp_offset(entry);
-   ci = lock_cluster_or_swap_info(p, offset);
+   int err = 0;
 
count = p->swap_map[offset];
 
@@ -3471,12 +3451,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned 
char usage)
 */
if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
err = -ENOENT;
-   goto unlock_out;
+   goto out;
}
 
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
-   err = 0;
 
if (usage == SWAP_HAS_CACHE) {
 
@@ -3503,11 +3482,39 @@ static int __swap_duplicate(swp_entry_t entry, unsigned 
char usage)
 
p->swap_map[offset] = count | has_cache;
 
-unlock_out:
+out:
+   return err;
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Returns error code in following case.
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
+ */
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+{
+   struct swap_info_struct *p;
+   struct swap_cluster_info *ci;
+   unsigned long offset;
+   int err = -EINVAL;
+
+   p = get_swap_device(entry);
+   if (!p)
+   goto out;
+
+   offset = swp_offset(entry);
+   ci = lock_cluster_or_swap_info(p, offset);
+   err = __swap_duplicate_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+
+   put_swap_device(p);
 out:
-   if (p)
-   put_swap_device(p);
return err;
 }
 
-- 
2.16.4



[PATCH -V6 16/21] swap: Free PMD swap mapping when zap_huge_pmd()

2018-10-10 Thread Huang Ying
For a PMD swap mapping, zap_huge_pmd() will clear the PMD and call
free_swap_and_cache() to decrease the swap reference count and maybe
free or split the huge swap cluster and the THP in swap cache.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 mm/huge_memory.c | 32 +---
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74c8621619cb..b9c766683ee1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2066,7 +2066,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), 
HPAGE_PMD_SIZE);
-   } else if (is_huge_zero_pmd(orig_pmd)) {
+   } else if (pmd_present(orig_pmd) && is_huge_zero_pmd(orig_pmd)) {
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
@@ -2079,17 +2079,27 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
-   } else if (thp_migration_supported()) {
-   swp_entry_t entry;
-
-   VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
-   entry = pmd_to_swp_entry(orig_pmd);
-   page = pfn_to_page(swp_offset(entry));
+   } else {
+   swp_entry_t entry = pmd_to_swp_entry(orig_pmd);
+
+   if (thp_migration_supported() &&
+   is_migration_entry(entry))
+   page = pfn_to_page(swp_offset(entry));
+   else if (IS_ENABLED(CONFIG_THP_SWAP) &&
+!non_swap_entry(entry))
+   free_swap_and_cache(entry, HPAGE_PMD_NR);
+   else {
+   WARN_ONCE(1,
+"Non present huge pmd without pmd migration or swap enabled!");
+   goto unlock;
+   }
flush_needed = 0;
-   } else
-   WARN_ONCE(1, "Non present huge pmd without pmd 
migration enabled!");
+   }
 
-   if (PageAnon(page)) {
+   if (!page) {
+   zap_deposited_table(tlb->mm, pmd);
+   add_mm_counter(tlb->mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+   } else if (PageAnon(page)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
@@ -2097,7 +2107,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(page), 
-HPAGE_PMD_NR);
}
-
+unlock:
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
-- 
2.16.4



[PATCH -V6 03/21] swap: Support PMD swap mapping in swap_duplicate()

2018-10-10 Thread Huang Ying
To support swapping in the THP in one piece, we need to create a PMD swap
mapping during swapout, and maintain the PMD swap mapping count.  This
patch implements the support to increase the PMD swap mapping
count (for swapout, fork, etc.) and to set the SWAP_HAS_CACHE flag (for
swapin, etc.) for a huge swap cluster in the swap_duplicate() function
family.  Although it only implements a part of the design of the swap
reference count with PMD swap mapping, the whole design is described
below to make it easy to understand the patch and the whole picture.

A huge swap cluster is used to hold the contents of a swapped-out THP.
After swapout, a PMD page mapping to the THP will become a PMD
swap mapping to the huge swap cluster via a swap entry in PMD.  While
a PTE page mapping to a subpage of the THP will become the PTE swap
mapping to a swap slot in the huge swap cluster via a swap entry in
PTE.

If there is no PMD swap mapping and the corresponding THP is removed
from the page cache (reclaimed), the huge swap cluster will be split
and become a normal swap cluster.

The count (cluster_count()) of the huge swap cluster is
SWAPFILE_CLUSTER (= HPAGE_PMD_NR) + PMD swap mapping count.  Because
all swap slots in the huge swap cluster are mapped by PTE or PMD, or
have the SWAP_HAS_CACHE bit set, the usage count of the swap cluster is
HPAGE_PMD_NR.  And the PMD swap mapping count is recorded too, to make
it easy to determine whether there are remaining PMD swap mappings.

The count in swap_map[offset] is the sum of the PTE and PMD swap mapping
counts.  This means that when we increase the PMD swap mapping count, we
need to increase swap_map[offset] for all swap slots inside the swap
cluster.  An alternative choice is to make swap_map[offset] record the
PTE swap map count only, given that we have recorded the PMD swap mapping
count in the count of the huge swap cluster.  But this would need to
increase swap_map[offset] when splitting the PMD swap mapping, which may
fail because of the memory allocation for swap count continuation.  That
is hard to deal with.  So we chose the current solution.

The PMD swap mapping to a huge swap cluster may be split when unmapping
a part of the PMD mapping, etc.  That is easy because only the count of
the huge swap cluster needs to be changed.  When the last PMD swap mapping
is gone and SWAP_HAS_CACHE is unset, we will split the huge swap
cluster (clear the huge flag).  This makes it easy to reason about the
cluster state.

A huge swap cluster will be split when splitting the THP in swap
cache, failing to allocate a THP during swapin, etc.  But when
splitting the huge swap cluster, we will not try to split all PMD swap
mappings, because we sometimes don't have enough information available
for that.  Later, when the PMD swap mapping is duplicated, swapped in,
etc., the PMD swap mapping will be split and fall back to the PTE
operation.

When a THP is added into swap cache, the SWAP_HAS_CACHE flag will be
set in the swap_map[offset] of all swap slots inside the huge swap
cluster backing the THP.  This huge swap cluster will not be split
unless the THP is split even if its PMD swap mapping count dropped to
0.  Later, when the THP is removed from swap cache, the SWAP_HAS_CACHE
flag will be cleared in the swap_map[offset] of all swap slots inside
the huge swap cluster.  And this huge swap cluster will be split if
its PMD swap mapping count is 0.

The first parameter of swap_duplicate() is changed to return the swap
entry to call add_swap_count_continuation() for, because we may need
to call it for a swap entry in the middle of a huge swap cluster.
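
As a rough illustration of the counting scheme described above, duplicating
one PMD swap mapping could look like the sketch below.  This is not code
from the patch: it assumes the cluster lock is held and omits error and
overflow (swap count continuation) handling.

/*
 * Illustrative sketch only: take one more PMD swap mapping reference
 * on the huge swap cluster starting at @offset.
 */
static void dup_pmd_swap_mapping_sketch(struct swap_info_struct *si,
					struct swap_cluster_info *ci,
					unsigned long offset)
{
	int i;

	/* swap_map[] holds the sum of the PTE and PMD swap map counts ... */
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		si->swap_map[offset + i]++;
	/* ... while cluster_count() is SWAPFILE_CLUSTER + PMD map count */
	cluster_set_count(ci, cluster_count(ci) + 1);
}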

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/swap.h |   9 +++--
 mm/memory.c  |   2 +-
 mm/rmap.c|   2 +-
 mm/swap_state.c  |   2 +-
 mm/swapfile.c| 109 ++-
 5 files changed, 99 insertions(+), 25 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 34de0d8bf4fa..984a652b9925 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -446,8 +446,8 @@ extern swp_entry_t get_swap_page_of_type(int);
 extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
-extern int swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
+extern int swap_duplicate(swp_entry_t *entry, int entry_size);
+extern int swapcache_prepare(swp_entry_t entry, int entry_size);
 extern void swap_free(swp_entry_t);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
 extern int free_swap_and_cache(swp_entry_t);
@@ -505,7 +505,8 @@ static inline void show_swap_cache_info(void)
 }
 
 #define free_swap_and_cache(e) ({(is_migration_entry(e) || 

[PATCH -V6 01/21] swap: Enable PMD swap operations for CONFIG_THP_SWAP

2018-10-10 Thread Huang Ying
Currently, "the swap entry" in the page tables is used for a number of
things outside of actual swap, like page migration, etc.  We support
the THP/PMD "swap entry" for page migration currently and the
functions behind this are tied to page migration's config
option (CONFIG_ARCH_ENABLE_THP_MIGRATION).

But, we also need them for THP swap optimization.  So a new config
option (CONFIG_HAVE_PMD_SWAP_ENTRY) is added.  It is enabled when
either CONFIG_ARCH_ENABLE_THP_MIGRATION or CONFIG_THP_SWAP is enabled.
And PMD swap entry functions are tied to this new config option
instead.  Some functions enabled by CONFIG_ARCH_ENABLE_THP_MIGRATION
are for page migration only; they remain enabled only for that.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 arch/x86/include/asm/pgtable.h |  2 +-
 include/asm-generic/pgtable.h  |  2 +-
 include/linux/swapops.h| 44 ++
 mm/Kconfig |  8 
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e805292..e830ab345551 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1333,7 +1333,7 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+#ifdef CONFIG_HAVE_PMD_SWAP_ENTRY
 static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
 {
return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 5657a20e0c59..eb1e9d17371b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -675,7 +675,7 @@ static inline void ptep_modify_prot_commit(struct mm_struct 
*mm,
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
-#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
+#ifndef CONFIG_HAVE_PMD_SWAP_ENTRY
 static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
 {
return pmd;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4d961668e5fc..905ddc65caa3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -254,17 +254,7 @@ static inline int is_write_migration_entry(swp_entry_t 
entry)
 
 #endif
 
-struct page_vma_mapped_walk;
-
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
-   struct page *page);
-
-extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
-   struct page *new);
-
-extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
-
+#ifdef CONFIG_HAVE_PMD_SWAP_ENTRY
 static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
 {
swp_entry_t arch_entry;
@@ -282,6 +272,28 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
return __swp_entry_to_pmd(arch_entry);
 }
+#else
+static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
+{
+   return swp_entry(0, 0);
+}
+
+static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
+{
+   return __pmd(0);
+}
+#endif
+
+struct page_vma_mapped_walk;
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+   struct page *page);
+
+extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
+   struct page *new);
+
+extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
 
 static inline int is_pmd_migration_entry(pmd_t pmd)
 {
@@ -302,16 +314,6 @@ static inline void remove_migration_pmd(struct 
page_vma_mapped_walk *pvmw,
 
 static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
 
-static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
-{
-   return swp_entry(0, 0);
-}
-
-static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
-{
-   return __pmd(0);
-}
-
 static inline int is_pmd_migration_entry(pmd_t pmd)
 {
return 0;
diff --git a/mm/Kconfig b/mm/Kconfig
index b1006cdf3aff..44f7d72010fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -424,6 +424,14 @@ config THP_SWAP
 
  For selection by architectures with reasonable THP sizes.
 
+#
+# "PMD swap entry" in the page table is used both for migration and
+# actual swap.
+#
+config HAVE_PMD_SWAP_ENTRY
+   def_bool y
+   depends on THP_SWAP || ARCH_ENABLE_THP_MIGRATION
+
 config TRANSPARENT_HUGE_PAGECACHE
def_bool y
depends on TRANSPARENT_HUGEPAGE
-- 
2.16.4



[PATCH -V6 14/21] swap: Support to move swap account for PMD swap mapping

2018-10-10 Thread Huang Ying
Previously, the huge swap cluster would be split after the THP was
swapped out.  Now, to support swapping in the THP in one piece, the huge
swap cluster will not be split after the THP is reclaimed.  So in
memcg, we need to move the swap account for PMD swap mappings in the
process's page table.

When the page table is scanned while moving the memcg charge, the PMD
swap mapping will be identified.  And mem_cgroup_move_swap_account()
and its callees are revised to move the account for the whole huge swap
cluster.  If the swap cluster mapped by the PMD has been split, the PMD
swap mapping will be split as well and processing falls back to the PTE
path.
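
As a rough sketch of the intended flow when the charge-moving page table
scan finds a PMD swap mapping (illustrative only, not code from this patch;
helper names and signatures are approximate, and get_swap_entry_size() and
split_huge_swap_pmd() are introduced elsewhere in this series):

	/* Illustrative sketch only. */
	if (is_swap_pmd(*pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);

		if (get_swap_entry_size(entry) == HPAGE_PMD_NR)
			/* move the swap account of the whole huge cluster */
			mem_cgroup_move_swap_account(entry, from, to);
		else
			/* cluster already split: split the PMD swap mapping
			 * too and fall back to the PTE processing */
			split_huge_swap_pmd(vma, pmd, addr, *pmd);
	}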

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |   9 
 include/linux/swap.h|   6 +++
 include/linux/swap_cgroup.h |   3 +-
 mm/huge_memory.c|   8 +--
 mm/memcontrol.c | 129 ++--
 mm/swap_cgroup.c|  45 +---
 mm/swapfile.c   |  14 +
 7 files changed, 174 insertions(+), 40 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1927b2edb74a..e573774f9014 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -376,6 +376,9 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_THP_SWAP
+extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd);
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
@@ -403,6 +406,12 @@ static inline bool transparent_hugepage_swapin_enabled(
return false;
 }
 #else /* CONFIG_THP_SWAP */
+static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
+unsigned long haddr,
+pmd_t *pmd)
+{
+}
+
 static inline int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  unsigned long address, pmd_t orig_pmd)
 {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f2daf3fbdd4b..1210f70f72bc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -617,6 +617,7 @@ static inline swp_entry_t get_swap_page(struct page *page)
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry, unsigned long flags);
 extern int split_swap_cluster_map(swp_entry_t entry);
+extern int get_swap_entry_size(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry, unsigned long flags)
 {
@@ -627,6 +628,11 @@ static inline int split_swap_cluster_map(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int get_swap_entry_size(swp_entry_t entry)
+{
+   return 1;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index a12dd1c3966c..c40fb52b0563 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -7,7 +7,8 @@
 #ifdef CONFIG_MEMCG_SWAP
 
 extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
-   unsigned short old, unsigned short new);
+   unsigned short old, unsigned short new,
+   unsigned int nr_ents);
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 unsigned int nr_ents);
 extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 60b4105734b1..ebd043528309 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1678,10 +1678,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+#ifdef CONFIG_THP_SWAP
 /* Convert a PMD swap mapping to a set of PTE swap mappings */
-static void __split_huge_swap_pmd(struct vm_area_struct *vma,
- unsigned long haddr,
- pmd_t *pmd)
+void __split_huge_swap_pmd(struct vm_area_struct *vma,
+  unsigned long haddr,
+  pmd_t *pmd)
 {
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
@@ -1712,7 +1713,6 @@ static void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
pmd_populate(mm, pmd, pgtable);
 }
 
-#ifdef CONFIG_THP_SWAP
 int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, pmd_t orig_pmd)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7bebe

[PATCH -V6 05/21] swap: Support PMD swap mapping in free_swap_and_cache()/swap_free()

2018-10-10 Thread Huang Ying
When a PMD swap mapping is removed from a huge swap cluster, for
example when unmapping a memory range mapped with a PMD swap mapping,
free_swap_and_cache() will be called to decrease the reference count
of the huge swap cluster.  free_swap_and_cache() may also free or
split the huge swap cluster, and free the corresponding THP in swap
cache if necessary.  swap_free() is similar, and shares most of its
implementation with free_swap_and_cache().  This patch revises
free_swap_and_cache() and swap_free() to implement this.

If the swap cluster has been split already, for example because of
failing to allocate a THP during swapin, we just decrease the reference
count of every swap slot by one.

Otherwise, we will decrease the reference count of every swap slot by
one and decrease the PMD swap mapping count in cluster_count().  When
the corresponding THP isn't in swap cache, if the PMD swap mapping count
becomes 0, the huge swap cluster will be split, and if all swap counts
become 0, the huge swap cluster will be freed.  When the corresponding
THP is in swap cache, if every swap_map[offset] == SWAP_HAS_CACHE, we
will try to delete the THP from the swap cache, which will cause the THP
and the huge swap cluster to be freed.
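
The "otherwise" path above, as a minimal sketch (not the patch code; it
assumes the cluster lock is held and ignores the swap cache case and error
handling):

	/* Illustrative sketch only: drop one PMD swap mapping reference. */
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		si->swap_map[offset + i]--;		/* PTE + PMD map count */
	cluster_set_count(ci, cluster_count(ci) - 1);	/* PMD map count */

	/* cluster_count() == SWAPFILE_CLUSTER means no PMD mapping is left */
	if (cluster_count(ci) == SWAPFILE_CLUSTER)
		cluster_clear_huge(ci);			/* split the cluster */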

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 arch/s390/mm/pgtable.c |   2 +-
 include/linux/swap.h   |   9 +--
 kernel/power/swap.c|   4 +-
 mm/madvise.c   |   2 +-
 mm/memory.c|   4 +-
 mm/shmem.c |   6 +-
 mm/swapfile.c  | 171 ++---
 7 files changed, 149 insertions(+), 49 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index f2cc7da473e4..ffd4b68adbb3 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -675,7 +675,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, 
swp_entry_t entry)
 
dec_mm_counter(mm, mm_counter(page));
}
-   free_swap_and_cache(entry);
+   free_swap_and_cache(entry, 1);
 }
 
 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 984a652b9925..e79d7aead142 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -448,9 +448,9 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t *entry, int entry_size);
 extern int swapcache_prepare(swp_entry_t entry, int entry_size);
-extern void swap_free(swp_entry_t);
+extern void swap_free(swp_entry_t entry, int entry_size);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
-extern int free_swap_and_cache(swp_entry_t);
+extern int free_swap_and_cache(swp_entry_t entry, int entry_size);
 extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
@@ -504,7 +504,8 @@ static inline void show_swap_cache_info(void)
 {
 }
 
-#define free_swap_and_cache(e) ({(is_migration_entry(e) || 
is_device_private_entry(e));})
+#define free_swap_and_cache(e, s)  \
+   ({(is_migration_entry(e) || is_device_private_entry(e)); })
 #define swapcache_prepare(e, s)
\
({(is_migration_entry(e) || is_device_private_entry(e)); })
 
@@ -522,7 +523,7 @@ static inline int swap_duplicate(swp_entry_t *swp, int 
entry_size)
return 0;
 }
 
-static inline void swap_free(swp_entry_t swp)
+static inline void swap_free(swp_entry_t swp, int entry_size)
 {
 }
 
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7f6c1a288d3..0275df84ed3d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -182,7 +182,7 @@ sector_t alloc_swapdev_block(int swap)
offset = swp_offset(get_swap_page_of_type(swap));
if (offset) {
if (swsusp_extents_insert(offset))
-   swap_free(swp_entry(swap, offset));
+   swap_free(swp_entry(swap, offset), 1);
else
return swapdev_block(swap, offset);
}
@@ -206,7 +206,7 @@ void free_all_swap_pages(int swap)
ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
-   swap_free(swp_entry(swap, offset));
+   swap_free(swp_entry(swap, offset), 1);
 
kfree(ext);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 9d802566c494..50282ba862e2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -349,7 +349,7 @@ static int madvise_free_pte

[PATCH -V6 00/21] swap: Swapout/swapin THP in one piece

2018-10-10 Thread Huang Ying
Hi, Andrew, could you help me to check whether the overall design is
reasonable?

Hi, Hugh, Shaohua, Minchan and Rik, could you help me to review the
swap part of the patchset?  Especially [02/21], [03/21], [04/21],
[05/21], [06/21], [07/21], [08/21], [09/21], [10/21], [11/21],
[12/21], [20/21], [21/21].

Hi, Andrea and Kirill, could you help me to review the THP part of the
patchset?  Especially [01/21], [07/21], [09/21], [11/21], [13/21],
[15/21], [16/21], [17/21], [18/21], [19/21], [20/21].

Hi, Johannes and Michal, could you help me to review the cgroup part
of the patchset?  Especially [14/21].

And for all: any comment is welcome!

This patchset is based on the 2018-10-3 head of mmotm/master.

This is the final step of the THP (Transparent Huge Page) swap
optimization.  After the first and second steps, the splitting of the
huge page is delayed from almost the first step of swapout to after
swapout has finished.  In this step, we avoid splitting the THP for
swapout and swapout/swapin the THP in one piece.

We tested the patchset with the vm-scalability benchmark swap-w-seq test
case, with 16 processes.  The test case forks 16 processes.  Each
process allocates a large anonymous memory range, and writes it from
beginning to end for 8 rounds.  The first round will swap out, while the
remaining rounds will swap in and swap out.  The test is done on a Xeon
E5 v3 system; the swap device used is a RAM-simulated PMEM (persistent
memory) device.  The test result is as follows:

          base                 change        optimized
   1417897 ±  2%             +992.8%         15494673           vm-scalability.throughput
   1020489 ±  4%            +1091.2%         12156349           vmstat.swap.si
   1255093 ±  3%             +940.3%         13056114           vmstat.swap.so
   1259769 ±  7%            +1818.3%         24166779           meminfo.AnonHugePages
  28021761                    -10.7%         25018848 ±  2%     meminfo.AnonPages
  64080064 ±  4%              -95.6%          2787565 ± 33%     interrupts.CAL:Function_call_interrupts
     13.91 ±  5%              -13.8              0.10 ± 27%     perf-profile.children.cycles-pp.native_queued_spin_lock_slowpath

Here, the score of the benchmark (bytes written per second) improved by
992.8%.  The swapout/swapin throughput improved by 1008% (from about
2.17GB/s to 24.04GB/s).  The performance difference is huge.  In the base
kernel, for the first round of writing, the THP is swapped out and split,
so in the remaining rounds there is only normal page swapin and
swapout.  In the optimized kernel, the THP is kept after the first
swapout, so THP swapin and swapout are used in the remaining rounds.
This shows the key benefit of swapping out/in the THP in one piece: the
THP will be kept instead of being split.  The meminfo information verified
this: in the base kernel only 4.5% of anonymous pages are THP during the
test, while in the optimized kernel that is 96.6%.  The TLB flushing IPIs
(represented as interrupts.CAL:Function_call_interrupts) were reduced by
95.6%, while cycles for spinlocks were reduced from 13.9% to 0.1%.  These
are performance benefits of THP swapout/swapin too.

Below is the description for all steps of THP swap optimization.

Recently, the performance of storage devices has improved so fast that
we cannot saturate the disk bandwidth with a single logical CPU when
doing page swapping, even on a high-end server machine, because the
performance of the storage device improved faster than that of a single
logical CPU.  And it seems that the trend will not change in the near
future.  On the other hand, THP becomes more and more popular
because of increased memory sizes.  So it becomes necessary to optimize
THP swap performance.

The advantages to swapout/swapin a THP in one piece include:

- Batch various swap operations for the THP.  Many operations need to
  be done once per THP instead of per normal page, for example,
  allocating/freeing the swap space, writing/reading the swap space,
  flushing TLB, page fault, etc.  This will improve the performance of
  the THP swap greatly.

- The THP swap space read/write will be large sequential IO (2M on
  x86_64).  It is particularly helpful for swapin, which is usually
  4k random IO.  This will improve the performance of THP swap too.

- It will help with memory fragmentation, especially when THP is
  heavily used by the applications.  THP-order pages will be freed
  up after THP swapout.

- It will improve the THP utilization on systems with swap turned on,
  because the speed at which khugepaged collapses normal pages into a
  THP is quite slow.  After the THP is split during swapout, it will
  take quite a long time for the normal pages to collapse back into a
  THP after being swapped in.  High THP utilization helps the
  efficiency of page-based memory management too.

There are some concerns regarding THP swapin, mainly because possible
enlarged read/write IO size (for swapout/swapin) may put more overhead
on the storage device.  T

[PATCH -V6 06/21] swap: Support PMD swap mapping when splitting huge PMD

2018-10-10 Thread Huang Ying
A huge PMD needs to be split when zapping a part of the PMD mapping,
etc.  If the PMD mapping is a swap mapping, we need to split it too.
This patch implements the support for this.  This is similar to splitting
the PMD page mapping, except we need to decrease the PMD swap mapping
count for the huge swap cluster too.  If the PMD swap mapping count
becomes 0, the huge swap cluster will be split.

Notice: is_huge_zero_pmd() and pmd_page() don't work well with a swap
PMD, so a pmd_present() check is done before calling them.
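
A minimal illustration of that check ordering (a sketch only, not code from
the patch):

/*
 * Illustrative sketch only: interpret the PMD as a mapped page (huge
 * zero page check, pmd_page(), etc.) only after pmd_present().
 */
static bool pmd_is_huge_zero_sketch(pmd_t pmd)
{
	return pmd_present(pmd) && is_huge_zero_pmd(pmd);
}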

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/linux/huge_mm.h |  4 
 include/linux/swap.h|  6 ++
 mm/huge_memory.c| 48 +++-
 mm/swapfile.c   | 32 
 4 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 99c19b06d9a4..0f3e1739986f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,10 @@ static inline bool is_huge_zero_page(struct page *page)
return READ_ONCE(huge_zero_page) == page;
 }
 
+/*
+ * is_huge_zero_pmd() must be called after checking pmd_present(),
+ * otherwise, it may report false positive for PMD swap entry.
+ */
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
return is_huge_zero_page(pmd_page(pmd));
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e79d7aead142..9bb3f73b5d68 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -614,11 +614,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #ifdef CONFIG_THP_SWAP
 extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
 #else
 static inline int split_swap_cluster(swp_entry_t entry)
 {
return 0;
 }
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bae21d3e88cf..9f1c74487576 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1624,6 +1624,40 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
return 0;
 }
 
+/* Convert a PMD swap mapping to a set of PTE swap mappings */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{
+   struct mm_struct *mm = vma->vm_mm;
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   swp_entry_t entry;
+   int i, soft_dirty;
+
+   entry = pmd_to_swp_entry(*pmd);
+   soft_dirty = pmd_soft_dirty(*pmd);
+
+   split_swap_cluster_map(entry);
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+   pte_t *pte, ptent;
+
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   ptent = swp_entry_to_pte(entry);
+   if (soft_dirty)
+   ptent = pte_swp_mksoft_dirty(ptent);
+   set_pte_at(mm, haddr, pte, ptent);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+}
+
 /*
  * Return true if we do MADV_FREE successfully on entire pmd page.
  * Otherwise, return false.
@@ -2090,7 +2124,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-   VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+   VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
 
count_vm_event(THP_SPLIT_PMD);
@@ -2114,7 +2148,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
-   } else if (is_huge_zero_pmd(*pmd)) {
+   } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
 * FIXME: Do we want to invalidate secondary mmu by calling
 * mmu_notifier_invalidate_range() see comments below inside
@@ -2158,6 +2192,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
 #endif
+   if (IS_ENABLED(CONFIG_THP_SWAP) && is_swap_pmd(old_pmd))
+   

[PATCH -V6 12/21] swap: Support PMD swap mapping in swapoff

2018-10-10 Thread Huang Ying
During swapoff, for a huge swap cluster, we need to allocate a THP,
read the cluster's contents into the THP and unuse the PMD and PTE swap
mappings to it.  If we fail to allocate a THP, the huge swap cluster
will be split.

During unuse, if it is found that the swap cluster mapped by a PMD
swap mapping has already been split, we will split the PMD swap mapping
and unuse the PTEs.
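
As a rough sketch of that flow (illustrative pseudocode only, not the patch
code; cluster_already_split() and alloc_and_read_huge_page() are
hypothetical placeholder names, and signatures are approximate):

	/* Illustrative sketch only: unuse one PMD swap mapping at swapoff. */
	if (cluster_already_split(entry)) {		/* hypothetical check */
		/* split the PMD swap mapping and unuse the PTEs instead */
		split_huge_swap_pmd(vma, pmd, addr, orig_pmd);
	} else {
		page = alloc_and_read_huge_page(entry);	/* hypothetical helper */
		if (!page)
			/* cannot get a THP: split the huge swap cluster and
			 * fall back to unusing normal pages */
			split_swap_cluster(entry, 0);
		else
			unuse_pmd(vma, pmd, addr, entry, page);
	}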

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 include/asm-generic/pgtable.h | 14 +--
 include/linux/huge_mm.h   |  8 
 mm/huge_memory.c  |  4 +-
 mm/swapfile.c | 86 ++-
 4 files changed, 97 insertions(+), 15 deletions(-)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index eb1e9d17371b..d64cef2bff04 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -931,22 +931,12 @@ static inline int 
pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
barrier();
 #endif
/*
-* !pmd_present() checks for pmd migration entries
-*
-* The complete check uses is_pmd_migration_entry() in linux/swapops.h
-* But using that requires moving current function and 
pmd_trans_unstable()
-* to linux/swapops.h to resovle dependency, which is too much code 
move.
-*
-* !pmd_present() is equivalent to is_pmd_migration_entry() currently,
-* because !pmd_present() pages can only be under migration not swapped
-* out.
-*
-* pmd_none() is preseved for future condition checks on pmd migration
+* pmd_none() is preseved for future condition checks on pmd swap
 * entries and not confusing with this function name, although it is
 * redundant with !pmd_present().
 */
if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
-   (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && 
!pmd_present(pmdval)))
+   (IS_ENABLED(CONFIG_HAVE_PMD_SWAP_ENTRY) && !pmd_present(pmdval)))
return 1;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a13cd19b6047..1927b2edb74a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -376,6 +376,8 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_THP_SWAP
+extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+  unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 static inline bool transparent_hugepage_swapin_enabled(
@@ -401,6 +403,12 @@ static inline bool transparent_hugepage_swapin_enabled(
return false;
 }
 #else /* CONFIG_THP_SWAP */
+static inline int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address, pmd_t orig_pmd)
+{
+   return 0;
+}
+
 static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0ccb1b78d661..0ec71f907fa5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1713,8 +1713,8 @@ static void __split_huge_swap_pmd(struct vm_area_struct 
*vma,
 }
 
 #ifdef CONFIG_THP_SWAP
-static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-  unsigned long address, pmd_t orig_pmd)
+int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+   unsigned long address, pmd_t orig_pmd)
 {
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2ca013df35e1..93b6a5d4e44a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1931,6 +1931,11 @@ static inline int pte_same_as_swp(pte_t pte, pte_t 
swp_pte)
return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
 }
 
+static inline int pmd_same_as_swp(pmd_t pmd, pmd_t swp_pmd)
+{
+   return pmd_same(pmd_swp_clear_soft_dirty(pmd), swp_pmd);
+}
+
 /*
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
@@ -1992,6 +1997,53 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t 
*pmd,
return ret;
 }
 
+#ifdef CONFIG_THP_SWAP
+static int unuse_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+unsigned long addr, swp_entry_t entry, struct page *page)
+{
+   struct mem_cgroup *memcg;
+   spinlock_t *ptl;
+   int ret = 1;
+
+   if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &

[PATCH -V6 11/21] swap: Add sysfs interface to configure THP swapin

2018-10-10 Thread Huang Ying
Swapping in a THP as a whole isn't desirable in some situations.  For
example, for a completely random access pattern, swapping in a THP in
one piece will inflate the read IO greatly.  So a sysfs interface,
/sys/kernel/mm/transparent_hugepage/swapin_enabled, is added to
configure it.  The following three options are provided,

- always: THP swapin will be enabled always

- madvise: THP swapin will be enabled only for VMA with VM_HUGEPAGE
  flag set.

- never: THP swapin will be disabled always

The default configuration is: madvise.

During page fault, if a PMD swap mapping is found and THP swapin is
disabled, the huge swap cluster and the PMD swap mapping will be split
and fallback to normal page swapin.

Signed-off-by: "Huang, Ying" 
Cc: "Kirill A. Shutemov" 
Cc: Andrea Arcangeli 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dave Hansen 
Cc: Naoya Horiguchi 
Cc: Zi Yan 
Cc: Daniel Jordan 
---
 Documentation/admin-guide/mm/transhuge.rst | 21 +++
 include/linux/huge_mm.h| 31 ++
 mm/huge_memory.c   | 94 --
 3 files changed, 127 insertions(+), 19 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst 
b/Documentation/admin-guide/mm/transhuge.rst
index 85e33f785fd7..23aefb17101c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -160,6 +160,27 @@ Some userspace (such as a test program, or an optimized 
memory allocation
 
cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 
+Transparent hugepage may be swapout and swapin in one piece without
+splitting.  This will improve the utility of transparent hugepage but
+may inflate the read/write too.  So whether to enable swapin
+transparent hugepage in one piece can be configured as follow.
+
+   echo always >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+   echo madvise >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+   echo never >/sys/kernel/mm/transparent_hugepage/swapin_enabled
+
+always
+   Attempt to allocate a transparent huge page and read it from
+   swap space in one piece every time.
+
+never
+   Always split the swap space and PMD swap mapping and swapin
+   the fault normal page during swapin.
+
+madvise
+   Only swapin the transparent huge page in one piece for
+   MADV_HUGEPAGE madvise regions.
+
 khugepaged will be automatically started when
 transparent_hugepage/enabled is set to "always" or "madvise, and it'll
 be automatically shutdown if it's set to "never".
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d88579cb059a..a13cd19b6047 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -63,6 +63,8 @@ enum transparent_hugepage_flag {
 #ifdef CONFIG_DEBUG_VM
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
 #endif
+   TRANSPARENT_HUGEPAGE_SWAPIN_FLAG,
+   TRANSPARENT_HUGEPAGE_SWAPIN_REQ_MADV_FLAG,
 };
 
 struct kobject;
@@ -375,11 +377,40 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct 
vm_area_struct *vma,
 
 #ifdef CONFIG_THP_SWAP
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+
+static inline bool transparent_hugepage_swapin_enabled(
+   struct vm_area_struct *vma)
+{
+   if (vma->vm_flags & VM_NOHUGEPAGE)
+   return false;
+
+   if (is_vma_temporary_stack(vma))
+   return false;
+
+   if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+   return false;
+
+   if (transparent_hugepage_flags &
+   (1 << TRANSPARENT_HUGEPAGE_SWAPIN_FLAG))
+   return true;
+
+   if (transparent_hugepage_flags &
+   (1 << TRANSPARENT_HUGEPAGE_SWAPIN_REQ_MADV_FLAG))
+   return !!(vma->vm_flags & VM_HUGEPAGE);
+
+   return false;
+}
 #else /* CONFIG_THP_SWAP */
 static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
return 0;
 }
+
+static inline bool transparent_hugepage_swapin_enabled(
+   struct vm_area_struct *vma)
+{
+   return false;
+}
 #endif /* CONFIG_THP_SWAP */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8efcc84fb4b0..0ccb1b78d661 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -57,7 +57,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 #endif
(1<address);
if (!page) {
+   if (!transparent_hugepage_swapin_enabled(vma))
+   goto split;
+
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma,
 haddr, false);
if (!page) {
@@ -1702,24 +1756,8 @@ int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t 
orig_pmd)
  

Re: [PATCH 1/2] mm: use kvzalloc for swap_info_struct allocation

2018-11-04 Thread Huang, Ying
Vasily Averin  writes:

> commit a2468cc9bfdf ("swap: choose swap device according to numa node")
> increased size of swap_info_struct up to 44 Kbytes, now it requires
> 4th order page.

Why could swap_info_struct be so large?  Because MAX_NUMNODES could be
thousands, so that the 'avail_lists' field could be tens of KB?  If so, I
think it's fair to use kvzalloc().  Can you add a one-line comment?
Because struct swap_info_struct is quite small in the default
configuration.

Best Regards,
Huang, Ying

> Switching to kvzalloc() avoids unexpected allocation failures.
>
> Signed-off-by: Vasily Averin 
> ---
>  mm/swapfile.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 644f746e167a..8688ae65ef58 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -2813,7 +2813,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>   unsigned int type;
>   int i;
>  
> - p = kzalloc(sizeof(*p), GFP_KERNEL);
> + p = kvzalloc(sizeof(*p), GFP_KERNEL);
>   if (!p)
>   return ERR_PTR(-ENOMEM);
>  
> @@ -2824,7 +2824,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>   }
>   if (type >= MAX_SWAPFILES) {
>   spin_unlock(&swap_lock);
> - kfree(p);
> + kvfree(p);
>   return ERR_PTR(-EPERM);
>   }
>   if (type >= nr_swapfiles) {
> @@ -2838,7 +2838,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>   smp_wmb();
>   nr_swapfiles++;
>   } else {
> - kfree(p);
> + kvfree(p);
>   p = swap_info[type];
>   /*
>* Do not memset this entry: a racing procfs swap_next()


Re: [PATCH 2/2] mm: avoid unnecessary swap_info_struct allocation

2018-11-04 Thread Huang, Ying
Vasily Averin  writes:

> Currently a newly allocated swap_info_struct can be quickly freed.
> This patch avoids unnecessary high-order page allocation and helps
> to decrease the memory pressure.

I think swapon/swapoff are rare operations, so it will not increase the
memory pressure much.  

Best Regards,
Huang, Ying

> Signed-off-by: Vasily Averin 
> ---
>  mm/swapfile.c | 18 +-
>  1 file changed, 13 insertions(+), 5 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 8688ae65ef58..53ec2f0cdf26 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -2809,14 +2809,17 @@ late_initcall(max_swapfiles_check);
>  
>  static struct swap_info_struct *alloc_swap_info(void)
>  {
> - struct swap_info_struct *p;
> + struct swap_info_struct *p = NULL;
>   unsigned int type;
>   int i;
> + bool force_alloc = false;
>  
> - p = kvzalloc(sizeof(*p), GFP_KERNEL);
> - if (!p)
> - return ERR_PTR(-ENOMEM);
> -
> +retry:
> + if (force_alloc) {
> + p = kvzalloc(sizeof(*p), GFP_KERNEL);
> + if (!p)
> + return ERR_PTR(-ENOMEM);
> + }
>   spin_lock(&swap_lock);
>   for (type = 0; type < nr_swapfiles; type++) {
>   if (!(swap_info[type]->flags & SWP_USED))
> @@ -2828,6 +2831,11 @@ static struct swap_info_struct *alloc_swap_info(void)
>   return ERR_PTR(-EPERM);
>   }
>   if (type >= nr_swapfiles) {
> + if (!force_alloc) {
> + force_alloc = true;
> + spin_unlock(&swap_lock);
> + goto retry;
> + }
>   p->type = type;
>   swap_info[type] = p;
>   /*


Re: [PATCH 1/2] mm: use kvzalloc for swap_info_struct allocation

2018-11-04 Thread Huang, Ying
Vasily Averin  writes:

> On 11/5/18 3:50 AM, Huang, Ying wrote:
>> Vasily Averin  writes:
>> 
>>> commit a2468cc9bfdf ("swap: choose swap device according to numa node")
>>> increased size of swap_info_struct up to 44 Kbytes, now it requires
>>> 4th order page.
>> 
>> Why swap_info_struct could be so large?  Because MAX_NUMNODES could be
>> thousands so that 'avail_lists' field could be tens KB?  If so, I think
>> it's fair to use kvzalloc().  Can you add one line comment?  Because
>> struct swap_info_struct is quite small in default configuration.
>
> I was incorrect: not 44 KB but 40 KB should be here.
> We have found CONFIG_NODES_SHIFT=10 in new RHEL7 update 6 kernel,
> default ubuntu kernels have the same setting too.
>
> crash> struct swap_info_struct -o
> struct swap_info_struct {
>   [0] unsigned long flags;
>   [8] short prio;
>...
> [140] spinlock_t lock;
> [144] struct plist_node list;
> [184] struct plist_node avail_lists[1024]; <<<< here
>   [41144] struct swap_cluster_info *cluster_info;
>   [41152] struct swap_cluster_list free_clusters;
>   ...
>   [41224] spinlock_t cont_lock;
> }
> SIZE: 41232
>
> struct swap_info_struct {
> ...
> RH_KABI_EXTEND(struct plist_node avail_lists[MAX_NUMNODES]) /* entry 
> in swap_avail_head */
> ...
> }
>
> #define MAX_NUMNODES(1 << NODES_SHIFT)
>
> #ifdef CONFIG_NODES_SHIFT 
> #define NODES_SHIFT CONFIG_NODES_SHIFT
> #else
> #define NODES_SHIFT 0
> #endif
>
> /boot/config-4.15.0-38-generic:CONFIG_NODES_SHIFT=10

I see.  So this is a more practical issue than I originally imagined.

But for default config, I mean

$ make defconfig

And it turns out,

CONFIG_NODES_SHIFT=6
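
For reference, a rough back-of-the-envelope based on the struct layout
quoted above (avail_lists spans offsets 184..41144, i.e. about 40 bytes per
struct plist_node; exact sizes depend on config and architecture):

	NODES_SHIFT=10: (1 << 10) * 40 = 40960 bytes -> ~41 KB struct,
			an order-4 allocation as noted in the changelog
	NODES_SHIFT=6:  (1 << 6)  * 40 =  2560 bytes -> the struct fits
			in a single page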

Best Regards,
Huang, Ying


Re: [PATCH -V6 00/21] swap: Swapout/swapin THP in one piece

2018-10-23 Thread Huang, Ying
Hi, Daniel,

Daniel Jordan  writes:

> On Wed, Oct 10, 2018 at 03:19:03PM +0800, Huang Ying wrote:
>> And for all, Any comment is welcome!
>> 
>> This patchset is based on the 2018-10-3 head of mmotm/master.
>
> There seems to be some infrequent memory corruption with THPs that have been
> swapped out: page contents differ after swapin.

Thanks a lot for testing this!  I know there was a big effort behind this
and it definitely will improve the quality of the patchset greatly!

> Reproducer at the bottom.  Part of some tests I'm writing, had to separate it 
> a
> little hack-ily.  Basically it writes the word offset _at_ each word offset in
> a memory blob, tries to push it to swap, and verifies the offset is the same
> after swapin.
>
> I ran with THP enabled=always.  THP swapin_enabled could be always or never, 
> it
> happened with both.  Every time swapping occurred, a single THP-sized chunk in
> the middle of the blob had different offsets.  Example:
>
> ** > word corruption gap
> ** corruption detected 14929920 bytes in (got 15179776, expected 14929920) **
> ** corruption detected 14929928 bytes in (got 15179784, expected 14929928) **
> ** corruption detected 14929936 bytes in (got 15179792, expected 14929936) **
> ...pattern continues...
> ** corruption detected 17027048 bytes in (got 15179752, expected 17027048) **
> ** corruption detected 17027056 bytes in (got 15179760, expected 17027056) **
> ** corruption detected 17027064 bytes in (got 15179768, expected 17027064) **

15179776 < 15179xxx <= 17027064

15179776 % 4096 = 0

And 15179776 = 15179768 + 8

So I guess we have some alignment bug.  Could you try the patch
attached?  It deals with an alignment issue.

> 100.0% of memory was swapped out at mincore time
> 0.00305% of pages were corrupted (first corrupt word 14929920, last corrupt 
> word 17027064)
>
> The problem goes away with THP enabled=never, and I don't see it on 2018-10-3
> mmotm/master with THP enabled=always.
>
> The server had an NVMe swap device and ~760G memory over two nodes, and the
> program was always run like this:  swap-verify -s $((64 * 2**30))
>
> The kernels had one extra patch, Alexander Duyck's
> "dma-direct: Fix return value of dma_direct_supported", which was required to
> get them to build.
>

Thanks again!

Best Regards,
Huang, Ying

-->8-
From e1c3e4f565deeb8245bdc4ee53a1f1e4188b6d4a Mon Sep 17 00:00:00 2001
From: Huang Ying 
Date: Wed, 24 Oct 2018 11:24:15 +0800
Subject: [PATCH] Fix alignment bug

---
 include/linux/huge_mm.h | 6 ++
 mm/huge_memory.c| 9 -
 mm/swap_state.c | 2 +-
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 96baae08f47c..e7b3527bc493 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -379,8 +379,7 @@ struct page_vma_mapped_walk;
 
 #ifdef CONFIG_THP_SWAP
 extern void __split_huge_swap_pmd(struct vm_area_struct *vma,
- unsigned long haddr,
- pmd_t *pmd);
+ unsigned long addr, pmd_t *pmd);
 extern int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   unsigned long address, pmd_t orig_pmd);
 extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
@@ -411,8 +410,7 @@ static inline bool transparent_hugepage_swapin_enabled(
 }
 #else /* CONFIG_THP_SWAP */
 static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
-unsigned long haddr,
-pmd_t *pmd)
+unsigned long addr, pmd_t *pmd)
 {
 }
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ed64266b63dc..b2af3bff7624 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1731,10 +1731,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, 
pmd_t pmd)
 #ifdef CONFIG_THP_SWAP
 /* Convert a PMD swap mapping to a set of PTE swap mappings */
 void __split_huge_swap_pmd(struct vm_area_struct *vma,
-  unsigned long haddr,
+  unsigned long addr,
   pmd_t *pmd)
 {
struct mm_struct *mm = vma->vm_mm;
+   unsigned long haddr = addr & HPAGE_PMD_MASK;
pgtable_t pgtable;
pmd_t _pmd;
swp_entry_t entry;
@@ -1772,7 +1773,7 @@ int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
 
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, orig_pmd))
-   __split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd);
+   __split_huge_swap_pmd(vma, address, pmd);
else
ret = -ENOENT;
spi

Re: [PATCH -V6 00/21] swap: Swapout/swapin THP in one piece

2018-10-24 Thread Huang, Ying
Daniel Jordan  writes:

> On Wed, Oct 24, 2018 at 11:31:42AM +0800, Huang, Ying wrote:
>> Hi, Daniel,
>> 
>> Daniel Jordan  writes:
>> 
>> > On Wed, Oct 10, 2018 at 03:19:03PM +0800, Huang Ying wrote:
>> >> And for all, Any comment is welcome!
>> >> 
>> >> This patchset is based on the 2018-10-3 head of mmotm/master.
>> >
>> > There seems to be some infrequent memory corruption with THPs that have 
>> > been
>> > swapped out: page contents differ after swapin.
>> 
>> Thanks a lot for testing this!  I know there were big effort behind this
>> and it definitely will improve the quality of the patchset greatly!
>
> You're welcome!  Hopefully I'll have more results and tests to share in the
> next two weeks.
>
>> 
>> > Reproducer at the bottom.  Part of some tests I'm writing, had to separate 
>> > it a
>> > little hack-ily.  Basically it writes the word offset _at_ each word 
>> > offset in
>> > a memory blob, tries to push it to swap, and verifies the offset is the 
>> > same
>> > after swapin.
>> >
>> > I ran with THP enabled=always.  THP swapin_enabled could be always or 
>> > never, it
>> > happened with both.  Every time swapping occurred, a single THP-sized 
>> > chunk in
>> > the middle of the blob had different offsets.  Example:
>> >
>> > ** > word corruption gap
>> > ** corruption detected 14929920 bytes in (got 15179776, expected 14929920) 
>> > **
>> > ** corruption detected 14929928 bytes in (got 15179784, expected 14929928) 
>> > **
>> > ** corruption detected 14929936 bytes in (got 15179792, expected 14929936) 
>> > **
>> > ...pattern continues...
>> > ** corruption detected 17027048 bytes in (got 15179752, expected 17027048) 
>> > **
>> > ** corruption detected 17027056 bytes in (got 15179760, expected 17027056) 
>> > **
>> > ** corruption detected 17027064 bytes in (got 15179768, expected 17027064) 
>> > **
>> 
>> 15179776 < 15179xxx <= 17027064
>> 
>> 15179776 % 4096 = 0
>> 
>> And 15179776 = 15179768 + 8
>> 
>> So I guess we have some alignment bug.  Could you try the patches
>> attached?  It deal with some alignment issue.
>
> That fixed it.  And removed three lines of code.  Nice :)

Thanks!  I will merge the fixes into the patchset.

Best Regards,
Huang, Ying


Re: [PATCH -V6 06/21] swap: Support PMD swap mapping when splitting huge PMD

2018-10-24 Thread Huang, Ying
Daniel Jordan  writes:

> On Wed, Oct 10, 2018 at 03:19:09PM +0800, Huang Ying wrote:
>> +#ifdef CONFIG_THP_SWAP
>> +/*
>> + * The corresponding page table shouldn't be changed under us, that
>> + * is, the page table lock should be held.
>> + */
>> +int split_swap_cluster_map(swp_entry_t entry)
>> +{
>> +struct swap_info_struct *si;
>> +struct swap_cluster_info *ci;
>> +unsigned long offset = swp_offset(entry);
>> +
>> +VM_BUG_ON(!IS_ALIGNED(offset, SWAPFILE_CLUSTER));
>> +si = _swap_info_get(entry);
>> +if (!si)
>> +return -EBUSY;
>
> I think this return value doesn't get used anywhere?

Yes.  And the error is only possible if the page table is corrupted.  So
maybe add a VM_BUG_ON() in its caller __split_huge_swap_pmd()?

Best Regards,
Huang, Ying


Re: [PATCH -V6 14/21] swap: Support to move swap account for PMD swap mapping

2018-10-24 Thread Huang, Ying
Daniel Jordan  writes:

> On Wed, Oct 10, 2018 at 03:19:17PM +0800, Huang Ying wrote:
>> +static struct page *mc_handle_swap_pmd(struct vm_area_struct *vma,
>> +pmd_t pmd, swp_entry_t *entry)
>> +{
>
> Got
> /home/dbbench/linux/mm/memcontrol.c:4719:21: warning: ‘mc_handle_swap_pmd’ 
> defined but not used [-Wunused-function]
>  static struct page *mc_handle_swap_pmd(struct vm_area_struct *vma,
> when
> # CONFIG_TRANSPARENT_HUGEPAGE is not set

Thanks for pointing this out.  Will fix it in the next version.

Best Regards,
Huang, Ying


[PATCH v2 0/7] swap: THP optimizing refactoring

2018-07-16 Thread Huang, Ying
This patchset is based on 2018-07-13 head of mmotm tree.

Now the THP (Transparent Huge Page) swap optimization is implemented
in the way shown below,

#ifdef CONFIG_THP_SWAP
huge_function(...)
{
}
#else
normal_function(...)
{
}
#endif

general_function(...)
{
if (huge)
return huge_function(...);
else
return normal_function(...);
}

As pointed out by Dave Hansen, this:

1. Creates a new, wholly untested code path for huge pages
2. Creates two places to patch bugs
3. Does not reuse code when possible

This patchset addresses these problems by merging the huge/normal code
paths/functions where possible.

One concern is that this may cause the code size to grow when
!CONFIG_TRANSPARENT_HUGEPAGE.  The data show that most of the
refactoring causes only a slight code size increase.
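
As an illustrative sketch (not taken from the patches themselves), the
unified pattern the series moves toward looks roughly like:

general_function(...)
{
	if (IS_ENABLED(CONFIG_THP_SWAP) && huge) {
		/* huge swap cluster handling */
	} else {
		/* normal swap entry handling */
	}
}

When CONFIG_THP_SWAP is disabled, IS_ENABLED() evaluates to a constant
0, so the compiler can drop the huge branch entirely while the source
keeps a single code path.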

Best Regards,
Huang, Ying


[PATCH v2 2/7] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-16 Thread Huang, Ying
In mm/swapfile.c, THP (Transparent Huge Page) swap specific code is
enclosed by #ifdef CONFIG_THP_SWAP/#endif to avoid code bloat when THP
isn't enabled.  But #ifdef/#endif in a .c file hurts code readability,
so Dave suggested using IS_ENABLED(CONFIG_THP_SWAP) instead and letting
the compiler do the dirty job for us.  This also has the potential to
remove some duplicated code.  From the output of `size`,

               text   data  bss    dec   hex  filename
THP=y:        26269   2076  340  28685  700d  mm/swapfile.o
ifdef/endif:  24115   2028  340  26483  6773  mm/swapfile.o
IS_ENABLED:   24179   2028  340  26547  67b3  mm/swapfile.o

The IS_ENABLED() based solution works quite well, almost as well as
the #ifdef/#endif one.  And from the diffstat, more lines are removed
than added.

One #ifdef, for split_swap_cluster(), is kept, because it is a public
function with a stub implementation for CONFIG_THP_SWAP=n in swap.h.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 56 
 1 file changed, 16 insertions(+), 40 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0a2a9643dd78..dd9263411f11 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -870,7 +870,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
return n_ret;
 }
 
-#ifdef CONFIG_THP_SWAP
 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
unsigned long idx;
@@ -878,6 +877,11 @@ static int swap_alloc_cluster(struct swap_info_struct *si, 
swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+   VM_WARN_ON_ONCE(1);
+   return 0;
+   }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
 
@@ -908,13 +912,6 @@ static void swap_free_cluster(struct swap_info_struct *si, 
unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-   VM_WARN_ON_ONCE(1);
-   return 0;
-}
-#endif /* CONFIG_THP_SWAP */
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
   unsigned char usage)
@@ -1260,7 +1257,6 @@ static void swapcache_free(swp_entry_t entry)
}
 }
 
-#ifdef CONFIG_THP_SWAP
 static void swapcache_free_cluster(swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
@@ -1271,6 +1267,9 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned int i, free_entries = 0;
unsigned char val;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return;
+
si = _swap_info_get(entry);
if (!si)
return;
@@ -1306,6 +1305,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
}
 }
 
+#ifdef CONFIG_THP_SWAP
 int split_swap_cluster(swp_entry_t entry)
 {
struct swap_info_struct *si;
@@ -1320,11 +1320,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
 }
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
+#endif
 
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1483,7 +1479,6 @@ int swp_swapcount(swp_entry_t entry)
return count;
 }
 
-#ifdef CONFIG_THP_SWAP
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 swp_entry_t entry)
 {
@@ -1494,6 +1489,9 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
+   if (!IS_ENABLED(CONFIG_THP_SWAP))
+   return swap_swapcount(si, entry) != 0;
+
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (map[roffset] != SWAP_HAS_CACHE)
@@ -1516,7 +1514,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
 
-   if (likely(!PageTransCompound(page)))
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
 
page = compound_head(page);
@@ -1540,10 +1538,8 @@ static int page_trans_huge_map_swapcount(struct page 
*page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
 
-   if (likely(!PageTransCompound(page))) {
-   mapcount = atomic_read(&page->_mapcount) + 1;
-   if (total_mapcount)
-   *total_mapcount = mapcount;
+   if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+   m

[PATCH v2 3/7] swap: Use swap_count() in swap_page_trans_huge_swapped()

2018-07-16 Thread Huang, Ying
In swap_page_trans_huge_swapped(), to identify whether there's any
page table mapping for a 4k-sized swap entry, "si->swap_map[i] !=
SWAP_HAS_CACHE" is used.  This works correctly now, because all users
of the function only call it after checking SWAP_HAS_CACHE.  But as
pointed out by Daniel, it is better to use "swap_count(map[i])" here,
because it works for the "map[i] == 0" case too.

And this makes the implementation more consistent between normal and
huge swap entries.
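
For reference, swap_count() just masks out the cache bit, so it
evaluates to 0 both for "map[i] == 0" and for "map[i] ==
SWAP_HAS_CACHE" (paraphrased from the kernel sources of the time, not
part of this patch):

	/* include/linux/swap.h */
	#define SWAP_HAS_CACHE	0x40	/* Flag page is cached, in first swap_map */

	/* mm/swapfile.c */
	static inline unsigned char swap_count(unsigned char ent)
	{
		return ent & ~SWAP_HAS_CACHE;
	}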

Signed-off-by: "Huang, Ying" 
Suggested-by: Daniel Jordan 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Dan Williams 
---
 mm/swapfile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index dd9263411f11..92c24402706c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1494,12 +1494,12 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
 
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
-   if (map[roffset] != SWAP_HAS_CACHE)
+   if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   if (map[offset + i] != SWAP_HAS_CACHE) {
+   if (swap_count(map[offset + i])) {
ret = true;
break;
}
-- 
2.16.4



[PATCH v2 1/7] swap: Add comments to lock_cluster_or_swap_info()

2018-07-16 Thread Huang, Ying
To improve the code readability.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d8fddfb000ec..0a2a9643dd78 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -297,6 +297,12 @@ static inline void unlock_cluster(struct swap_cluster_info 
*ci)
spin_unlock(&ci->lock);
 }
 
+/*
+ * For non-HDD swap devices, the fine grained cluster lock is used to
+ * protect si->swap_map.  But cluster and cluster locks isn't
+ * available for HDD, so coarse grained si->lock will be used instead
+ * for that.
+ */
 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
struct swap_info_struct *si,
unsigned long offset)
-- 
2.16.4



[PATCH v2 4/7] swap: Unify normal/huge code path in swap_page_trans_huge_swapped()

2018-07-16 Thread Huang, Ying
As suggested by Dave, we should unify the code paths for normal and
huge swap support where possible, to avoid duplicated code, bugs, etc.
and to make the code easier to review.

In this patch, the normal/huge code paths in
swap_page_trans_huge_swapped() are unified; the numbers of added and
removed lines are the same.  And the binary size is kept almost the
same when CONFIG_TRANSPARENT_HUGEPAGE=n.

               text   data  bss    dec   hex  filename
base:         24179   2028  340  26547  67b3  mm/swapfile.o
unified:      24215   2028  340  26583  67d7  mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 92c24402706c..a6d8b8117bc5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -270,7 +270,10 @@ static inline void cluster_set_null(struct 
swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-   return info->flags & CLUSTER_FLAG_HUGE;
+   if (IS_ENABLED(CONFIG_THP_SWAP))
+   return info->flags & CLUSTER_FLAG_HUGE;
+   else
+   return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -1489,9 +1492,6 @@ static bool swap_page_trans_huge_swapped(struct 
swap_info_struct *si,
int i;
bool ret = false;
 
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return swap_swapcount(si, entry) != 0;
-
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
if (swap_count(map[roffset]))
-- 
2.16.4



[PATCH v2 7/7] swap, put_swap_page: Share more between huge/normal code path

2018-07-16 Thread Huang, Ying
In this patch, locking-related code is shared between the huge/normal
code paths in put_swap_page() to reduce code duplication.  And the
`free_entries == 0` case is merged into the more general `free_entries
!= SWAPFILE_CLUSTER` case, because the new locking method makes it
easy.

The number of added lines is the same as the number of removed lines.
But the code size is increased when CONFIG_TRANSPARENT_HUGEPAGE=n.

               text   data  bss    dec   hex  filename
base:         24215   2028  340  26583  67d7  mm/swapfile.o
unified:      24577   2028  340  26945  6941  mm/swapfile.o

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index fec28f6c05b0..cd75f449896b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1280,8 +1280,8 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (!si)
return;
 
+   ci = lock_cluster_or_swap_info(si, offset);
if (nr == SWAPFILE_CLUSTER) {
-   ci = lock_cluster(si, offset);
VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
@@ -1290,13 +1290,9 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (val == SWAP_HAS_CACHE)
free_entries++;
}
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
cluster_clear_huge(ci);
-   unlock_cluster(ci);
if (free_entries == SWAPFILE_CLUSTER) {
+   unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
ci = lock_cluster(si, offset);
memset(map, 0, SWAPFILE_CLUSTER);
@@ -1307,12 +1303,16 @@ void put_swap_page(struct page *page, swp_entry_t entry)
return;
}
}
-   if (nr == 1 || free_entries) {
-   for (i = 0; i < nr; i++, entry.val++) {
-   if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
+   for (i = 0; i < nr; i++, entry.val++) {
+   if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+   unlock_cluster_or_swap_info(si, ci);
+   free_swap_slot(entry);
+   if (i == nr - 1)
+   return;
+   lock_cluster_or_swap_info(si, offset);
}
}
+   unlock_cluster_or_swap_info(si, ci);
 }
 
 #ifdef CONFIG_THP_SWAP
-- 
2.16.4



[PATCH v2 5/7] swap: Unify normal/huge code path in put_swap_page()

2018-07-16 Thread Huang, Ying
In this patch, the normal/huge code paths in put_swap_page() and
several helper functions are unified to avoid duplicated code, bugs,
etc. and to make the code easier to review.

More lines are removed than added.  And the binary size is kept exactly
the same when CONFIG_TRANSPARENT_HUGEPAGE=n.

Signed-off-by: "Huang, Ying" 
Suggested-by: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 83 ++-
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index a6d8b8117bc5..622edc47b67a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -205,8 +205,16 @@ static void discard_swap_cluster(struct swap_info_struct 
*si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER   HPAGE_PMD_NR
+
+#define nr_swap_entries(nr)(nr)
 #else
 #define SWAPFILE_CLUSTER   256
+
+/*
+ * Define nr_swap_entries() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define nr_swap_entries(nr)1
 #endif
 #define LATENCY_LIMIT  256
 
@@ -1249,18 +1257,7 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-static void swapcache_free(swp_entry_t entry)
-{
-   struct swap_info_struct *p;
-
-   p = _swap_info_get(entry);
-   if (p) {
-   if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-   free_swap_slot(entry);
-   }
-}
-
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
 {
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1269,39 +1266,41 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
-
-   if (!IS_ENABLED(CONFIG_THP_SWAP))
-   return;
+   int nr = nr_swap_entries(hpage_nr_pages(page));
 
si = _swap_info_get(entry);
if (!si)
return;
 
-   ci = lock_cluster(si, offset);
-   VM_BUG_ON(!cluster_is_huge(ci));
-   map = si->swap_map + offset;
-   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-   val = map[i];
-   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-   if (val == SWAP_HAS_CACHE)
-   free_entries++;
-   }
-   if (!free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++)
-   map[i] &= ~SWAP_HAS_CACHE;
-   }
-   cluster_clear_huge(ci);
-   unlock_cluster(ci);
-   if (free_entries == SWAPFILE_CLUSTER) {
-   spin_lock(&si->lock);
+   if (nr == SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
-   memset(map, 0, SWAPFILE_CLUSTER);
+   VM_BUG_ON(!cluster_is_huge(ci));
+   map = si->swap_map + offset;
+   for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+   val = map[i];
+   VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+   if (val == SWAP_HAS_CACHE)
+   free_entries++;
+   }
+   if (!free_entries) {
+   for (i = 0; i < SWAPFILE_CLUSTER; i++)
+   map[i] &= ~SWAP_HAS_CACHE;
+   }
+   cluster_clear_huge(ci);
unlock_cluster(ci);
-   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-   swap_free_cluster(si, idx);
-   spin_unlock(&si->lock);
-   } else if (free_entries) {
-   for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+   if (free_entries == SWAPFILE_CLUSTER) {
+   spin_lock(&si->lock);
+   ci = lock_cluster(si, offset);
+   memset(map, 0, SWAPFILE_CLUSTER);
+   unlock_cluster(ci);
+   mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+   swap_free_cluster(si, idx);
+   spin_unlock(&si->lock);
+   return;
+   }
+   }
+   if (nr == 1 || free_entries) {
+   for (i = 0; i < nr; i++, entry.val++) {
if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
free_swap_slot(entry);
}
@@ -1325,14 +1324,6 @@ int split_swap_cluster(swp_entry_t entry)
 }
 #endif
 
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
-   if (!PageTransHuge(page))
-   swapcache_free(entry);
-   else
-   swapcache_free_cluster(entry);
-}
-
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
const swp_entry_t *e1 = ent1, *e2 = ent2;
-- 
2.16.4



[PATCH v2 6/7] swap: Add __swap_entry_free_locked()

2018-07-16 Thread Huang, Ying
The part of __swap_entry_free() that runs with the lock held is
separated into a new function, __swap_entry_free_locked(), because we
want to reuse that piece of code in some other places.

This is just mechanical code refactoring; there is no functional change
in this function.

Signed-off-by: "Huang, Ying" 
Cc: Dave Hansen 
Cc: Michal Hocko 
Cc: Johannes Weiner 
Cc: Shaohua Li 
Cc: Hugh Dickins 
Cc: Minchan Kim 
Cc: Rik van Riel 
Cc: Daniel Jordan 
Cc: Dan Williams 
---
 mm/swapfile.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 622edc47b67a..fec28f6c05b0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1180,16 +1180,13 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
return NULL;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-  swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
 {
-   struct swap_cluster_info *ci;
-   unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
 
-   ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
 
has_cache = count & SWAP_HAS_CACHE;
@@ -1217,6 +1214,17 @@ static unsigned char __swap_entry_free(struct 
swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
 
+   return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+  swp_entry_t entry, unsigned char usage)
+{
+   struct swap_cluster_info *ci;
+   unsigned long offset = swp_offset(entry);
+
+   ci = lock_cluster_or_swap_info(p, offset);
+   usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
 
return usage;
-- 
2.16.4



Re: [PATCH v2 7/7] swap, put_swap_page: Share more between huge/normal code path

2018-07-17 Thread Huang, Ying
Dave Hansen  writes:

> On 07/16/2018 05:55 PM, Huang, Ying wrote:
>>               text   data  bss    dec   hex  filename
>> base:        24215   2028  340  26583  67d7  mm/swapfile.o
>> unified:     24577   2028  340  26945  6941  mm/swapfile.o
>
> That's a bit more than I'd expect looking at the rest of the diff.  Makes
> me wonder if we missed an #ifdef somewhere or the compiler is getting
> otherwise confused.
>
> Might be worth a 10-minute look at the disassembly.

Digging one step deeper via 'size -A mm/swapfile.o' and diffing base
against unified,

--- b.s 2018-07-18 09:42:07.872501680 +0800
+++ h.s 2018-07-18 09:50:37.984499168 +0800
@@ -1,6 +1,6 @@
 mm/swapfile.o  :
 section                          size   addr
-.text                           17815      0
+.text                           17927      0
 .data                            1288      0
 .bss                              340      0
 ___ksymtab_gpl+nr_swap_pages        8      0
@@ -26,8 +26,8 @@
 .data.once                          1      0
 .comment                           35      0
 .note.GNU-stack                     0      0
-.orc_unwind_ip                   1380      0
-.orc_unwind                      2070      0
-Total                           26810
+.orc_unwind_ip                   1480      0
+.orc_unwind                      2220      0
+Total                           27172

The total difference is the same: 27172 - 26810 = 362 = 24577 - 24215.

The text section difference is small: 17927 - 17815 = 112.  The
additional size change comes from the unwinder information: (1480 +
2220) - (1380 + 2070) = 250.  If the frame pointer unwinder is chosen,
this costs nothing, but if the ORC unwinder is chosen, this is the real
difference.

For the 112-byte text section difference, use 'objdump -t' to get the
symbol sizes and compare,

--- b.od2018-07-18 10:45:05.768483075 +0800
+++ h.od2018-07-18 10:44:39.556483204 +0800
@@ -30,9 +30,9 @@
 00a3 cluster_list_add_tail
 001e __kunmap_atomic.isra.34
 018c swap_count_continued
-00ac __swap_entry_free
 000f put_swap_device.isra.35
 00b4 inc_cluster_info_page
+006f __swap_entry_free_locked
 004a _enable_swap_info
 0046 wait_on_page_writeback
 002e inode_to_bdi
@@ -53,8 +53,8 @@
 0012 __x64_sys_swapon
 0011 __ia32_sys_swapon
 007a get_swap_device
-0032 swap_free
-0035 put_swap_page
+006e swap_free
+0078 put_swap_page
 0267 swapcache_free_entries
 0058 page_swapcount
 003a __swap_count
@@ -64,7 +64,7 @@
 011a try_to_free_swap
 01fb get_swap_pages
 0098 get_swap_page_of_type
-01b8 free_swap_and_cache
+01e6 free_swap_and_cache
 0543 try_to_unuse
 000e __x64_sys_swapoff
 000d __ia32_sys_swapoff

The size change of put_swap_page() is small: 0x78 - 0x35 = 67 bytes.
But __swap_entry_free() is inlined by the compiler, which causes some
code bloat.

Best Regards,
Huang, Ying


Re: [PATCH v2 0/7] swap: THP optimizing refactoring

2018-07-17 Thread Huang, Ying
Daniel Jordan  writes:

> On Tue, Jul 17, 2018 at 08:55:49AM +0800, Huang, Ying wrote:
>> This patchset is based on 2018-07-13 head of mmotm tree.
>
> Looks good.
>
> Still think patch 7 would be easier to review if split into two logical
> changes.  Either way, though.
>
> For the series,
> Reviewed-by: Daniel Jordan 

Thanks a lot for your review!

Best Regards,
Huang, Ying


Re: [PATCH v2 1/7] swap: Add comments to lock_cluster_or_swap_info()

2018-07-17 Thread Huang, Ying
Dave Hansen  writes:

> On 07/16/2018 05:55 PM, Huang, Ying wrote:
>> +/*
>> + * For non-HDD swap devices, the fine grained cluster lock is used to
>> + * protect si->swap_map.  But cluster and cluster locks isn't
>> + * available for HDD, so coarse grained si->lock will be used instead
>> + * for that.
>> + */
>>  static inline struct swap_cluster_info *lock_cluster_or_swap_info(
>>  struct swap_info_struct *si,
>>  unsigned long offset)
>
> This nomenclature is not consistent with the rest of the file.  We call
> a "non-HDD" device an "ssd" absolutely everywhere else in the file.  Why
> are you calling it a non-HDD here?  (fwiw, HDD _barely_ hits my acronym
> cache anyway).
>
> How about this?
>
> /*
>  * Determine the locking method in use for this device.  Return
>  * swap_cluster_info if SSD-style cluster-based locking is in place.
>  */
> static inline struct swap_cluster_info *lock_cluster_or_swap_info(
> struct swap_info_struct *si,
> unsigned long offset)
> {
> struct swap_cluster_info *ci;
>
>   /* Try to use fine-grained SSD-style locking if available: */
> ci = lock_cluster(si, offset);
>
>   /* Otherwise, fall back to traditional, coarse locking: */
> if (!ci)
> spin_lock(&si->lock);
>
> return ci;
> }

This is better than mine, I will use this.  Thanks!

> Which reminds me?  Why do we even bother having two locking models?

Because si->cluster_info is NULL for non-SSD devices, we cannot use the
cluster lock there.

As for why struct swap_cluster_info isn't used for non-SSD: per my
understanding, struct swap_cluster_info is optimized for SSD.  In
particular, it assumes that seeking is cheap, so a different free swap
slot scanning policy is used for SSD and non-SSD.
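
For completeness, the unlock side follows the same pattern (a sketch
mirroring the helper in mm/swapfile.c, shown here only to illustrate
the fallback):

	static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
						       struct swap_cluster_info *ci)
	{
		if (ci)
			unlock_cluster(ci);
		else
			spin_unlock(&si->lock);
	}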

Best Regards,
Huang, Ying


Re: [PATCH v2 2/7] mm/swapfile.c: Replace some #ifdef with IS_ENABLED()

2018-07-17 Thread Huang, Ying
Dave Hansen  writes:

>> @@ -878,6 +877,11 @@ static int swap_alloc_cluster(struct swap_info_struct 
>> *si, swp_entry_t *slot)
>>  unsigned long offset, i;
>>  unsigned char *map;
>>  
>> +if (!IS_ENABLED(CONFIG_THP_SWAP)) {
>> +VM_WARN_ON_ONCE(1);
>> +return 0;
>> +}
>
> I see you seized the opportunity to keep this code gloriously
> unencumbered by pesky comments.  This seems like a time when you might
> have slipped up and been temped to add a comment or two.  Guess not. :)
>
> Seriously, though, does it hurt us to add a comment or two to say
> something like:
>
>   /*
>* Should not even be attempting cluster allocations when
>* huge page swap is disabled.  Warn and fail the allocation.
>*/
>   if (!IS_ENABLED(CONFIG_THP_SWAP)) {
>   VM_WARN_ON_ONCE(1);
>   return 0;
>   }

I totally agree with you that we should add more comments for THP swap
to improve the code readability.  As for this specific case, the
VM_WARN_ON_ONCE() here is just to capture programming errors during
development.  Do we really need a comment here?

I will try to add more comments in other places in the code regardless
of this one.

Best Regards,
Huang, Ying


Re: [PATCH v2 1/1] memory tier: acpi/hmat: create CPUless memory tiers after obtaining HMAT info

2024-03-12 Thread Huang, Ying
; +EXPORT_SYMBOL_GPL(memory_tier_late_init);
> +
>  static void dump_hmem_attrs(struct access_coordinate *coord, const char 
> *prefix)
>  {
>   pr_info(
> @@ -636,7 +690,7 @@ int mt_set_default_dram_perf(int nid, struct 
> access_coordinate *perf,
>  {
>   int rc = 0;
>  
> - mutex_lock(&memory_tier_lock);
> + mutex_lock(&mt_perf_lock);
>   if (default_dram_perf_error) {
>   rc = -EIO;
>   goto out;
> @@ -684,7 +738,7 @@ int mt_set_default_dram_perf(int nid, struct 
> access_coordinate *perf,
>   }
>  
>  out:
> - mutex_unlock(&memory_tier_lock);
> + mutex_unlock(&mt_perf_lock);
>   return rc;
>  }
>  
> @@ -700,7 +754,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, 
> int *adist)
>   perf->read_bandwidth + perf->write_bandwidth == 0)
>   return -EINVAL;
>  
> - mutex_lock(&memory_tier_lock);
> + mutex_lock(&mt_perf_lock);
>   /*
>* The abstract distance of a memory node is in direct proportion to
>* its memory latency (read + write) and inversely proportional to its
> @@ -713,7 +767,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, 
> int *adist)
>   (default_dram_perf.read_latency + 
> default_dram_perf.write_latency) *
>       (default_dram_perf.read_bandwidth + 
> default_dram_perf.write_bandwidth) /
>   (perf->read_bandwidth + perf->write_bandwidth);
> - mutex_unlock(&memory_tier_lock);
> + mutex_unlock(&mt_perf_lock);
>  
>   return 0;
>  }
> @@ -836,6 +890,14 @@ static int __init memory_tier_init(void)
>* types assigned.
>*/
>   for_each_node_state(node, N_MEMORY) {
> + if (!node_state(node, N_CPU))
> + /*
> +  * Defer memory tier initialization on CPUless numa 
> nodes.
> +  * These will be initialized when HMAT information is

HMAT is platform specific; we should avoid mentioning it in general
code if possible.

> +  * available.
> +  */
> + continue;
> +
>   memtier = set_node_memory_tier(node);
>   if (IS_ERR(memtier))
>   /*

--
Best Regards,
Huang, Ying



Re: [External] Re: [PATCH v2 1/1] memory tier: acpi/hmat: create CPUless memory tiers after obtaining HMAT info

2024-03-14 Thread Huang, Ying
"Ho-Ren (Jack) Chuang"  writes:

> On Tue, Mar 12, 2024 at 2:21 AM Huang, Ying  wrote:
>>
>> "Ho-Ren (Jack) Chuang"  writes:
>>
>> > The current implementation treats emulated memory devices, such as
>> > CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
>> > (E820_TYPE_RAM). However, these emulated devices have different
>> > characteristics than traditional DRAM, making it important to
>> > distinguish them. Thus, we modify the tiered memory initialization process
>> > to introduce a delay specifically for CPUless NUMA nodes. This delay
>> > ensures that the memory tier initialization for these nodes is deferred
>> > until HMAT information is obtained during the boot process. Finally,
>> > demotion tables are recalculated at the end.
>> >
>> > * Abstract common functions into `find_alloc_memory_type()`
>>
>> We should move kmem_put_memory_types() (renamed to
>> mt_put_memory_types()?) too.  This can be put in a separate patch.
>>
>
> Will do! Thanks,
>
>
>>
>> > Since different memory devices require finding or allocating a memory type,
>> > these common steps are abstracted into a single function,
>> > `find_alloc_memory_type()`, enhancing code scalability and conciseness.
>> >
>> > * Handle cases where there is no HMAT when creating memory tiers
>> > There is a scenario where a CPUless node does not provide HMAT information.
>> > If no HMAT is specified, it falls back to using the default DRAM tier.
>> >
>> > * Change adist calculation code to use another new lock, mt_perf_lock.
>> > In the current implementation, iterating through CPUlist nodes requires
>> > holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
>> > trying to acquire the same lock, leading to a potential deadlock.
>> > Therefore, we propose introducing a standalone `mt_perf_lock` to protect
>> > `default_dram_perf`. This approach not only avoids deadlock but also
>> > prevents holding a large lock simultaneously.
>> >
>> > Signed-off-by: Ho-Ren (Jack) Chuang 
>> > Signed-off-by: Hao Xiang 
>> > ---
>> >  drivers/acpi/numa/hmat.c | 11 ++
>> >  drivers/dax/kmem.c   | 13 +--
>> >  include/linux/acpi.h |  6 
>> >  include/linux/memory-tiers.h |  8 +
>> >  mm/memory-tiers.c| 70 +---
>> >  5 files changed, 92 insertions(+), 16 deletions(-)
>> >
>> > diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
>> > index d6b85f0f6082..28812ec2c793 100644
>> > --- a/drivers/acpi/numa/hmat.c
>> > +++ b/drivers/acpi/numa/hmat.c
>> > @@ -38,6 +38,8 @@ static LIST_HEAD(targets);
>> >  static LIST_HEAD(initiators);
>> >  static LIST_HEAD(localities);
>> >
>> > +static LIST_HEAD(hmat_memory_types);
>> > +
>>
>> HMAT isn't a device driver for some memory devices.  So I don't think we
>> should manage memory types in HMAT.
>
> I can put it back in memory-tier.c. How about the list? Do we still
> need to keep a separate list for storing late_inited memory nodes?
> And how about the list name if we need to remove the prefix "hmat_"?

I don't think we need a separate list for memory-less nodes.  Just
iterate all memory-less nodes.

>
>> Instead, if the memory_type of a
>> node isn't set by the driver, we should manage it in memory-tier.c as
>> fallback.
>>
>
> Do you mean some device drivers may init memory tiers between
> memory_tier_init() and late_initcall(memory_tier_late_init);?
> And this is the reason why you mention to exclude
> "node_memory_types[nid].memtype != NULL" in memory_tier_late_init().
> Is my understanding correct?

Yes.

>> >  static DEFINE_MUTEX(target_lock);
>> >
>> >  /*
>> > @@ -149,6 +151,12 @@ int acpi_get_genport_coordinates(u32 uid,
>> >  }
>> >  EXPORT_SYMBOL_NS_GPL(acpi_get_genport_coordinates, CXL);
>> >
>> > +struct memory_dev_type *hmat_find_alloc_memory_type(int adist)
>> > +{
>> > + return find_alloc_memory_type(adist, &hmat_memory_types);
>> > +}
>> > +EXPORT_SYMBOL_GPL(hmat_find_alloc_memory_type);
>> > +
>> >  static __init void alloc_memory_initiator(unsigned int cpu_pxm)
>> >  {
>> >   struct memory_initiator *initiator;
>> > @@ -1038,6 +1046,9 @@ static __init int hmat_init(void)
>> >   if (!hma

Re: [PATCH v3 1/2] memory tier: dax/kmem: create CPUless memory tiers after obtaining HMAT info

2024-03-20 Thread Huang, Ying
de_memory_types[nid].memtype == NULL)
> + /*
> +  * Some device drivers may have initialized memory tiers
> +  * between `memory_tier_init()` and 
> `memory_tier_late_init()`,
> +  * potentially bringing online memory nodes and
> +  * configuring memory tiers. Exclude them here.
> +  */
> + set_node_memory_tier(nid);
> +
> + establish_demotion_targets();
> + mutex_unlock(&memory_tier_lock);
> +
> + return 0;
> +}
> +late_initcall(memory_tier_late_init);
> +
>  static void dump_hmem_attrs(struct access_coordinate *coord, const char 
> *prefix)
>  {
>   pr_info(
> @@ -631,12 +698,16 @@ static void dump_hmem_attrs(struct access_coordinate 
> *coord, const char *prefix)
>   coord->read_bandwidth, coord->write_bandwidth);
>  }
>  
> +/*
> + * The lock is used to protect the default_dram_perf.
> + */
> +static DEFINE_MUTEX(mt_perf_lock);

Miscommunication here too.  This should be moved next to the
"default_dram_perf" definition.  And it protects more than just
default_dram_perf.
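
That is, roughly (an illustrative sketch of the requested placement):

	/* Protect the default_dram_perf* data below. */
	static DEFINE_MUTEX(default_dram_perf_lock);
	static bool default_dram_perf_error;
	static struct access_coordinate default_dram_perf;
	static int default_dram_perf_ref_nid = NUMA_NO_NODE;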

>  int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
>const char *source)
>  {
>   int rc = 0;
>  
> - mutex_lock(&memory_tier_lock);
> + mutex_lock(&mt_perf_lock);
>   if (default_dram_perf_error) {
>   rc = -EIO;
>   goto out;
> @@ -684,7 +755,7 @@ int mt_set_default_dram_perf(int nid, struct 
> access_coordinate *perf,
>   }
>  
>  out:
> - mutex_unlock(&memory_tier_lock);
> + mutex_unlock(&mt_perf_lock);
>   return rc;
>  }
>  
> @@ -700,7 +771,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, 
> int *adist)
>   perf->read_bandwidth + perf->write_bandwidth == 0)
>   return -EINVAL;
>  
> - mutex_lock(&memory_tier_lock);
> + mutex_lock(&mt_perf_lock);
>   /*
>* The abstract distance of a memory node is in direct proportion to
>* its memory latency (read + write) and inversely proportional to its
> @@ -713,7 +784,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, 
> int *adist)
>   (default_dram_perf.read_latency + 
> default_dram_perf.write_latency) *
>   (default_dram_perf.read_bandwidth + 
> default_dram_perf.write_bandwidth) /
>   (perf->read_bandwidth + perf->write_bandwidth);
> - mutex_unlock(&memory_tier_lock);
> + mutex_unlock(&mt_perf_lock);
>  
>   return 0;
>  }
> @@ -826,7 +897,8 @@ static int __init memory_tier_init(void)
>* For now we can have 4 faster memory tiers with smaller adistance
>* than default DRAM tier.
>*/
> - default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
> + default_dram_type = mt_find_alloc_memory_type(
> + MEMTIER_ADISTANCE_DRAM, 
> &default_memory_types);
>   if (IS_ERR(default_dram_type))
>   panic("%s() failed to allocate default DRAM tier\n", __func__);
>  
> @@ -836,6 +908,14 @@ static int __init memory_tier_init(void)
>* types assigned.
>*/
>   for_each_node_state(node, N_MEMORY) {
> + if (!node_state(node, N_CPU))
> + /*
> +  * Defer memory tier initialization on CPUless numa 
> nodes.
> +  * These will be initialized after firmware and devices 
> are
> +  * initialized.
> +  */
> + continue;
> +
>   memtier = set_node_memory_tier(node);
>   if (IS_ERR(memtier))
>   /*

--
Best Regards,
Huang, Ying



Re: [PATCH v4 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-22 Thread Huang, Ying
mtier = find_create_memory_tier(memtype);
> + __init_node_memory_type(node, mtype);
> +
> + mtype = node_memory_types[node].memtype;
> + node_set(node, mtype->nodes);
> + memtier = find_create_memory_tier(mtype);
>   if (!IS_ERR(memtier))
>   rcu_assign_pointer(pgdat->memtier, memtier);
>   return memtier;
> @@ -655,6 +671,34 @@ void mt_put_memory_types(struct list_head *memory_types)
>  }
>  EXPORT_SYMBOL_GPL(mt_put_memory_types);
>  
> +/*
> + * This is invoked via `late_initcall()` to initialize memory tiers for
> + * CPU-less memory nodes after driver initialization, which is
> + * expected to provide `adistance` algorithms.
> + */
> +static int __init memory_tier_late_init(void)
> +{
> + int nid;
> +
> + mutex_lock(&memory_tier_lock);
> + for_each_node_state(nid, N_MEMORY)
> + if (!node_state(nid, N_CPU) &&
> + node_memory_types[nid].memtype == NULL)
> + /*
> +  * Some device drivers may have initialized memory tiers
> +  * between `memory_tier_init()` and 
> `memory_tier_late_init()`,
> +  * potentially bringing online memory nodes and
> +  * configuring memory tiers. Exclude them here.
> +  */
> + set_node_memory_tier(nid);
> +
> + establish_demotion_targets();
> + mutex_unlock(&memory_tier_lock);
> +
> + return 0;
> +}
> +late_initcall(memory_tier_late_init);
> +
>  static void dump_hmem_attrs(struct access_coordinate *coord, const char 
> *prefix)
>  {
>   pr_info(
> @@ -668,7 +712,7 @@ int mt_set_default_dram_perf(int nid, struct 
> access_coordinate *perf,
>  {
>   int rc = 0;
>  
> - mutex_lock(&memory_tier_lock);
> + mutex_lock(&default_dram_perf_lock);
>   if (default_dram_perf_error) {
>   rc = -EIO;
>   goto out;
> @@ -716,7 +760,7 @@ int mt_set_default_dram_perf(int nid, struct 
> access_coordinate *perf,
>   }
>  
>  out:
> - mutex_unlock(&memory_tier_lock);
> + mutex_unlock(&default_dram_perf_lock);
>   return rc;
>  }
>  
> @@ -732,7 +776,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, 
> int *adist)
>   perf->read_bandwidth + perf->write_bandwidth == 0)
>   return -EINVAL;
>  
> - mutex_lock(&memory_tier_lock);
> + mutex_lock(&default_dram_perf_lock);
>   /*
>* The abstract distance of a memory node is in direct proportion to
>* its memory latency (read + write) and inversely proportional to its
> @@ -745,7 +789,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, 
> int *adist)
>   (default_dram_perf.read_latency + 
> default_dram_perf.write_latency) *
>   (default_dram_perf.read_bandwidth + 
> default_dram_perf.write_bandwidth) /
>   (perf->read_bandwidth + perf->write_bandwidth);
> - mutex_unlock(&memory_tier_lock);
> + mutex_unlock(&default_dram_perf_lock);
>  
>   return 0;
>  }
> @@ -858,7 +902,8 @@ static int __init memory_tier_init(void)
>* For now we can have 4 faster memory tiers with smaller adistance
>* than default DRAM tier.
>*/
> - default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
> + default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
> + 
> &default_memory_types);
>   if (IS_ERR(default_dram_type))
>   panic("%s() failed to allocate default DRAM tier\n", __func__);
>  
> @@ -868,6 +913,14 @@ static int __init memory_tier_init(void)
>* types assigned.
>*/
>   for_each_node_state(node, N_MEMORY) {
> + if (!node_state(node, N_CPU))
> + /*
> +  * Defer memory tier initialization on CPUless numa 
> nodes.
> +  * These will be initialized after firmware and devices 
> are
> +  * initialized.
> +  */
> + continue;
> +
>   memtier = set_node_memory_tier(node);
>   if (IS_ERR(memtier))
>   /*

--
Best Regards,
Huang, Ying



Re: [External] Re: [PATCH v4 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-25 Thread Huang, Ying
"Ho-Ren (Jack) Chuang"  writes:

> On Fri, Mar 22, 2024 at 1:41 AM Huang, Ying  wrote:
>>
>> "Ho-Ren (Jack) Chuang"  writes:
>>
>> > The current implementation treats emulated memory devices, such as
>> > CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
>> > (E820_TYPE_RAM). However, these emulated devices have different
>> > characteristics than traditional DRAM, making it important to
>> > distinguish them. Thus, we modify the tiered memory initialization process
>> > to introduce a delay specifically for CPUless NUMA nodes. This delay
>> > ensures that the memory tier initialization for these nodes is deferred
>> > until HMAT information is obtained during the boot process. Finally,
>> > demotion tables are recalculated at the end.
>> >
>> > * late_initcall(memory_tier_late_init);
>> > Some device drivers may have initialized memory tiers between
>> > `memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
>> > online memory nodes and configuring memory tiers. They should be excluded
>> > in the late init.
>> >
>> > * Handle cases where there is no HMAT when creating memory tiers
>> > There is a scenario where a CPUless node does not provide HMAT information.
>> > If no HMAT is specified, it falls back to using the default DRAM tier.
>> >
>> > * Introduce another new lock `default_dram_perf_lock` for adist calculation
>> > In the current implementation, iterating through CPUlist nodes requires
>> > holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
>> > trying to acquire the same lock, leading to a potential deadlock.
>> > Therefore, we propose introducing a standalone `default_dram_perf_lock` to
>> > protect `default_dram_perf_*`. This approach not only avoids deadlock
>> > but also prevents holding a large lock simultaneously.
>> >
>> > * Upgrade `set_node_memory_tier` to support additional cases, including
>> >   default DRAM, late CPUless, and hot-plugged initializations.
>> > To cover hot-plugged memory nodes, `mt_calc_adistance()` and
>> > `mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
>> > handle cases where memtype is not initialized and where HMAT information is
>> > available.
>> >
>> > * Introduce `default_memory_types` for those memory types that are not
>> >   initialized by device drivers.
>> > Because late initialized memory and default DRAM memory need to be managed,
>> > a default memory type is created for storing all memory types that are
>> > not initialized by device drivers and as a fallback.
>> >
>> > Signed-off-by: Ho-Ren (Jack) Chuang 
>> > Signed-off-by: Hao Xiang 
>> > ---
>> >  mm/memory-tiers.c | 73 ---
>> >  1 file changed, 63 insertions(+), 10 deletions(-)
>> >
>> > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> > index 974af10cfdd8..9396330fa162 100644
>> > --- a/mm/memory-tiers.c
>> > +++ b/mm/memory-tiers.c
>> > @@ -36,6 +36,11 @@ struct node_memory_type_map {
>> >
>> >  static DEFINE_MUTEX(memory_tier_lock);
>> >  static LIST_HEAD(memory_tiers);
>> > +/*
>> > + * The list is used to store all memory types that are not created
>> > + * by a device driver.
>> > + */
>> > +static LIST_HEAD(default_memory_types);
>> >  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>> >  struct memory_dev_type *default_dram_type;
>> >
>> > @@ -108,6 +113,7 @@ static struct demotion_nodes *node_demotion 
>> > __read_mostly;
>> >
>> >  static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
>> >
>> > +static DEFINE_MUTEX(default_dram_perf_lock);
>>
>> Better to add comments about what is protected by this lock.
>>
>
> Thank you. I will add a comment like this:
> + /* The lock is used to protect `default_dram_perf*` info and nid. */
> +static DEFINE_MUTEX(default_dram_perf_lock);
>
> I also found an error path was not handled and
> found the lock could be put closer to what it protects.
> I will have them fixed in V5.
>
>> >  static bool default_dram_perf_error;
>> >  static struct access_coordinate default_dram_perf;
>> >  static int default_dram_perf_ref_nid = NUMA_NO_NODE;
>> > @@ -505,7 +511,8 @@ static inline void __init_node_memory_type(int node, 
>> > struct memory_dev_type *mem

Re: [PATCH v5 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-26 Thread Huang, Ying
*/
> - default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
> + default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
> + 
> &default_memory_types);
>   if (IS_ERR(default_dram_type))
>   panic("%s() failed to allocate default DRAM tier\n", __func__);
>  
> @@ -868,6 +919,14 @@ static int __init memory_tier_init(void)
>* types assigned.
>*/
>   for_each_node_state(node, N_MEMORY) {
> + if (!node_state(node, N_CPU))
> + /*
> +  * Defer memory tier initialization on CPUless numa 
> nodes.
> +  * These will be initialized after firmware and devices 
> are
> +  * initialized.
> +  */
> + continue;
> +
>   memtier = set_node_memory_tier(node);
>   if (IS_ERR(memtier))
>   /*

--
Best Regards,
Huang, Ying



Re: [PATCH v6 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-27 Thread Huang, Ying
"Ho-Ren (Jack) Chuang"  writes:

[snip]

> @@ -655,6 +672,34 @@ void mt_put_memory_types(struct list_head *memory_types)
>  }
>  EXPORT_SYMBOL_GPL(mt_put_memory_types);
>  
> +/*
> + * This is invoked via `late_initcall()` to initialize memory tiers for
> + * CPU-less memory nodes after driver initialization, which is
> + * expected to provide `adistance` algorithms.
> + */
> +static int __init memory_tier_late_init(void)
> +{
> + int nid;
> +
> + mutex_lock(&memory_tier_lock);
> + for_each_node_state(nid, N_MEMORY)
> + if (!node_state(nid, N_CPU) &&
> + node_memory_types[nid].memtype == NULL)

Think about this again.  It seems better to check only
"node_memory_types[nid].memtype == NULL" here.  Because for all nodes
with N_CPU in memory_tier_init(), "node_memory_types[nid].memtype"
will be !NULL.  And it's possible (in theory) that some nodes become
"node_state(nid, N_CPU) == true" between memory_tier_init() and
memory_tier_late_init().

Otherwise, Looks good to me.  Feel free to add

Reviewed-by: "Huang, Ying" 

in the future version.
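
That is, roughly (a sketch of the suggested simplification):

	for_each_node_state(nid, N_MEMORY)
		if (node_memory_types[nid].memtype == NULL)
			set_node_memory_tier(nid);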

> + /*
> +  * Some device drivers may have initialized memory tiers
> +  * between `memory_tier_init()` and 
> `memory_tier_late_init()`,
> +  * potentially bringing online memory nodes and
> +  * configuring memory tiers. Exclude them here.
> +  */
> + set_node_memory_tier(nid);
> +
> + establish_demotion_targets();
> + mutex_unlock(&memory_tier_lock);
> +
> + return 0;
> +}
> +late_initcall(memory_tier_late_init);
> +

[snip]

--
Best Regards,
Huang, Ying



Re: [PATCH v8 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-28 Thread Huang, Ying
"Ho-Ren (Jack) Chuang"  writes:

> The current implementation treats emulated memory devices, such as
> CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
> (E820_TYPE_RAM). However, these emulated devices have different
> characteristics than traditional DRAM, making it important to
> distinguish them. Thus, we modify the tiered memory initialization process
> to introduce a delay specifically for CPUless NUMA nodes. This delay
> ensures that the memory tier initialization for these nodes is deferred
> until HMAT information is obtained during the boot process. Finally,
> demotion tables are recalculated at the end.
>
> * late_initcall(memory_tier_late_init);
> Some device drivers may have initialized memory tiers between
> `memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
> online memory nodes and configuring memory tiers. They should be excluded
> in the late init.
>
> * Handle cases where there is no HMAT when creating memory tiers
> There is a scenario where a CPUless node does not provide HMAT information.
> If no HMAT is specified, it falls back to using the default DRAM tier.
>
> * Introduce another new lock `default_dram_perf_lock` for adist calculation
> In the current implementation, iterating through CPUlist nodes requires
> holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
> trying to acquire the same lock, leading to a potential deadlock.
> Therefore, we propose introducing a standalone `default_dram_perf_lock` to
> protect `default_dram_perf_*`. This approach not only avoids deadlock
> but also prevents holding a large lock simultaneously.
>
> * Upgrade `set_node_memory_tier` to support additional cases, including
>   default DRAM, late CPUless, and hot-plugged initializations.
> To cover hot-plugged memory nodes, `mt_calc_adistance()` and
> `mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
> handle cases where memtype is not initialized and where HMAT information is
> available.
>
> * Introduce `default_memory_types` for those memory types that are not
>   initialized by device drivers.
> Because late initialized memory and default DRAM memory need to be managed,
> a default memory type is created for storing all memory types that are
> not initialized by device drivers and as a fallback.
>
> Signed-off-by: Ho-Ren (Jack) Chuang 
> Signed-off-by: Hao Xiang 
> Reviewed-by: "Huang, Ying" 
> ---
>  mm/memory-tiers.c | 94 +++
>  1 file changed, 78 insertions(+), 16 deletions(-)
>
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 974af10cfdd8..e24fc3bebae4 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -36,6 +36,11 @@ struct node_memory_type_map {
>  
>  static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
> +/*
> + * The list is used to store all memory types that are not created
> + * by a device driver.
> + */
> +static LIST_HEAD(default_memory_types);
>  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>  struct memory_dev_type *default_dram_type;
>  
> @@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
>  
>  static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
>  
> +/* The lock is used to protect `default_dram_perf*` info and nid. */
> +static DEFINE_MUTEX(default_dram_perf_lock);
>  static bool default_dram_perf_error;
>  static struct access_coordinate default_dram_perf;
>  static int default_dram_perf_ref_nid = NUMA_NO_NODE;
> @@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, 
> struct memory_dev_type *mem
>  static struct memory_tier *set_node_memory_tier(int node)
>  {
>   struct memory_tier *memtier;
> - struct memory_dev_type *memtype;
> + struct memory_dev_type *mtype = default_dram_type;
> + int adist = MEMTIER_ADISTANCE_DRAM;
>   pg_data_t *pgdat = NODE_DATA(node);
>  
>  
> @@ -514,11 +522,20 @@ static struct memory_tier *set_node_memory_tier(int 
> node)
>   if (!node_state(node, N_MEMORY))
>   return ERR_PTR(-EINVAL);
>  
> - __init_node_memory_type(node, default_dram_type);
> + mt_calc_adistance(node, &adist);
> + if (node_memory_types[node].memtype == NULL) {
> + mtype = mt_find_alloc_memory_type(adist, &default_memory_types);
> + if (IS_ERR(mtype)) {
> + mtype = default_dram_type;
> + pr_info("Failed to allocate a memory type. Fall 
> back.\n");
> + }
> + }
> +
> + __init_node_memory_type(node, mtype);
>  
> - memtype = node_memory_types[no

Re: [PATCH v11 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-04-09 Thread Huang, Ying
"Ho-Ren (Jack) Chuang"  writes:

> On Fri, Apr 5, 2024 at 7:03 AM Jonathan Cameron
>  wrote:
>>
>> On Fri,  5 Apr 2024 00:07:06 +
>> "Ho-Ren (Jack) Chuang"  wrote:
>>
>> > The current implementation treats emulated memory devices, such as
>> > CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
>> > (E820_TYPE_RAM). However, these emulated devices have different
>> > characteristics than traditional DRAM, making it important to
>> > distinguish them. Thus, we modify the tiered memory initialization process
>> > to introduce a delay specifically for CPUless NUMA nodes. This delay
>> > ensures that the memory tier initialization for these nodes is deferred
>> > until HMAT information is obtained during the boot process. Finally,
>> > demotion tables are recalculated at the end.
>> >
>> > * late_initcall(memory_tier_late_init);
>> > Some device drivers may have initialized memory tiers between
>> > `memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
>> > online memory nodes and configuring memory tiers. They should be excluded
>> > in the late init.
>> >
>> > * Handle cases where there is no HMAT when creating memory tiers
>> > There is a scenario where a CPUless node does not provide HMAT information.
>> > If no HMAT is specified, it falls back to using the default DRAM tier.
>> >
>> > * Introduce another new lock `default_dram_perf_lock` for adist calculation
>> > In the current implementation, iterating through CPUlist nodes requires
>> > holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
>> > trying to acquire the same lock, leading to a potential deadlock.
>> > Therefore, we propose introducing a standalone `default_dram_perf_lock` to
>> > protect `default_dram_perf_*`. This approach not only avoids deadlock
>> > but also prevents holding a large lock simultaneously.
>> >
>> > * Upgrade `set_node_memory_tier` to support additional cases, including
>> >   default DRAM, late CPUless, and hot-plugged initializations.
>> > To cover hot-plugged memory nodes, `mt_calc_adistance()` and
>> > `mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
>> > handle cases where memtype is not initialized and where HMAT information is
>> > available.
>> >
>> > * Introduce `default_memory_types` for those memory types that are not
>> >   initialized by device drivers.
>> > Because late initialized memory and default DRAM memory need to be managed,
>> > a default memory type is created for storing all memory types that are
>> > not initialized by device drivers and as a fallback.
>> >
>> > Signed-off-by: Ho-Ren (Jack) Chuang 
>> > Signed-off-by: Hao Xiang 
>> > Reviewed-by: "Huang, Ying" 
>>
>> Hi - one remaining question. Why can't we delay init for all nodes
>> to either drivers or your fallback late_initcall code.
>> It would be nice to reduce possible code paths.
>
> I try not to change too much of the existing code structure in
> this patchset.
>
> To me, postponing/moving all memory tier registrations to
> late_initcall() is another possible action item for the next patchset.
>
> After tier_mem(), hmat_init() is called, which requires registering
> `default_dram_type` info. This is when `default_dram_type` is needed.
> However, it is indeed possible to postpone the latter part,
> set_node_memory_tier(), to `late_init(). So, memory_tier_init() can
> indeed be split into two parts, and the latter part can be moved to
> late_initcall() to be processed together.

I don't think that it's good to move all memory_tier initialization in
drivers to late_initcall().  It's natural to keep them at
device_initcall() level.

If so, we can allocate default_dram_type in memory_tier_init(), and call
set_node_memory_tier() only in memory_tier_late_init().  We can call
memory_tier_late_init() at device_initcall() level too.
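
Roughly like below (an illustrative sketch only, reusing names from the
patch under review; details such as the exact initcall levels are open):

	static int __init memory_tier_init(void)
	{
		/* Early init: only set up the default DRAM type. */
		default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
							      &default_memory_types);
		if (IS_ERR(default_dram_type))
			panic("%s() failed to allocate default DRAM tier\n", __func__);
		return 0;
	}

	static int __init memory_tier_late_init(void)
	{
		int nid;

		/* Late init: per-node tier setup after drivers have registered. */
		mutex_lock(&memory_tier_lock);
		for_each_node_state(nid, N_MEMORY)
			if (node_memory_types[nid].memtype == NULL)
				set_node_memory_tier(nid);
		establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		return 0;
	}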

--
Best Regards,
Huang, Ying

> Doing this all memory-type drivers have to call late_initcall() to
> register a memory tier. I’m not sure how many they are?
>
> What do you guys think?
>
>>
>> Jonathan
>>
>>
>> > ---
>> >  mm/memory-tiers.c | 94 +++
>> >  1 file changed, 70 insertions(+), 24 deletions(-)
>> >
>> > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> > index 516b144fd45a..6632102bd5c9 100644
>> > --- a/mm/memory-tiers.c
>> &

Re: [PATCH v5 1/2] mm/memory_hotplug: split memmap_on_memory requests across memblocks

2023-10-07 Thread Huang, Ying
Vishal Verma  writes:

> The MHP_MEMMAP_ON_MEMORY flag for hotplugged memory is restricted to
> 'memblock_size' chunks of memory being added. Adding a larger span of
> memory precludes memmap_on_memory semantics.
>
> For users of hotplug such as kmem, large amounts of memory might get
> added from the CXL subsystem. In some cases, this amount may exceed the
> available 'main memory' to store the memmap for the memory being added.
> In this case, it is useful to have a way to place the memmap on the
> memory being added, even if it means splitting the addition into
> memblock-sized chunks.
>
> Change add_memory_resource() to loop over memblock-sized chunks of
> memory if caller requested memmap_on_memory, and if other conditions for
> it are met. Teach try_remove_memory() to also expect that a memory
> range being removed might have been split up into memblock sized chunks,
> and to loop through those as needed.
>
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Suggested-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 
> ---
>  mm/memory_hotplug.c | 162 
> 
>  1 file changed, 99 insertions(+), 63 deletions(-)
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index f8d3e7427e32..77ec6f15f943 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1380,6 +1380,44 @@ static bool mhp_supports_memmap_on_memory(unsigned 
> long size)
>   return arch_supports_memmap_on_memory(vmemmap_size);
>  }
>  
> +static int add_memory_create_devices(int nid, struct memory_group *group,
> +  u64 start, u64 size, mhp_t mhp_flags)
> +{
> + struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
> + struct vmem_altmap mhp_altmap = {
> + .base_pfn =  PHYS_PFN(start),
> + .end_pfn  =  PHYS_PFN(start + size - 1),
> + };
> + int ret;
> +
> + if ((mhp_flags & MHP_MEMMAP_ON_MEMORY)) {
> + mhp_altmap.free = memory_block_memmap_on_memory_pages();
> + params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
> + if (!params.altmap)
> + return -ENOMEM;
> +
> + memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
> + }
> +
> + /* call arch's memory hotadd */
> + ret = arch_add_memory(nid, start, size, ¶ms);
> + if (ret < 0)
> + goto error;
> +
> + /* create memory block devices after memory was added */
> + ret = create_memory_block_devices(start, size, params.altmap, group);
> + if (ret)
> + goto err_bdev;
> +
> + return 0;
> +
> +err_bdev:
> + arch_remove_memory(start, size, NULL);
> +error:
> + kfree(params.altmap);
> + return ret;
> +}
> +
>  /*
>   * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
>   * and online/offline operations (triggered e.g. by sysfs).
> @@ -1388,14 +1426,10 @@ static bool mhp_supports_memmap_on_memory(unsigned 
> long size)
>   */
>  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>  {
> - struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
> + unsigned long memblock_size = memory_block_size_bytes();
>   enum memblock_flags memblock_flags = MEMBLOCK_NONE;
> - struct vmem_altmap mhp_altmap = {
> - .base_pfn =  PHYS_PFN(res->start),
> - .end_pfn  =  PHYS_PFN(res->end),
> - };
>   struct memory_group *group = NULL;
> - u64 start, size;
> + u64 start, size, cur_start;
>   bool new_node = false;
>   int ret;
>  
> @@ -1436,28 +1470,21 @@ int __ref add_memory_resource(int nid, struct 
> resource *res, mhp_t mhp_flags)
>   /*
>* Self hosted memmap array
>*/
> - if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
> - if (mhp_supports_memmap_on_memory(size)) {
> - mhp_altmap.free = memory_block_memmap_on_memory_pages();
> - params.altmap = kmalloc(sizeof(struct vmem_altmap), 
> GFP_KERNEL);
> - if (!params.altmap)
> + if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
> + mhp_supports_memmap_on_memory(memblock_size)) {
> + for (cur_start = start; cur_start < start + size;
> +  cur_start += memblock_size) {
> + ret = add_memory_create_devices(nid, group, cur_start,
> + membloc

Re: [PATCH v5 2/2] dax/kmem: allow kmem to add memory with memmap_on_memory

2023-10-16 Thread Huang, Ying
"Verma, Vishal L"  writes:

> On Thu, 2023-10-05 at 14:16 -0700, Dan Williams wrote:
>> Vishal Verma wrote:
>> >
> <..>
>
>> > +
>> > +   rc = kstrtobool(buf, &val);
>> > +   if (rc)
>> > +   return rc;
>>
>> Perhaps:
>>
>> if (dev_dax->memmap_on_memory == val)
>> return len;
>>
>> ...and skip the check below when it is going to be a nop
>>
>> > +
>> > +   device_lock(dax_region->dev);
>> > +   if (!dax_region->dev->driver) {
>>
>> Is the polarity backwards here? I.e. if the device is already attached to
>> the kmem driver it is too late to modify memmap_on_memory policy.
>
> Hm this sounded logical until I tried it. After a reconfigure-device to
> devdax (i.e. detach kmem), I get the -EBUSY if I invert this check.

Can you try to unbind the device via sysfs by hand and retry?

--
Best Regards,
Huang, Ying

>>
>> > +   device_unlock(dax_region->dev);
>> > +   return -ENXIO;
>>

[snip]



Re: [PATCH v5 2/2] dax/kmem: allow kmem to add memory with memmap_on_memory

2023-10-16 Thread Huang, Ying
"Verma, Vishal L"  writes:

> On Tue, 2023-10-17 at 13:18 +0800, Huang, Ying wrote:
>> "Verma, Vishal L"  writes:
>>
>> > On Thu, 2023-10-05 at 14:16 -0700, Dan Williams wrote:
>> > > Vishal Verma wrote:
>> > > >
>> > <..>
>> >
>> > > > +
>> > > > +   rc = kstrtobool(buf, &val);
>> > > > +   if (rc)
>> > > > +   return rc;
>> > >
>> > > Perhaps:
>> > >
>> > > if (dev_dax->memmap_on_memory == val)
>> > > return len;
>> > >
>> > > ...and skip the check below when it is going to be a nop
>> > >
>> > > > +
>> > > > +   device_lock(dax_region->dev);
>> > > > +   if (!dax_region->dev->driver) {
>> > >
>> > > Is the polarity backwards here? I.e. if the device is already
>> > > attached to
>> > > the kmem driver it is too late to modify memmap_on_memory policy.
>> >
>> > Hm this sounded logical until I tried it. After a reconfigure-
>> > device to
>> > devdax (i.e. detach kmem), I get the -EBUSY if I invert this check.
>>
>> Can you try to unbind the device via sysfs by hand and retry?
>>
> I think what is happening maybe is while kmem gets detached, the device
> goes back to another dax driver (hmem in my tests). So either way, the
> check for if (driver) or if (!driver) won't distinguish between kmem
> vs. something else.
>
> Maybe we just remove this check? Or add an explicit kmem check somehow?

I think it's good to check kmem explicitly here.

--
Best Regards,
Huang, Ying



Re: [PATCH v6 2/3] mm/memory_hotplug: split memmap_on_memory requests across memblocks

2023-10-16 Thread Huang, Ying
Vishal Verma  writes:

> The MHP_MEMMAP_ON_MEMORY flag for hotplugged memory is restricted to
> 'memblock_size' chunks of memory being added. Adding a larger span of
> memory precludes memmap_on_memory semantics.
>
> For users of hotplug such as kmem, large amounts of memory might get
> added from the CXL subsystem. In some cases, this amount may exceed the
> available 'main memory' to store the memmap for the memory being added.
> In this case, it is useful to have a way to place the memmap on the
> memory being added, even if it means splitting the addition into
> memblock-sized chunks.
>
> Change add_memory_resource() to loop over memblock-sized chunks of
> memory if caller requested memmap_on_memory, and if other conditions for
> it are met. Teach try_remove_memory() to also expect that a memory
> range being removed might have been split up into memblock sized chunks,
> and to loop through those as needed.
>
> This does preclude being able to use PUD mappings in the direct map; a
> proposal to how this could be optimized in the future is laid out
> here[1].
>
> [1]: 
> https://lore.kernel.org/linux-mm/b6753402-2de9-25b2-36e9-eacd49752...@redhat.com/
>
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Suggested-by: David Hildenbrand 
> Reviewed-by: Dan Williams 
> Signed-off-by: Vishal Verma 
> ---
>  mm/memory_hotplug.c | 214 
> 
>  1 file changed, 148 insertions(+), 66 deletions(-)
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 6be7de9efa55..83e5ec377aad 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1380,6 +1380,43 @@ static bool mhp_supports_memmap_on_memory(unsigned 
> long size)
>   return arch_supports_memmap_on_memory(vmemmap_size);
>  }
>  
> +static int add_memory_create_devices(int nid, struct memory_group *group,
> +  u64 start, u64 size, mhp_t mhp_flags)
> +{
> + struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
> + struct vmem_altmap mhp_altmap = {
> + .base_pfn =  PHYS_PFN(start),
> + .end_pfn  =  PHYS_PFN(start + size - 1),
> + };
> + int ret;
> +
> + if ((mhp_flags & MHP_MEMMAP_ON_MEMORY)) {
> + mhp_altmap.free = memory_block_memmap_on_memory_pages();
> + params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
> + GFP_KERNEL);
> + if (!params.altmap)
> + return -ENOMEM;
> + }
> +
> + /* call arch's memory hotadd */
> + ret = arch_add_memory(nid, start, size, &params);
> + if (ret < 0)
> + goto error;
> +
> + /* create memory block devices after memory was added */
> + ret = create_memory_block_devices(start, size, params.altmap, group);
> + if (ret)
> + goto err_bdev;
> +
> + return 0;
> +
> +err_bdev:
> + arch_remove_memory(start, size, NULL);
> +error:
> + kfree(params.altmap);
> + return ret;
> +}
> +
>  /*
>   * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
>   * and online/offline operations (triggered e.g. by sysfs).
> @@ -1388,14 +1425,10 @@ static bool mhp_supports_memmap_on_memory(unsigned 
> long size)
>   */
>  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
>  {
> - struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
> + unsigned long memblock_size = memory_block_size_bytes();
>   enum memblock_flags memblock_flags = MEMBLOCK_NONE;
> - struct vmem_altmap mhp_altmap = {
> - .base_pfn =  PHYS_PFN(res->start),
> - .end_pfn  =  PHYS_PFN(res->end),
> - };
>   struct memory_group *group = NULL;
> - u64 start, size;
> + u64 start, size, cur_start;
>   bool new_node = false;
>   int ret;
>  
> @@ -1436,28 +1469,21 @@ int __ref add_memory_resource(int nid, struct 
> resource *res, mhp_t mhp_flags)
>   /*
>* Self hosted memmap array
>*/
> - if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
> - if (mhp_supports_memmap_on_memory(size)) {
> - mhp_altmap.free = memory_block_memmap_on_memory_pages();
> - params.altmap = kmemdup(&mhp_altmap,
> - sizeof(struct vmem_altmap),
> - GFP_KERNEL);
> - 

Re: [PATCH v7 2/3] mm/memory_hotplug: split memmap_on_memory requests across memblocks

2023-10-29 Thread Huang, Ying
Vishal Verma  writes:

> The MHP_MEMMAP_ON_MEMORY flag for hotplugged memory is restricted to
> 'memblock_size' chunks of memory being added. Adding a larger span of
> memory precludes memmap_on_memory semantics.
>
> For users of hotplug such as kmem, large amounts of memory might get
> added from the CXL subsystem. In some cases, this amount may exceed the
> available 'main memory' to store the memmap for the memory being added.
> In this case, it is useful to have a way to place the memmap on the
> memory being added, even if it means splitting the addition into
> memblock-sized chunks.
>
> Change add_memory_resource() to loop over memblock-sized chunks of
> memory if caller requested memmap_on_memory, and if other conditions for
> it are met. Teach try_remove_memory() to also expect that a memory
> range being removed might have been split up into memblock sized chunks,
> and to loop through those as needed.
>
> This does preclude being able to use PUD mappings in the direct map; a
> proposal to how this could be optimized in the future is laid out
> here[1].
>
> [1]: 
> https://lore.kernel.org/linux-mm/b6753402-2de9-25b2-36e9-eacd49752...@redhat.com/
>
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Suggested-by: David Hildenbrand 
> Reviewed-by: Dan Williams 
> Signed-off-by: Vishal Verma 
> ---
>  mm/memory_hotplug.c | 209 
> 
>  1 file changed, 144 insertions(+), 65 deletions(-)
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 6be7de9efa55..b97035193090 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1380,6 +1380,48 @@ static bool mhp_supports_memmap_on_memory(unsigned 
> long size)
>   return arch_supports_memmap_on_memory(vmemmap_size);
>  }
>  
> +static int create_altmaps_and_memory_blocks(int nid, struct memory_group 
> *group,
> + u64 start, u64 size)
> +{
> + unsigned long memblock_size = memory_block_size_bytes();
> + u64 cur_start;
> + int ret;
> +
> + for (cur_start = start; cur_start < start + size;
> +  cur_start += memblock_size) {
> + struct mhp_params params = { .pgprot =
> +  pgprot_mhp(PAGE_KERNEL) };
> + struct vmem_altmap mhp_altmap = {
> + .base_pfn = PHYS_PFN(cur_start),
> + .end_pfn = PHYS_PFN(cur_start + memblock_size - 1),
> + };
> +
> + mhp_altmap.free = memory_block_memmap_on_memory_pages();
> + params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
> + GFP_KERNEL);
> + if (!params.altmap)
> + return -ENOMEM;
> +
> + /* call arch's memory hotadd */
> + ret = arch_add_memory(nid, cur_start, memblock_size, &params);
> + if (ret < 0) {
> + kfree(params.altmap);

Should we call

remove_memory_blocks_and_altmaps(start, cur_start - start);

here to clean up resources?
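
Something along these lines (untested sketch on top of this patch;
remove_memory_blocks_and_altmaps() being the helper mentioned above):

	for (cur_start = start; cur_start < start + size;
	     cur_start += memblock_size) {
		/* mhp_params / mhp_altmap setup as above */

		ret = arch_add_memory(nid, cur_start, memblock_size, &params);
		if (ret < 0) {
			kfree(params.altmap);
			goto out;
		}

		/* create memory block devices after memory was added */
		ret = create_memory_block_devices(cur_start, memblock_size,
						  params.altmap, group);
		if (ret) {
			arch_remove_memory(cur_start, memblock_size, NULL);
			kfree(params.altmap);
			goto out;
		}
	}

	return 0;

out:
	/* tear down the memblocks that were added before the failure */
	if (cur_start != start)
		remove_memory_blocks_and_altmaps(start, cur_start - start);
	return ret;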

--
Best Regards,
Huang, Ying

> + return ret;
> + }
> +
> + /* create memory block devices after memory was added */
> + ret = create_memory_block_devices(cur_start, memblock_size,
> +   params.altmap, group);
> + if (ret) {
> + arch_remove_memory(cur_start, memblock_size, NULL);
> + kfree(params.altmap);
> + return ret;
> + }
> + }
> +
> + return 0;
> +}
> +
>  /*
>   * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
>   * and online/offline operations (triggered e.g. by sysfs).
> @@ -1390,10 +1432,6 @@ int __ref add_memory_resource(int nid, struct resource 
> *res, mhp_t mhp_flags)
>  {
>   struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>   enum memblock_flags memblock_flags = MEMBLOCK_NONE;
> - struct vmem_altmap mhp_altmap = {
> - .base_pfn =  PHYS_PFN(res->start),
> - .end_pfn  =  PHYS_PFN(res->end),
> - };
>   struct memory_group *group = NULL;
>   u64 start, size;
>   bool new_node = false;
> @@ -1436,28 +1474,22 @@ int __ref add_memory_resource(int nid, struct 
> resource *res, mhp_t mhp_flags)
>   /*
>* Self hosted memmap array
>*/
> - if (mhp_flags & MHP_MEMMAP_ON_MEMORY

Re: [PATCH v7 3/3] dax/kmem: allow kmem to add memory with memmap_on_memory

2023-10-29 Thread Huang, Ying
Vishal Verma  writes:

> Large amounts of memory managed by the kmem driver may come in via CXL,
> and it is often desirable to have the memmap for this memory on the new
> memory itself.
>
> Enroll kmem-managed memory for memmap_on_memory semantics if the dax
> region originates via CXL. For non-CXL dax regions, retain the existing
> default behavior of hot adding without memmap_on_memory semantics.
>
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 

LGTM, Thanks!

Reviewed-by: "Huang, Ying" 

> ---
>  drivers/dax/bus.h | 1 +
>  drivers/dax/dax-private.h | 1 +
>  drivers/dax/bus.c | 3 +++
>  drivers/dax/cxl.c | 1 +
>  drivers/dax/hmem/hmem.c   | 1 +
>  drivers/dax/kmem.c| 8 +++-
>  drivers/dax/pmem.c| 1 +
>  7 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
> index 1ccd23360124..cbbf64443098 100644
> --- a/drivers/dax/bus.h
> +++ b/drivers/dax/bus.h
> @@ -23,6 +23,7 @@ struct dev_dax_data {
>   struct dev_pagemap *pgmap;
>   resource_size_t size;
>   int id;
> + bool memmap_on_memory;
>  };
>  
>  struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
> diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
> index 27cf2d79..446617b73aea 100644
> --- a/drivers/dax/dax-private.h
> +++ b/drivers/dax/dax-private.h
> @@ -70,6 +70,7 @@ struct dev_dax {
>   struct ida ida;
>   struct device dev;
>   struct dev_pagemap *pgmap;
> + bool memmap_on_memory;
>   int nr_range;
>   struct dev_dax_range {
>   unsigned long pgoff;
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 0ee96e6fc426..ad9f821b8c78 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -367,6 +367,7 @@ static ssize_t create_store(struct device *dev, struct 
> device_attribute *attr,
>   .dax_region = dax_region,
>   .size = 0,
>   .id = -1,
> + .memmap_on_memory = false,
>   };
>   struct dev_dax *dev_dax = devm_create_dev_dax(&data);
>  
> @@ -1400,6 +1401,8 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
> *data)
>   dev_dax->align = dax_region->align;
>   ida_init(&dev_dax->ida);
>  
> + dev_dax->memmap_on_memory = data->memmap_on_memory;
> +
>   inode = dax_inode(dax_dev);
>   dev->devt = inode->i_rdev;
>   dev->bus = &dax_bus_type;
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 8bc9d04034d6..c696837ab23c 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
> @@ -26,6 +26,7 @@ static int cxl_dax_region_probe(struct device *dev)
>   .dax_region = dax_region,
>   .id = -1,
>   .size = range_len(&cxlr_dax->hpa_range),
> + .memmap_on_memory = true,
>   };
>  
>   return PTR_ERR_OR_ZERO(devm_create_dev_dax(&data));
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 5d2ddef0f8f5..b9da69f92697 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -36,6 +36,7 @@ static int dax_hmem_probe(struct platform_device *pdev)
>   .dax_region = dax_region,
>   .id = -1,
>   .size = region_idle ? 0 : range_len(&mri->range),
> + .memmap_on_memory = false,
>   };
>  
>   return PTR_ERR_OR_ZERO(devm_create_dev_dax(&data));
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index c57acb73e3db..0aa6c45a4e5a 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -12,6 +12,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "dax-private.h"
>  #include "bus.h"
>  
> @@ -56,6 +57,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   unsigned long total_len = 0;
>   struct dax_kmem_data *data;
>   int i, rc, mapped = 0;
> + mhp_t mhp_flags;
>   int numa_node;
>  
>   /*
> @@ -136,12 +138,16 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>*/
>   res->flags = IORESOURCE_SYSTEM_RAM;
>  
> + mhp_flags = MHP_NID_IS_MGID;
> + if (dev_dax->memmap_on_memory)
> + mhp_flags |= MHP_MEMMAP_ON_MEMORY;
> +
>   /*
>* Ensure t

Re: [PATCH v8 2/3] mm/memory_hotplug: split memmap_on_memory requests across memblocks

2023-11-01 Thread Huang, Ying
Vishal Verma  writes:

> The MHP_MEMMAP_ON_MEMORY flag for hotplugged memory is restricted to
> 'memblock_size' chunks of memory being added. Adding a larger span of
> memory precludes memmap_on_memory semantics.
>
> For users of hotplug such as kmem, large amounts of memory might get
> added from the CXL subsystem. In some cases, this amount may exceed the
> available 'main memory' to store the memmap for the memory being added.
> In this case, it is useful to have a way to place the memmap on the
> memory being added, even if it means splitting the addition into
> memblock-sized chunks.
>
> Change add_memory_resource() to loop over memblock-sized chunks of
> memory if caller requested memmap_on_memory, and if other conditions for
> it are met. Teach try_remove_memory() to also expect that a memory
> range being removed might have been split up into memblock sized chunks,
> and to loop through those as needed.
>
> This does preclude being able to use PUD mappings in the direct map; a
> proposal to how this could be optimized in the future is laid out
> here[1].
>
> [1]: 
> https://lore.kernel.org/linux-mm/b6753402-2de9-25b2-36e9-eacd49752...@redhat.com/
>
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Suggested-by: David Hildenbrand 
> Reviewed-by: Dan Williams 
> Signed-off-by: Vishal Verma 
> ---
>  mm/memory_hotplug.c | 213 
> ++--
>  1 file changed, 138 insertions(+), 75 deletions(-)
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 6be7de9efa55..d242e49d7f7b 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1380,6 +1380,84 @@ static bool mhp_supports_memmap_on_memory(unsigned 
> long size)
>   return arch_supports_memmap_on_memory(vmemmap_size);
>  }
>  
> +static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
> +{
> + unsigned long memblock_size = memory_block_size_bytes();
> + u64 cur_start;
> +
> + /*
> +  * For memmap_on_memory, the altmaps were added on a per-memblock
> +  * basis; we have to process each individual memory block.
> +  */
> + for (cur_start = start; cur_start < start + size;
> +  cur_start += memblock_size) {
> + struct vmem_altmap *altmap = NULL;
> + struct memory_block *mem;
> +
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start)));
> + WARN_ON_ONCE(!mem);
> + if (!mem)
> + continue;
> +
> + altmap = mem->altmap;
> + mem->altmap = NULL;
> +
> + remove_memory_block_devices(cur_start, memblock_size);
> +
> + arch_remove_memory(cur_start, memblock_size, altmap);
> +
> + /* Verify that all vmemmap pages have actually been freed. */
> + WARN(altmap->alloc, "Altmap not fully unmapped");
> + kfree(altmap);
> + }
> +}
> +
> +static int create_altmaps_and_memory_blocks(int nid, struct memory_group 
> *group,
> + u64 start, u64 size)
> +{
> + unsigned long memblock_size = memory_block_size_bytes();
> + u64 cur_start;
> + int ret;
> +
> + for (cur_start = start; cur_start < start + size;
> +  cur_start += memblock_size) {
> + struct mhp_params params = { .pgprot =
> +  pgprot_mhp(PAGE_KERNEL) };
> + struct vmem_altmap mhp_altmap = {
> + .base_pfn = PHYS_PFN(cur_start),
> + .end_pfn = PHYS_PFN(cur_start + memblock_size - 1),
> + };
> +
> + mhp_altmap.free = memory_block_memmap_on_memory_pages();
> + params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
> + GFP_KERNEL);
> + if (!params.altmap)
> + return -ENOMEM;

Use "goto out" here too?

> +
> + /* call arch's memory hotadd */
> + ret = arch_add_memory(nid, cur_start, memblock_size, &params);
> + if (ret < 0) {
> + kfree(params.altmap);
> + goto out;
> + }
> +
> + /* create memory block devices after memory was added */
> + ret = create_memory_block_devices(cur_start, memblock_size,
> +   params.altmap, group);
> + if (ret) {
> +

Re: [PATCH v9 2/3] mm/memory_hotplug: split memmap_on_memory requests across memblocks

2023-11-02 Thread Huang, Ying
Vishal Verma  writes:

> The MHP_MEMMAP_ON_MEMORY flag for hotplugged memory is restricted to
> 'memblock_size' chunks of memory being added. Adding a larger span of
> memory precludes memmap_on_memory semantics.
>
> For users of hotplug such as kmem, large amounts of memory might get
> added from the CXL subsystem. In some cases, this amount may exceed the
> available 'main memory' to store the memmap for the memory being added.
> In this case, it is useful to have a way to place the memmap on the
> memory being added, even if it means splitting the addition into
> memblock-sized chunks.
>
> Change add_memory_resource() to loop over memblock-sized chunks of
> memory if caller requested memmap_on_memory, and if other conditions for
> it are met. Teach try_remove_memory() to also expect that a memory
> range being removed might have been split up into memblock sized chunks,
> and to loop through those as needed.
>
> This does preclude being able to use PUD mappings in the direct map; a
> proposal to how this could be optimized in the future is laid out
> here[1].
>
> [1]: 
> https://lore.kernel.org/linux-mm/b6753402-2de9-25b2-36e9-eacd49752...@redhat.com/
>
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Suggested-by: David Hildenbrand 
> Reviewed-by: Dan Williams 
> Acked-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 

LGTM, Thanks!

Reviewed-by: "Huang, Ying" 

> ---
>  mm/memory_hotplug.c | 210 
> ++--
>  1 file changed, 136 insertions(+), 74 deletions(-)
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 6be7de9efa55..b380675ab932 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1380,6 +1380,85 @@ static bool mhp_supports_memmap_on_memory(unsigned 
> long size)
>   return arch_supports_memmap_on_memory(vmemmap_size);
>  }
>  
> +static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
> +{
> + unsigned long memblock_size = memory_block_size_bytes();
> + u64 cur_start;
> +
> + /*
> +  * For memmap_on_memory, the altmaps were added on a per-memblock
> +  * basis; we have to process each individual memory block.
> +  */
> + for (cur_start = start; cur_start < start + size;
> +  cur_start += memblock_size) {
> + struct vmem_altmap *altmap = NULL;
> + struct memory_block *mem;
> +
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start)));
> + if (WARN_ON_ONCE(!mem))
> + continue;
> +
> + altmap = mem->altmap;
> + mem->altmap = NULL;
> +
> + remove_memory_block_devices(cur_start, memblock_size);
> +
> + arch_remove_memory(cur_start, memblock_size, altmap);
> +
> + /* Verify that all vmemmap pages have actually been freed. */
> + WARN(altmap->alloc, "Altmap not fully unmapped");
> + kfree(altmap);
> + }
> +}
> +
> +static int create_altmaps_and_memory_blocks(int nid, struct memory_group 
> *group,
> + u64 start, u64 size)
> +{
> + unsigned long memblock_size = memory_block_size_bytes();
> + u64 cur_start;
> + int ret;
> +
> + for (cur_start = start; cur_start < start + size;
> +  cur_start += memblock_size) {
> + struct mhp_params params = { .pgprot =
> +  pgprot_mhp(PAGE_KERNEL) };
> + struct vmem_altmap mhp_altmap = {
> + .base_pfn = PHYS_PFN(cur_start),
> + .end_pfn = PHYS_PFN(cur_start + memblock_size - 1),
> + };
> +
> + mhp_altmap.free = memory_block_memmap_on_memory_pages();
> + params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
> + GFP_KERNEL);
> + if (!params.altmap) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + /* call arch's memory hotadd */
> + ret = arch_add_memory(nid, cur_start, memblock_size, &params);
> + if (ret < 0) {
> + kfree(params.altmap);
> + goto out;
> + }
> +
> + /* create memory block devices after memory was added */
> + ret = create_memory_block_devices(cur_start, memblock_size,
> + 

Re: [PATCH v2 2/2] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-07 Thread Huang, Ying
Vishal Verma  writes:

> Add a sysfs knob for dax devices to control the memmap_on_memory setting
> if the dax device were to be hotplugged as system memory.
>
> The default memmap_on_memory setting for dax devices originating via
> pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
> preserve legacy behavior. For dax devices via CXL, the default is on.
> The sysfs control allows the administrator to override the above
> defaults if needed.
>
> Cc: David Hildenbrand 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/dax/bus.c   | 40 
> +
>  Documentation/ABI/testing/sysfs-bus-dax | 13 +++
>  2 files changed, 53 insertions(+)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 1ff1ab5fa105..11abb57cc031 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -1270,6 +1270,45 @@ static ssize_t numa_node_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(numa_node);
>  
> +static ssize_t memmap_on_memory_show(struct device *dev,
> +  struct device_attribute *attr, char *buf)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> +
> + return sprintf(buf, "%d\n", dev_dax->memmap_on_memory);
> +}
> +
> +static ssize_t memmap_on_memory_store(struct device *dev,
> +   struct device_attribute *attr,
> +   const char *buf, size_t len)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + struct dax_region *dax_region = dev_dax->region;
> + ssize_t rc;
> + bool val;
> +
> + rc = kstrtobool(buf, &val);
> + if (rc)
> + return rc;
> +
> + if (dev_dax->memmap_on_memory == val)
> + return len;
> +
> + device_lock(dax_region->dev);
> + if (!dax_region->dev->driver) {

This still doesn't look right.  Can we check whether the current driver
is kmem?  And only allow change if it's not kmem?
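
That is, something like the following (sketch only; it assumes some way to
recognize the kmem driver, e.g. a type field in struct dax_device_driver):

	device_lock(dev);
	if (dev->driver &&
	    to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE) {
		/* too late, the device may already be hotplugged as RAM */
		device_unlock(dev);
		return -EBUSY;
	}
	dev_dax->memmap_on_memory = val;
	device_unlock(dev);

	return len;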

--
Best Regards,
Huang, Ying

> + device_unlock(dax_region->dev);
> + return -ENXIO;
> + }
> +
> + device_lock(dev);
> + dev_dax->memmap_on_memory = val;
> + device_unlock(dev);
> +
> + device_unlock(dax_region->dev);
> + return rc == 0 ? len : rc;
> +}
> +static DEVICE_ATTR_RW(memmap_on_memory);
> +
>  static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, 
> int n)
>  {
>   struct device *dev = container_of(kobj, struct device, kobj);
> @@ -1296,6 +1335,7 @@ static struct attribute *dev_dax_attributes[] = {
>   &dev_attr_align.attr,
>   &dev_attr_resource.attr,
>   &dev_attr_numa_node.attr,
> + &dev_attr_memmap_on_memory.attr,
>   NULL,
>  };
>  
> diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
> b/Documentation/ABI/testing/sysfs-bus-dax
> index a61a7b186017..bb063a004e41 100644
> --- a/Documentation/ABI/testing/sysfs-bus-dax
> +++ b/Documentation/ABI/testing/sysfs-bus-dax
> @@ -149,3 +149,16 @@ KernelVersion:   v5.1
>  Contact: nvd...@lists.linux.dev
>  Description:
>   (RO) The id attribute indicates the region id of a dax region.
> +
> +What:/sys/bus/dax/devices/daxX.Y/memmap_on_memory
> +Date:October, 2023
> +KernelVersion:   v6.8
> +Contact: nvd...@lists.linux.dev
> +Description:
> + (RW) Control the memmap_on_memory setting if the dax device
> + were to be hotplugged as system memory. This determines whether
> + the 'altmap' for the hotplugged memory will be placed on the
> + device being hotplugged (memmap_on_memory=1) or if it will be
> + placed on regular memory (memmap_on_memory=0). This attribute
> + must be set before the device is handed over to the 'kmem'
> + driver (i.e.  hotplugged into system-ram).



Re: [PATCH v3 2/2] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-11 Thread Huang, Ying
Vishal Verma  writes:

> Add a sysfs knob for dax devices to control the memmap_on_memory setting
> if the dax device were to be hotplugged as system memory.
>
> The default memmap_on_memory setting for dax devices originating via
> pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
> preserve legacy behavior. For dax devices via CXL, the default is on.
> The sysfs control allows the administrator to override the above
> defaults if needed.
>
> Cc: David Hildenbrand 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Tested-by: Li Zhijian 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/dax/bus.c   | 47 
> +
>  Documentation/ABI/testing/sysfs-bus-dax | 17 
>  2 files changed, 64 insertions(+)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 1ff1ab5fa105..2871e5188f0d 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -1270,6 +1270,52 @@ static ssize_t numa_node_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(numa_node);
>  
> +static ssize_t memmap_on_memory_show(struct device *dev,
> +  struct device_attribute *attr, char *buf)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> +
> + return sprintf(buf, "%d\n", dev_dax->memmap_on_memory);
> +}
> +
> +static ssize_t memmap_on_memory_store(struct device *dev,
> +   struct device_attribute *attr,
> +   const char *buf, size_t len)
> +{
> + struct device_driver *drv = dev->driver;
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + struct dax_region *dax_region = dev_dax->region;
> + struct dax_device_driver *dax_drv = to_dax_drv(drv);
> + ssize_t rc;
> + bool val;
> +
> + rc = kstrtobool(buf, &val);
> + if (rc)
> + return rc;
> +
> + if (dev_dax->memmap_on_memory == val)
> + return len;
> +
> + device_lock(dax_region->dev);
> + if (!dax_region->dev->driver) {
> + device_unlock(dax_region->dev);
> + return -ENXIO;
> + }

I think that it should be OK to write to "memmap_on_memory" if no driver
is bound to the device.  We just need to avoid writing to it when the kmem
driver is bound.

--
Best Regards,
Huang, Ying

> +
> + if (dax_drv->type == DAXDRV_KMEM_TYPE) {
> + device_unlock(dax_region->dev);
> + return -EBUSY;
> + }
> +
> + device_lock(dev);
> + dev_dax->memmap_on_memory = val;
> + device_unlock(dev);
> +
> + device_unlock(dax_region->dev);
> + return len;
> +}
> +static DEVICE_ATTR_RW(memmap_on_memory);
> +
>  static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, 
> int n)
>  {
>   struct device *dev = container_of(kobj, struct device, kobj);
> @@ -1296,6 +1342,7 @@ static struct attribute *dev_dax_attributes[] = {
>   &dev_attr_align.attr,
>   &dev_attr_resource.attr,
>   &dev_attr_numa_node.attr,
> + &dev_attr_memmap_on_memory.attr,
>   NULL,
>  };
>  
> diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
> b/Documentation/ABI/testing/sysfs-bus-dax
> index a61a7b186017..b1fd8bf8a7de 100644
> --- a/Documentation/ABI/testing/sysfs-bus-dax
> +++ b/Documentation/ABI/testing/sysfs-bus-dax
> @@ -149,3 +149,20 @@ KernelVersion:   v5.1
>  Contact: nvd...@lists.linux.dev
>  Description:
>   (RO) The id attribute indicates the region id of a dax region.
> +
> +What:/sys/bus/dax/devices/daxX.Y/memmap_on_memory
> +Date:October, 2023
> +KernelVersion:   v6.8
> +Contact: nvd...@lists.linux.dev
> +Description:
> + (RW) Control the memmap_on_memory setting if the dax device
> + were to be hotplugged as system memory. This determines whether
> + the 'altmap' for the hotplugged memory will be placed on the
> + device being hotplugged (memmap_on_memory=1) or if it will be
> + placed on regular memory (memmap_on_memory=0). This attribute
> + must be set before the device is handed over to the 'kmem'
> + driver (i.e.  hotplugged into system-ram). Additionally, this
> + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
> + memmap_on_memory parameter for memory_hotplug. This is
> + typically set on the kernel command line -
> + memory_hotplug.memmap_on_memory set to 'true' or 'force'."



Re: [PATCH v3 2/2] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-11 Thread Huang, Ying
"Verma, Vishal L"  writes:

> On Tue, 2023-12-12 at 08:30 +0800, Huang, Ying wrote:
>> Vishal Verma  writes:
>>
>> > Add a sysfs knob for dax devices to control the memmap_on_memory setting
>> > if the dax device were to be hotplugged as system memory.
>> >
>> > The default memmap_on_memory setting for dax devices originating via
>> > pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
>> > preserve legacy behavior. For dax devices via CXL, the default is on.
>> > The sysfs control allows the administrator to override the above
>> > defaults if needed.
>> >
>> > Cc: David Hildenbrand 
>> > Cc: Dan Williams 
>> > Cc: Dave Jiang 
>> > Cc: Dave Hansen 
>> > Cc: Huang Ying 
>> > Tested-by: Li Zhijian 
>> > Reviewed-by: Jonathan Cameron 
>> > Reviewed-by: David Hildenbrand 
>> > Signed-off-by: Vishal Verma 
>> > ---
>> >  drivers/dax/bus.c   | 47 
>> > +
>> >  Documentation/ABI/testing/sysfs-bus-dax | 17 
>> >  2 files changed, 64 insertions(+)
>> >
>> > diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
>> > index 1ff1ab5fa105..2871e5188f0d 100644
>> > --- a/drivers/dax/bus.c
>> > +++ b/drivers/dax/bus.c
>> > @@ -1270,6 +1270,52 @@ static ssize_t numa_node_show(struct device *dev,
>> >  }
>> >  static DEVICE_ATTR_RO(numa_node);
>> >
>> > +static ssize_t memmap_on_memory_show(struct device *dev,
>> > +struct device_attribute *attr, char 
>> > *buf)
>> > +{
>> > +   struct dev_dax *dev_dax = to_dev_dax(dev);
>> > +
>> > +   return sprintf(buf, "%d\n", dev_dax->memmap_on_memory);
>> > +}
>> > +
>> > +static ssize_t memmap_on_memory_store(struct device *dev,
>> > + struct device_attribute *attr,
>> > + const char *buf, size_t len)
>> > +{
>> > +   struct device_driver *drv = dev->driver;
>> > +   struct dev_dax *dev_dax = to_dev_dax(dev);
>> > +   struct dax_region *dax_region = dev_dax->region;
>> > +   struct dax_device_driver *dax_drv = to_dax_drv(drv);
>> > +   ssize_t rc;
>> > +   bool val;
>> > +
>> > +   rc = kstrtobool(buf, &val);
>> > +   if (rc)
>> > +   return rc;
>> > +
>> > +   if (dev_dax->memmap_on_memory == val)
>> > +   return len;
>> > +
>> > +   device_lock(dax_region->dev);
>> > +   if (!dax_region->dev->driver) {
>> > +   device_unlock(dax_region->dev);
>> > +   return -ENXIO;
>> > +   }
>>
>> I think that it should be OK to write to "memmap_on_memory" if no driver
>> is bound to the device.  We just need to avoid writing to it when the kmem
>> driver is bound.
>
> Oh this is just a check on the region driver, not for a dax driver
> being bound to the device. It's the same as what things like
> align_store(), size_store() etc. do for dax device reconfiguration.

Sorry, I misunderstood it.

> That said, it might be okay to remove this check, as this operation
> doesn't change any attributes of the dax region (the other interfaces I
> mentioned above can affect regions, so we want to lock the region
> device). If removing the check, we'd drop the region lock acquisition
> as well.

This sounds good to me.

And is it necessary to check the driver type with device_lock() held?  Can the
driver be changed between the check and taking the lock?

--
Best Regards,
Huang, Ying



Re: [PATCH v4 3/3] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-12 Thread Huang, Ying
Vishal Verma  writes:

> Add a sysfs knob for dax devices to control the memmap_on_memory setting
> if the dax device were to be hotplugged as system memory.
>
> The default memmap_on_memory setting for dax devices originating via
> pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
> preserve legacy behavior. For dax devices via CXL, the default is on.
> The sysfs control allows the administrator to override the above
> defaults if needed.
>
> Cc: David Hildenbrand 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Tested-by: Li Zhijian 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/dax/bus.c   | 32 
>  Documentation/ABI/testing/sysfs-bus-dax | 17 +
>  2 files changed, 49 insertions(+)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index ce1356ac6dc2..423adee6f802 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -1245,6 +1245,37 @@ static ssize_t numa_node_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(numa_node);
>  
> +static ssize_t memmap_on_memory_show(struct device *dev,
> +  struct device_attribute *attr, char *buf)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> +
> + return sprintf(buf, "%d\n", dev_dax->memmap_on_memory);
> +}
> +
> +static ssize_t memmap_on_memory_store(struct device *dev,
> +   struct device_attribute *attr,
> +   const char *buf, size_t len)
> +{
> + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + ssize_t rc;
> + bool val;
> +
> + rc = kstrtobool(buf, &val);
> + if (rc)
> + return rc;
> +
> + guard(device)(dev);
> + if (dev_dax->memmap_on_memory != val &&
> + dax_drv->type == DAXDRV_KMEM_TYPE)

Should we check "dev->driver != NULL" here, and should we move

dax_drv = to_dax_drv(dev->driver);

here, with the device lock held?
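
That is, just to illustrate (untested):

	guard(device)(dev);
	if (dev->driver) {
		struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);

		if (dev_dax->memmap_on_memory != val &&
		    dax_drv->type == DAXDRV_KMEM_TYPE)
			return -EBUSY;
	}
	dev_dax->memmap_on_memory = val;

	return len;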

--
Best Regards,
Huang, Ying

> + return -EBUSY;
> + dev_dax->memmap_on_memory = val;
> +
> + return len;
> +}
> +static DEVICE_ATTR_RW(memmap_on_memory);
> +
>  static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, 
> int n)
>  {
>   struct device *dev = container_of(kobj, struct device, kobj);
> @@ -1271,6 +1302,7 @@ static struct attribute *dev_dax_attributes[] = {
>   &dev_attr_align.attr,
>   &dev_attr_resource.attr,
>   &dev_attr_numa_node.attr,
> + &dev_attr_memmap_on_memory.attr,
>   NULL,
>  };
>  
> diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
> b/Documentation/ABI/testing/sysfs-bus-dax
> index a61a7b186017..b1fd8bf8a7de 100644
> --- a/Documentation/ABI/testing/sysfs-bus-dax
> +++ b/Documentation/ABI/testing/sysfs-bus-dax
> @@ -149,3 +149,20 @@ KernelVersion:   v5.1
>  Contact: nvd...@lists.linux.dev
>  Description:
>   (RO) The id attribute indicates the region id of a dax region.
> +
> +What:/sys/bus/dax/devices/daxX.Y/memmap_on_memory
> +Date:October, 2023
> +KernelVersion:   v6.8
> +Contact: nvd...@lists.linux.dev
> +Description:
> + (RW) Control the memmap_on_memory setting if the dax device
> + were to be hotplugged as system memory. This determines whether
> + the 'altmap' for the hotplugged memory will be placed on the
> + device being hotplugged (memmap_on_memory=1) or if it will be
> + placed on regular memory (memmap_on_memory=0). This attribute
> + must be set before the device is handed over to the 'kmem'
> + driver (i.e.  hotplugged into system-ram). Additionally, this
> + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
> + memmap_on_memory parameter for memory_hotplug. This is
> + typically set on the kernel command line -
> + memory_hotplug.memmap_on_memory set to 'true' or 'force'."



Re: [PATCH v5 4/4] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-14 Thread Huang, Ying
Vishal Verma  writes:

> Add a sysfs knob for dax devices to control the memmap_on_memory setting
> if the dax device were to be hotplugged as system memory.
>
> The default memmap_on_memory setting for dax devices originating via
> pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
> preserve legacy behavior. For dax devices via CXL, the default is on.
> The sysfs control allows the administrator to override the above
> defaults if needed.
>
> Cc: David Hildenbrand 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Tested-by: Li Zhijian 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/dax/bus.c   | 38 
> +
>  Documentation/ABI/testing/sysfs-bus-dax | 17 +++
>  2 files changed, 55 insertions(+)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 6226de131d17..f4d3beec507c 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -1245,6 +1245,43 @@ static ssize_t numa_node_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(numa_node);
>  
> +static ssize_t memmap_on_memory_show(struct device *dev,
> +  struct device_attribute *attr, char *buf)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> +
> + return sprintf(buf, "%d\n", dev_dax->memmap_on_memory);
> +}
> +
> +static ssize_t memmap_on_memory_store(struct device *dev,
> +   struct device_attribute *attr,
> +   const char *buf, size_t len)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + struct dax_device_driver *dax_drv;
> + ssize_t rc;
> + bool val;
> +
> + rc = kstrtobool(buf, &val);
> + if (rc)
> + return rc;
> +
> + if (val == true && !mhp_supports_memmap_on_memory()) {
> + dev_dbg(dev, "memmap_on_memory is not available\n");
> + return -EOPNOTSUPP;
> + }
> +
> + guard(device)(dev);
> + dax_drv = to_dax_drv(dev->driver);

Although "struct driver" is the first member of "struct
dax_device_driver", I feel the code is fragile to depends on that.  Can
we check dev->driver directly instead?

--
Best Regards,
Huang, Ying

> + if (dax_drv && dev_dax->memmap_on_memory != val &&
> + dax_drv->type == DAXDRV_KMEM_TYPE)
> + return -EBUSY;
> + dev_dax->memmap_on_memory = val;
> +
> + return len;
> +}
> +static DEVICE_ATTR_RW(memmap_on_memory);
> +
>  static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, 
> int n)
>  {
>   struct device *dev = container_of(kobj, struct device, kobj);
> @@ -1271,6 +1308,7 @@ static struct attribute *dev_dax_attributes[] = {
>   &dev_attr_align.attr,
>   &dev_attr_resource.attr,
>   &dev_attr_numa_node.attr,
> + &dev_attr_memmap_on_memory.attr,
>   NULL,
>  };
>  
> diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
> b/Documentation/ABI/testing/sysfs-bus-dax
> index 6359f7bc9bf4..40d9965733b2 100644
> --- a/Documentation/ABI/testing/sysfs-bus-dax
> +++ b/Documentation/ABI/testing/sysfs-bus-dax
> @@ -134,3 +134,20 @@ KernelVersion:   v5.1
>  Contact: nvd...@lists.linux.dev
>  Description:
>   (RO) The id attribute indicates the region id of a dax region.
> +
> +What:/sys/bus/dax/devices/daxX.Y/memmap_on_memory
> +Date:October, 2023
> +KernelVersion:   v6.8
> +Contact: nvd...@lists.linux.dev
> +Description:
> + (RW) Control the memmap_on_memory setting if the dax device
> + were to be hotplugged as system memory. This determines whether
> + the 'altmap' for the hotplugged memory will be placed on the
> + device being hotplugged (memmap_on_memory=1) or if it will be
> + placed on regular memory (memmap_on_memory=0). This attribute
> + must be set before the device is handed over to the 'kmem'
> + driver (i.e.  hotplugged into system-ram). Additionally, this
> + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
> + memmap_on_memory parameter for memory_hotplug. This is
> + typically set on the kernel command line -
> + memory_hotplug.memmap_on_memory set to 'true' or 'force'."



Re: [PATCH v6 4/4] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-14 Thread Huang, Ying
Vishal Verma  writes:

> Add a sysfs knob for dax devices to control the memmap_on_memory setting
> if the dax device were to be hotplugged as system memory.
>
> The default memmap_on_memory setting for dax devices originating via
> pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
> preserve legacy behavior. For dax devices via CXL, the default is on.
> The sysfs control allows the administrator to override the above
> defaults if needed.
>
> Cc: David Hildenbrand 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Tested-by: Li Zhijian 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 

Looks good to me!  Thanks!

Reviewed-by: "Huang, Ying" 

> ---
>  drivers/dax/bus.c   | 36 
> +
>  Documentation/ABI/testing/sysfs-bus-dax | 17 
>  2 files changed, 53 insertions(+)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 6226de131d17..3622b3d1c0de 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -1245,6 +1245,41 @@ static ssize_t numa_node_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(numa_node);
>  
> +static ssize_t memmap_on_memory_show(struct device *dev,
> +  struct device_attribute *attr, char *buf)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> +
> + return sysfs_emit(buf, "%d\n", dev_dax->memmap_on_memory);
> +}
> +
> +static ssize_t memmap_on_memory_store(struct device *dev,
> +   struct device_attribute *attr,
> +   const char *buf, size_t len)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + ssize_t rc;
> + bool val;
> +
> + rc = kstrtobool(buf, &val);
> + if (rc)
> + return rc;
> +
> + if (val == true && !mhp_supports_memmap_on_memory()) {
> + dev_dbg(dev, "memmap_on_memory is not available\n");
> + return -EOPNOTSUPP;
> + }
> +
> + guard(device)(dev);
> + if (dev_dax->memmap_on_memory != val && dev->driver &&
> + to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE)
> + return -EBUSY;
> + dev_dax->memmap_on_memory = val;
> +
> + return len;
> +}
> +static DEVICE_ATTR_RW(memmap_on_memory);
> +
>  static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, 
> int n)
>  {
>   struct device *dev = container_of(kobj, struct device, kobj);
> @@ -1271,6 +1306,7 @@ static struct attribute *dev_dax_attributes[] = {
>   &dev_attr_align.attr,
>   &dev_attr_resource.attr,
>   &dev_attr_numa_node.attr,
> + &dev_attr_memmap_on_memory.attr,
>   NULL,
>  };
>  
> diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
> b/Documentation/ABI/testing/sysfs-bus-dax
> index 6359f7bc9bf4..b34266bfae49 100644
> --- a/Documentation/ABI/testing/sysfs-bus-dax
> +++ b/Documentation/ABI/testing/sysfs-bus-dax
> @@ -134,3 +134,20 @@ KernelVersion:   v5.1
>  Contact: nvd...@lists.linux.dev
>  Description:
>   (RO) The id attribute indicates the region id of a dax region.
> +
> +What:/sys/bus/dax/devices/daxX.Y/memmap_on_memory
> +Date:January, 2024
> +KernelVersion:   v6.8
> +Contact: nvd...@lists.linux.dev
> +Description:
> + (RW) Control the memmap_on_memory setting if the dax device
> + were to be hotplugged as system memory. This determines whether
> + the 'altmap' for the hotplugged memory will be placed on the
> + device being hotplugged (memmap_on_memory=1) or if it will be
> + placed on regular memory (memmap_on_memory=0). This attribute
> + must be set before the device is handed over to the 'kmem'
> + driver (i.e.  hotplugged into system-ram). Additionally, this
> + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
> + memmap_on_memory parameter for memory_hotplug. This is
> + typically set on the kernel command line -
> + memory_hotplug.memmap_on_memory set to 'true' or 'force'."



Re: [PATCH v3 2/4] swap: fix do_swap_page() race with swapoff

2021-04-20 Thread Huang, Ying
Miaohe Lin  writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1 CPU 2
> - -
> do_swap_page
>   if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
>   swap_readpage
> if (data_race(sis->flags & SWP_FS_OPS)) {
>   swapoff
> p->flags &= ~SWP_VALID;
> ..
> synchronize_rcu();
> ..

You have deleted the SWP_VALID and RCU solution in 1/4, so please revise this.

> p->swap_file = NULL;
> struct file *swap_file = sis->swap_file;
> struct address_space *mapping = swap_file->f_mapping;[oops!]
>
> Note that for the pages that are swapped in through swap cache, this isn't
> an issue. Because the page is locked, and the swap entry will be marked
> with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
> unlocked.
>
> Using current get/put_swap_device() to guard against concurrent swapoff for
> swap_readpage() looks terrible because swap_readpage() may take really long
> time. And this race may not be really pernicious because swapoff is usually
> done when system shutdown only. To reduce the performance overhead on the
> hot-path as much as possible, it appears we can use the percpu_ref to close
> this race window(as suggested by Huang, Ying).

This needs to be revised too, unless you squash 1/4 and 2/4.

> Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous 
> device")
> Reported-by: kernel test robot  (auto build test ERROR)
> Signed-off-by: Miaohe Lin 
> ---
>  include/linux/swap.h | 9 +
>  mm/memory.c  | 9 +
>  2 files changed, 18 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index c9e7fea10b83..46d51d058d05 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -527,6 +527,15 @@ static inline struct swap_info_struct 
> *swp_swap_info(swp_entry_t entry)
>   return NULL;
>  }
>  
> +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
> +{
> + return NULL;
> +}
> +
> +static inline void put_swap_device(struct swap_info_struct *si)
> +{
> +}
> +
>  #define swap_address_space(entry)(NULL)
>  #define get_nr_swap_pages()  0L
>  #define total_swap_pages 0L
> diff --git a/mm/memory.c b/mm/memory.c
> index 27014c3bde9f..7a2fe12cf641 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  {
>   struct vm_area_struct *vma = vmf->vma;
>   struct page *page = NULL, *swapcache;
> + struct swap_info_struct *si = NULL;
>   swp_entry_t entry;
>   pte_t pte;
>   int locked;
> @@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>   goto out;
>   }
>  
> + /* Prevent swapoff from happening to us. */
> + si = get_swap_device(entry);

There's

struct swap_info_struct *si = swp_swap_info(entry);

in do_swap_page(); you can remove that.

Best Regards,
Huang, Ying

> + if (unlikely(!si))
> + goto out;
>  
>   delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
>   page = lookup_swap_cache(entry, vma, vmf->address);
> @@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  unlock:
>   pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out:
> + if (si)
> + put_swap_device(si);
>   return ret;
>  out_nomap:
>   pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>   unlock_page(swapcache);
>   put_page(swapcache);
>   }
> + if (si)
> + put_swap_device(si);
>   return ret;
>  }


Re: [PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()

2021-04-20 Thread Huang, Ying
Miaohe Lin  writes:

> The non_swap_entry() was used for working with VMA based swap readahead
> via commit ec560175c0b6 ("mm, swap: VMA based swap readahead").

At that time, the non_swap_entry() check was necessary because the
function was called before that check was done in do_swap_page().

> Then it's
> moved to swap_ra_info() since commit eaf649ebc3ac ("mm: swap: clean up swap
> readahead").

After that, the non_swap_entry() check became unnecessary, because
swap_ra_info() is now called after non_swap_entry() has already been
checked.  The resulting code is confusing.

> But this makes the code confusing. The non_swap_entry() check
> looks racy because while we released the pte lock, somebody else might have
> faulted in this pte. So we should check whether it's swap pte first to
> guard against such race or swap_type will be unexpected.

The race isn't important because it will not cause any problem.

Best Regards,
Huang, Ying

> But the swap_entry
> isn't used in this function and we will have enough checking when we really
> operate the PTE entries later. So checking for non_swap_entry() is not
> really needed here and should be removed to avoid confusion.
>
> Signed-off-by: Miaohe Lin 
> ---
>  mm/swap_state.c | 6 --
>  1 file changed, 6 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 272ea2108c9d..df5405384520 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  {
>   struct vm_area_struct *vma = vmf->vma;
>   unsigned long ra_val;
> - swp_entry_t entry;
>   unsigned long faddr, pfn, fpfn;
>   unsigned long start, end;
>   pte_t *pte, *orig_pte;
> @@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>  
>   faddr = vmf->address;
>   orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
> - entry = pte_to_swp_entry(*pte);
> - if ((unlikely(non_swap_entry(entry)))) {
> - pte_unmap(orig_pte);
> - return;
> - }
>  
>   fpfn = PFN_DOWN(faddr);
>   ra_val = GET_SWAP_RA_VAL(vma);


Re: [PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-20 Thread Huang, Ying
Miaohe Lin  writes:

> When I was investigating the swap code, I found the below possible race
> window:
>
> CPU 1 CPU 2
> - -
> shmem_swapin
>   swap_cluster_readahead
> if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>   swapoff
> percpu_ref_kill(&p->users)
> synchronize_rcu()
> wait_for_completion

I don't think the above 3 lines are relevant to the race.

> ..
> si->swap_file = NULL;
> struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>
> Close this race window by using get/put_swap_device() to guard against
> concurrent swapoff.
>
> Fixes: 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or 
> not")
> Signed-off-by: Miaohe Lin 
> ---
>  mm/shmem.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 26c76b13ad23..936ba5595297 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
> vm_area_struct *vma)
>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>   struct shmem_inode_info *info, pgoff_t index)
>  {
> + struct swap_info_struct *si;
>   struct vm_area_struct pvma;
>   struct page *page;
>   struct vm_fault vmf = {
>   .vma = &pvma,
>   };
>  
> + /* Prevent swapoff from happening to us. */
> + si = get_swap_device(swap);

It would be better to put get/put_swap_device() in shmem_swapin_page(); that
would make it possible for us to remove get/put_swap_device() from
lookup_swap_cache().
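
That is, something like the following in shmem_swapin_page() (sketch only; the
exact error code and placement are placeholders):

	struct swap_info_struct *si;

	/* Prevent swapoff from happening to us. */
	si = get_swap_device(swap);
	if (unlikely(!si))
		return -EINVAL;		/* placeholder error code */

	/* existing swap cache lookup / shmem_swapin() logic here */

	put_swap_device(si);		/* on every return path */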

Best Regards,
Huang, Ying

> + if (unlikely(!si))
> + return NULL;
>   shmem_pseudo_vma_init(&pvma, info, index);
>   page = swap_cluster_readahead(swap, gfp, &vmf);
>   shmem_pseudo_vma_destroy(&pvma);
> + put_swap_device(si);
>  
>   return page;
>  }


Re: [PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff

2021-04-20 Thread Huang, Ying
> -
> - rcu_read_lock();
> - if (data_race(!(si->flags & SWP_VALID)))
> - goto unlock_out;
> + if (!percpu_ref_tryget_live(&si->users))
> + goto out;
> + /*
> +  * Guarantee the si->users are checked before accessing other
> +  * fields of swap_info_struct.
> +  *
> +  * Paired with the spin_unlock() after setup_swap_info() in
> +  * enable_swap_info().
> +  */
> + smp_rmb();
>   offset = swp_offset(entry);
>   if (offset >= si->max)
> - goto unlock_out;
> + goto put_out;
>  
>   return si;
>  bad_nofile:
>   pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>  out:
>   return NULL;
> -unlock_out:
> - rcu_read_unlock();
> +put_out:
> + percpu_ref_put(&si->users);
>   return NULL;
>  }
>  
> @@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, 
> int prio,
>  
>  static void _enable_swap_info(struct swap_info_struct *p)
>  {
> - p->flags |= SWP_WRITEOK | SWP_VALID;
> + p->flags |= SWP_WRITEOK;
>   atomic_long_add(p->pages, &nr_swap_pages);
>   total_swap_pages += p->pages;
>  
> @@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct 
> *p, int prio,
>   spin_unlock(&p->lock);
>   spin_unlock(&swap_lock);
>   /*
> -  * Guarantee swap_map, cluster_info, etc. fields are valid
> -  * between get/put_swap_device() if SWP_VALID bit is set
> +  * Finished initialized swap device, now it's safe to reference it.

s/initialized/initializing/

Otherwise looks good to me!  Thanks!

Reviewed-by: "Huang, Ying" 

>*/
> - synchronize_rcu();
> + percpu_ref_resurrect(&p->users);
>   spin_lock(&swap_lock);
>   spin_lock(&p->lock);
>   _enable_swap_info(p);
> @@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
> specialfile)
>  
>   reenable_swap_slots_cache_unlock();
>  
> - spin_lock(&swap_lock);
> - spin_lock(&p->lock);
> - p->flags &= ~SWP_VALID; /* mark swap device as invalid */
> - spin_unlock(&p->lock);
> - spin_unlock(&swap_lock);
>   /*
> -  * wait for swap operations protected by get/put_swap_device()
> -  * to complete
> +  * Wait for swap operations protected by get/put_swap_device()
> +  * to complete.
> +  *
> +  * We need synchronize_rcu() here to protect the accessing to
> +  * the swap cache data structure.
>*/
> + percpu_ref_kill(&p->users);
>   synchronize_rcu();
> + wait_for_completion(&p->comp);
>  
>   flush_work(&p->discard_work);
>  
> @@ -2857,6 +2865,12 @@ static struct swap_info_struct *alloc_swap_info(void)
>   if (!p)
>   return ERR_PTR(-ENOMEM);
>  
> + if (percpu_ref_init(&p->users, swap_users_ref_free,
> + PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
> + kvfree(p);
> + return ERR_PTR(-ENOMEM);
> + }
> +
>   spin_lock(&swap_lock);
>   for (type = 0; type < nr_swapfiles; type++) {
>   if (!(swap_info[type]->flags & SWP_USED))
> @@ -2864,6 +2878,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>   }
>   if (type >= MAX_SWAPFILES) {
>   spin_unlock(&swap_lock);
> + percpu_ref_exit(&p->users);
>   kvfree(p);
>   return ERR_PTR(-EPERM);
>   }
> @@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
>   plist_node_init(&p->avail_lists[i], 0);
>   p->flags = SWP_USED;
>   spin_unlock(&swap_lock);
> - kvfree(defer);
> + if (defer) {
> + percpu_ref_exit(&defer->users);
> + kvfree(defer);
> + }
>   spin_lock_init(&p->lock);
>   spin_lock_init(&p->cont_lock);
> + init_completion(&p->comp);
>  
>   return p;
>  }
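
For reference, the lifecycle this patch establishes can be condensed into the
following sequence (a schematic only, with locking and error handling elided;
it assumes swap_users_ref_free() completes p->comp, as the wait below implies):

/* swapon: alloc_swap_info() starts the ref DEAD, so get_swap_device()
 * fails until the device is fully initialized. */
percpu_ref_init(&p->users, swap_users_ref_free,
		PERCPU_REF_INIT_DEAD, GFP_KERNEL);
init_completion(&p->comp);

/* enable_swap_info(): publish swap_map, cluster_info, etc., then allow
 * new references.  The spin_unlock() after setup_swap_info() provides
 * the barrier paired with the smp_rmb() in get_swap_device(). */
percpu_ref_resurrect(&p->users);

/* readers: get_swap_device() / put_swap_device(). */
if (percpu_ref_tryget_live(&p->users)) {
	/* ... access p->swap_map and friends ... */
	percpu_ref_put(&p->users);
}

/* swapoff: forbid new references, then wait for existing users. */
percpu_ref_kill(&p->users);
synchronize_rcu();			/* still protects swap cache accesses */
wait_for_completion(&p->comp);		/* completed from swap_users_ref_free() */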


Re: [PATCH 1/3] mm/memory_hotplug: Allow an override for the memmap_on_memory param

2023-06-15 Thread Huang, Ying
Hi, Vishal,

Thanks for your patch!

Vishal Verma  writes:

> For memory hotplug to consider MHP_MEMMAP_ON_MEMORY behavior, the
> 'memmap_on_memory' module parameter was a hard requirement.
>
> In preparation for the dax/kmem driver to use memmap_on_memory
> semantics, arrange for the module parameter check to be bypassed via the
> appropriate mhp_flag.
>
> Recall that the kmem driver could contribute huge amounts of hotplugged
> memory originating from special purpose devices such as CXL memory
> expanders. In some cases memmap_on_memory may be the /only/ way this new
> memory can be hotplugged. Hence it makes sense for kmem to have a way to
> force memmap_on_memory without depending on a module param, if all the
> other conditions for it are met.
>
> The only other user of this interface is acpi/acpi_memoryhotplug.c,
> which only enables the mhp_flag if an initial
> mhp_supports_memmap_on_memory() test passes. Maintain the existing
> behavior and semantics for this by performing the initial check from
> acpi without the MHP_MEMMAP_ON_MEMORY flag, so its decision falls back
> to the module parameter.
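
Before the hunks below, a tiny schematic of how the module parameter and the
new flag are meant to combine (an illustration only; the helper name is made
up and it assumes the existing mhp_memmap_on_memory() wrapper around the
module parameter, it is not the actual hunk):

/*
 * Hypothetical helper: the memmap is placed on the hotplugged memory
 * either when the module parameter enables it globally or when the
 * caller forces it via MHP_MEMMAP_ON_MEMORY, e.g. dax/kmem.
 */
static bool memmap_on_memory_requested(mhp_t mhp_flags)
{
	return mhp_memmap_on_memory() ||
	       (mhp_flags & MHP_MEMMAP_ON_MEMORY);
}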
>
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Signed-off-by: Vishal Verma 
> ---
>  include/linux/memory_hotplug.h |  2 +-
>  drivers/acpi/acpi_memhotplug.c |  2 +-
>  mm/memory_hotplug.c| 24 
>  3 files changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 9fcbf5706595..c9ddcd3cad70 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -358,7 +358,7 @@ extern struct zone *zone_for_pfn_range(int online_type, 
> int nid,
>  extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
> struct mhp_params *params);
>  void arch_remove_linear_mapping(u64 start, u64 size);
> -extern bool mhp_supports_memmap_on_memory(unsigned long size);
> +extern bool mhp_supports_memmap_on_memory(unsigned long size, mhp_t 
> mhp_flags);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  #endif /* __LINUX_MEMORY_HOTPLUG_H */
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index 24f662d8bd39..119d3bb49753 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -211,7 +211,7 @@ static int acpi_memory_enable_device(struct 
> acpi_memory_device *mem_device)
>   if (!info->length)
>   continue;
>  
> - if (mhp_supports_memmap_on_memory(info->length))
> + if (mhp_supports_memmap_on_memory(info->length, 0))
>   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
>   result = __add_memory(mgid, info->start_addr, info->length,
> mhp_flags);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 8e0fa209d533..bb3845830922 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1283,15 +1283,21 @@ static int online_memory_block(struct memory_block 
> *mem, void *arg)
>   return device_online(&mem->dev);
>  }
>  
> -bool mhp_supports_memmap_on_memory(unsigned long size)
> +bool mhp_supports_memmap_on_memory(unsigned long size, mhp_t mhp_flags)
>  {
>   unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
>   unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
>   unsigned long remaining_size = size - vmemmap_size;
>  
>   /*
> -  * Besides having arch support and the feature enabled at runtime, we
> -  * need a few more assumptions to hold true:
> +  * The MHP_MEMMAP_ON_MEMORY flag indicates a caller that wants to force
> +  * memmap_on_memory (if other conditions are met), regardless of the
> +  * module parameter. drivers/dax/kmem.c is an example, where large
> +  * amounts of hotplug memory may come from, and the only option to
> +  * successfully online all of it is to place the memmap on this memory.
> +  *
> +  * Besides having arch support and the feature enabled at runtime or
> +  * via the mhp_flag, we need a few more assumptions to hold true:
>*
>  * a) We span a single memory block: memory onlining/offlining happens
>*in memory block granularity. We don't want the vmemmap of online
> @@ -1315,10 +1321,12 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
>*   altmap as an alternative source of memory, and we do not 
> exactly
>   

Re: [PATCH 3/3] dax/kmem: Always enroll hotplugged memory for memmap_on_memory

2023-06-15 Thread Huang, Ying
Vishal Verma  writes:

> With DAX memory regions originating from CXL memory expanders or
> NVDIMMs, the kmem driver may be hot-adding huge amounts of system memory
> on a system without enough 'regular' main memory to support the memmap
> for it. To avoid this, ensure that all kmem managed hotplugged memory is
> added with the MHP_MEMMAP_ON_MEMORY flag to place the memmap on the
> new memory region being hot added.
>
> To do this, call add_memory() in chunks of memory_block_size_bytes() as
> that is a requirement for memmap_on_memory. Additionally, use the
> mhp_flag to force the memmap_on_memory checks regardless of the
> respective module parameter setting.
>
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/dax/kmem.c | 49 -
>  1 file changed, 36 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index 7b36db6f1cbd..0751346193ef 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -12,6 +12,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "dax-private.h"
>  #include "bus.h"
>  
> @@ -105,6 +106,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   data->mgid = rc;
>  
>   for (i = 0; i < dev_dax->nr_range; i++) {
> + u64 cur_start, cur_len, remaining;
>   struct resource *res;
>   struct range range;
>  
> @@ -137,21 +139,42 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   res->flags = IORESOURCE_SYSTEM_RAM;
>  
>   /*
> -  * Ensure that future kexec'd kernels will not treat
> -  * this as RAM automatically.
> +  * Add memory in chunks of memory_block_size_bytes() so that
> +  * it is considered for MHP_MEMMAP_ON_MEMORY
> +  * @range has already been aligned to memory_block_size_bytes(),
> +  * so the following loop will always break it down cleanly.
>*/
> - rc = add_memory_driver_managed(data->mgid, range.start,
> - range_len(&range), kmem_name, MHP_NID_IS_MGID);
> + cur_start = range.start;
> + cur_len = memory_block_size_bytes();
> + remaining = range_len(&range);
> + while (remaining) {
> + mhp_t mhp_flags = MHP_NID_IS_MGID;
>  
> - if (rc) {
> - dev_warn(dev, "mapping%d: %#llx-%#llx memory add 
> failed\n",
> - i, range.start, range.end);
> - remove_resource(res);
> - kfree(res);
> - data->res[i] = NULL;
> - if (mapped)
> - continue;
> - goto err_request_mem;
> + if (mhp_supports_memmap_on_memory(cur_len,
> +   MHP_MEMMAP_ON_MEMORY))
> + mhp_flags |= MHP_MEMMAP_ON_MEMORY;
> + /*
> +  * Ensure that future kexec'd kernels will not treat
> +  * this as RAM automatically.
> +  */
> + rc = add_memory_driver_managed(data->mgid, cur_start,
> +cur_len, kmem_name,
> +mhp_flags);
> +
> + if (rc) {
> + dev_warn(dev,
> +  "mapping%d: %#llx-%#llx memory add 
> failed\n",
> +  i, cur_start, cur_start + cur_len - 1);
> + remove_resource(res);
> + kfree(res);
> + data->res[i] = NULL;
> + if (mapped)
> + continue;
> + goto err_request_mem;
> + }
> +
> +     cur_start += cur_len;
> + remaining -= cur_len;
>   }
>   mapped++;
>   }

It appears that we need to hot-remove memory at the granularity of
memory_block_size_bytes() too, according to try_remove_memory().  If so,
it seems better to allocate one dax_kmem_data.res[] element for each
memory block instead of one per dax region?

Best Regards,
Huang, Ying
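
One way to realize that suggestion, sketched roughly (hypothetical sizing
code, not a tested change; it assumes dax_kmem_data keeps a flexible res[]
array as it does today):

/*
 * Size data->res[] by memory blocks instead of by dax ranges, so each
 * memory_block_size_bytes() chunk added above can later be removed
 * (and its resource released) independently.
 */
unsigned long total_len = 0;
int i, nr_blocks;

for (i = 0; i < dev_dax->nr_range; i++)
	total_len += range_len(&dev_dax->ranges[i].range);

nr_blocks = total_len / memory_block_size_bytes();
data = kzalloc(struct_size(data, res, nr_blocks), GFP_KERNEL);
if (!data)
	return -ENOMEM;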



Re: [PATCH] memory tier: rename destroy_memory_type() to put_memory_type()

2023-07-05 Thread Huang, Ying
Miaohe Lin  writes:

> It appears that destroy_memory_type() isn't a very good name because
> we usually will not free the memory_type here. So rename it to a more
> appropriate name i.e. put_memory_type().
>
> Suggested-by: Huang, Ying 
> Signed-off-by: Miaohe Lin 

LGTM, Thanks!

Reviewed-by: "Huang, Ying" 

> ---
>  drivers/dax/kmem.c   | 4 ++--
>  include/linux/memory-tiers.h | 4 ++--
>  mm/memory-tiers.c| 6 +++---
>  3 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index 898ca9505754..c57acb73e3db 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -264,7 +264,7 @@ static int __init dax_kmem_init(void)
>   return rc;
>  
>  error_dax_driver:
> - destroy_memory_type(dax_slowmem_type);
> + put_memory_type(dax_slowmem_type);
>  err_dax_slowmem_type:
>   kfree_const(kmem_name);
>   return rc;
> @@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void)
>   dax_driver_unregister(&device_dax_kmem_driver);
>   if (!any_hotremove_failed)
>   kfree_const(kmem_name);
> - destroy_memory_type(dax_slowmem_type);
> + put_memory_type(dax_slowmem_type);
>  }
>  
>  MODULE_AUTHOR("Intel Corporation");
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index fc9647b1b4f9..437441cdf78f 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -33,7 +33,7 @@ struct memory_dev_type {
>  #ifdef CONFIG_NUMA
>  extern bool numa_demotion_enabled;
>  struct memory_dev_type *alloc_memory_type(int adistance);
> -void destroy_memory_type(struct memory_dev_type *memtype);
> +void put_memory_type(struct memory_dev_type *memtype);
>  void init_node_memory_type(int node, struct memory_dev_type *default_type);
>  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>  #ifdef CONFIG_MIGRATION
> @@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int 
> adistance)
>   return NULL;
>  }
>  
> -static inline void destroy_memory_type(struct memory_dev_type *memtype)
> +static inline void put_memory_type(struct memory_dev_type *memtype)
>  {
>  
>  }
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 1719fa3bcf02..c49ab03f49b1 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int adistance)
>  }
>  EXPORT_SYMBOL_GPL(alloc_memory_type);
>  
> -void destroy_memory_type(struct memory_dev_type *memtype)
> +void put_memory_type(struct memory_dev_type *memtype)
>  {
>   kref_put(&memtype->kref, release_memtype);
>  }
> -EXPORT_SYMBOL_GPL(destroy_memory_type);
> +EXPORT_SYMBOL_GPL(put_memory_type);
>  
>  void init_node_memory_type(int node, struct memory_dev_type *memtype)
>  {
> @@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct 
> memory_dev_type *memtype)
>*/
>   if (!node_memory_types[node].map_count) {
>   node_memory_types[node].memtype = NULL;
> - destroy_memory_type(memtype);
> + put_memory_type(memtype);
>   }
>   mutex_unlock(&memory_tier_lock);
>  }
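
For context, the typical pairing after the rename looks like this (a condensed
usage sketch modeled on the dax/kmem caller touched by this patch; the node
number is an arbitrary example):

struct memory_dev_type *memtype;
int nid = 1;	/* example node */

memtype = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE);
if (IS_ERR(memtype))
	return PTR_ERR(memtype);

init_node_memory_type(nid, memtype);	/* the node side takes its own kref */
/* ... node participates in a memory tier ... */
clear_node_memory_type(nid, memtype);	/* drops the per-node kref when unused */
put_memory_type(memtype);		/* drops the reference from alloc */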



[PATCH RESEND 0/4] memory tiering: calculate abstract distance based on ACPI HMAT

2023-07-20 Thread Huang Ying
We have the explicit memory tiers framework to manage systems with
multiple types of memory, e.g., DRAM in DIMM slots and CXL memory
devices.  In it, memory devices of the same kind are grouped into
memory types, which are then put into memory tiers.  To describe the
performance of a memory type, an abstract distance is defined, which
is in direct proportion to the memory latency and inversely
proportional to the memory bandwidth.  To keep the code as simple as
possible, a fixed abstract distance is used in dax/kmem to describe
slow memory such as Optane DCPMM.

To support more memory types, this series adds an abstract distance
calculation algorithm management mechanism, provides an algorithm
implementation based on ACPI HMAT, and uses the general abstract
distance calculation interface in the dax/kmem driver.  With that,
dax/kmem can support HBM (high bandwidth memory) in addition to the
original Optane DCPMM.

Changelog:

V1 (from RFC):

- Added some comments per Aneesh's comments, Thanks!

Best Regards,
Huang, Ying



[PATCH RESEND 2/4] acpi, hmat: refactor hmat_register_target_initiators()

2023-07-20 Thread Huang Ying
Previously, hmat_register_target_initiators() both calculated the
performance attributes and created the corresponding sysfs links and
files; it is called during memory onlining.

But now, to calculate the abstract distance of a memory target before
memory onlining, we need to calculate the performance attributes for
a memory target without creating sysfs links and files.

To do that, hmat_register_target_initiators() is refactored to make it
possible to calculate performance attributes separately.

Signed-off-by: "Huang, Ying" 
Cc: Aneesh Kumar K.V 
Cc: Wei Xu 
Cc: Alistair Popple 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: Davidlohr Bueso 
Cc: Johannes Weiner 
Cc: Jonathan Cameron 
Cc: Michal Hocko 
Cc: Yang Shi 
Cc: Rafael J Wysocki 
---
 drivers/acpi/numa/hmat.c | 81 +++-
 1 file changed, 30 insertions(+), 51 deletions(-)

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index bba268ecd802..2dee0098f1a9 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -582,28 +582,25 @@ static int initiators_to_nodemask(unsigned long *p_nodes)
return 0;
 }
 
-static void hmat_register_target_initiators(struct memory_target *target)
+static void hmat_update_target_attrs(struct memory_target *target,
+unsigned long *p_nodes, int access)
 {
-   static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
struct memory_initiator *initiator;
-   unsigned int mem_nid, cpu_nid;
+   unsigned int cpu_nid;
struct memory_locality *loc = NULL;
u32 best = 0;
-   bool access0done = false;
int i;
 
-   mem_nid = pxm_to_node(target->memory_pxm);
+   bitmap_zero(p_nodes, MAX_NUMNODES);
/*
-* If the Address Range Structure provides a local processor pxm, link
+* If the Address Range Structure provides a local processor pxm, set
 * only that one. Otherwise, find the best performance attributes and
-* register all initiators that match.
+* collect all initiators that match.
 */
if (target->processor_pxm != PXM_INVAL) {
cpu_nid = pxm_to_node(target->processor_pxm);
-   register_memory_node_under_compute_node(mem_nid, cpu_nid, 0);
-   access0done = true;
-   if (node_state(cpu_nid, N_CPU)) {
-   register_memory_node_under_compute_node(mem_nid, 
cpu_nid, 1);
+   if (access == 0 || node_state(cpu_nid, N_CPU)) {
+   set_bit(target->processor_pxm, p_nodes);
return;
}
}
@@ -617,47 +614,10 @@ static void hmat_register_target_initiators(struct 
memory_target *target)
 * We'll also use the sorting to prime the candidate nodes with known
 * initiators.
 */
-   bitmap_zero(p_nodes, MAX_NUMNODES);
list_sort(NULL, &initiators, initiator_cmp);
if (initiators_to_nodemask(p_nodes) < 0)
return;
 
-   if (!access0done) {
-   for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) {
-   loc = localities_types[i];
-   if (!loc)
-   continue;
-
-   best = 0;
-   list_for_each_entry(initiator, &initiators, node) {
-   u32 value;
-
-   if (!test_bit(initiator->processor_pxm, 
p_nodes))
-   continue;
-
-   value = hmat_initiator_perf(target, initiator,
-   loc->hmat_loc);
-   if (hmat_update_best(loc->hmat_loc->data_type, 
value, &best))
-   bitmap_clear(p_nodes, 0, 
initiator->processor_pxm);
-   if (value != best)
-   clear_bit(initiator->processor_pxm, 
p_nodes);
-   }
-   if (best)
-   hmat_update_target_access(target, 
loc->hmat_loc->data_type,
- best, 0);
-   }
-
-   for_each_set_bit(i, p_nodes, MAX_NUMNODES) {
-   cpu_nid = pxm_to_node(i);
-   register_memory_node_under_compute_node(mem_nid, 
cpu_nid, 0);
-   }
-   }
-
-   /* Access 1 ignores Generic Initiators */
-   bitmap_zero(p_nodes, MAX_NUMNODES);
-   if (initiators_to_nodemask(p_nodes) < 0)
-   return;
-
for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) {
loc = localities_types[i];
if (!loc)
@@ -667,7 +627,7 @@ static void hmat_register_target_initiators(struct 
memory_target *target)
list
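
The tail of the patch is cut off in this archive.  Roughly, the intent is that
the registration path becomes a thin consumer of the new helper, once per
access class, along these lines (a sketch of the shape, not the verbatim hunk):

static void hmat_register_target_initiators(struct memory_target *target)
{
	static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
	unsigned int mem_nid, cpu_nid;
	int i, access;

	mem_nid = pxm_to_node(target->memory_pxm);
	/*
	 * Compute the attributes and candidate initiators per access
	 * class with the helper; only the sysfs registration stays here.
	 */
	for (access = 0; access < 2; access++) {
		hmat_update_target_attrs(target, p_nodes, access);
		for_each_set_bit(i, p_nodes, MAX_NUMNODES) {
			cpu_nid = pxm_to_node(i);
			register_memory_node_under_compute_node(mem_nid,
								cpu_nid,
								access);
		}
	}
}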

[PATCH RESEND 1/4] memory tiering: add abstract distance calculation algorithms management

2023-07-20 Thread Huang Ying
The abstract distance may be calculated by various drivers, such as
ACPI HMAT, CXL CDAT, etc., while it may be used by various code which
hot-adds memory nodes, such as dax/kmem.  To decouple the algorithm
users from the providers, this patch implements an abstract distance
calculation algorithm management mechanism.  It provides an interface
for the providers to register their implementations, and an interface
for the users to calculate the abstract distance of a memory node.

Multiple algorithm implementations can cooperate by calculating the
abstract distance for different memory nodes.  The preference among
algorithm implementations can be specified via the priority
(notifier_block.priority).

Signed-off-by: "Huang, Ying" 
Cc: Aneesh Kumar K.V 
Cc: Wei Xu 
Cc: Alistair Popple 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: Davidlohr Bueso 
Cc: Johannes Weiner 
Cc: Jonathan Cameron 
Cc: Michal Hocko 
Cc: Yang Shi 
Cc: Rafael J Wysocki 
---
 include/linux/memory-tiers.h | 19 
 mm/memory-tiers.c| 59 
 2 files changed, 78 insertions(+)

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fc9647b1b4f9..c6429e624244 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 /*
  * Each tier cover a abstrace distance chunk size of 128
  */
@@ -36,6 +37,9 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void destroy_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+int register_mt_adistance_algorithm(struct notifier_block *nb);
+int unregister_mt_adistance_algorithm(struct notifier_block *nb);
+int mt_calc_adistance(int node, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -97,5 +101,20 @@ static inline bool node_is_toptier(int node)
 {
return true;
 }
+
+static inline int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+   return 0;
+}
+
+static inline int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+   return 0;
+}
+
+static inline int mt_calc_adistance(int node, int *adist)
+{
+   return NOTIFY_DONE;
+}
 #endif /* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index a516e303e304..1e55fbe2ad51 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -105,6 +106,8 @@ static int top_tier_adistance;
 static struct demotion_nodes *node_demotion __read_mostly;
 #endif /* CONFIG_MIGRATION */
 
+static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
+
 static inline struct memory_tier *to_memory_tier(struct device *device)
 {
return container_of(device, struct memory_tier, dev);
@@ -592,6 +595,62 @@ void clear_node_memory_type(int node, struct 
memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
+/**
+ * register_mt_adistance_algorithm() - Register memory tiering abstract 
distance algorithm
+ * @nb: The notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ *
+ * Every memory tiering abstract distance algorithm provider needs to
+ * register the algorithm with register_mt_adistance_algorithm().  To
+ * calculate the abstract distance for a specified memory node, the
+ * notifier function will be called unless some high priority
+ * algorithm has provided result.  The prototype of the notifier
+ * function is as follows,
+ *
+ *   int (*algorithm_notifier)(struct notifier_block *nb,
+ * unsigned long nid, void *data);
+ *
+ * Where "nid" specifies the memory node, "data" is the pointer to the
+ * returned abstract distance (that is, "int *adist").  If the
+ * algorithm provides the result, NOTIFY_STOP should be returned.
+ * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
+ * algorithm in the chain to provide the result.
+ */
+int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
+
+/**
+ * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract 
distance algorithm
+ * @nb: the notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ */
+int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);
+
+/**
+ * mt_calc_adistance() - Calculate abstract distance with registered algorithms
+ * @node: the node to calculate abstract distance for
+ * @adist: 
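
The kernel-doc for mt_calc_adistance() is cut off above.  As a usage
illustration of the notifier interface (a hypothetical provider;
demo_node_is_slow() and the chosen distance are made up for the example):

static int demo_calc_adistance(struct notifier_block *nb,
			       unsigned long nid, void *data)
{
	int *adist = data;

	/* Only handle nodes this (hypothetical) driver manages. */
	if (!demo_node_is_slow(nid))
		return NOTIFY_DONE;

	/* Report roughly half of DRAM performance. */
	*adist = MEMTIER_ADISTANCE_DRAM * 2;
	return NOTIFY_STOP;
}

static struct notifier_block demo_adist_nb = {
	.notifier_call = demo_calc_adistance,
};

A provider would then call register_mt_adistance_algorithm(&demo_adist_nb) at
init time and unregister_mt_adistance_algorithm() on teardown.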

[PATCH RESEND 3/4] acpi, hmat: calculate abstract distance with HMAT

2023-07-20 Thread Huang Ying
A memory tiering abstract distance calculation algorithm based on ACPI
HMAT is implemented.  The basic idea is as follows.

The performance attributes of the system default DRAM nodes are
recorded as the baseline; their abstract distance is
MEMTIER_ADISTANCE_DRAM.  Then, the abstract distance of a memory node
(target) is scaled relative to MEMTIER_ADISTANCE_DRAM based on the
ratio of the performance attributes of the node to those of the
default DRAM nodes.

Signed-off-by: "Huang, Ying" 
Cc: Aneesh Kumar K.V 
Cc: Wei Xu 
Cc: Alistair Popple 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: Davidlohr Bueso 
Cc: Johannes Weiner 
Cc: Jonathan Cameron 
Cc: Michal Hocko 
Cc: Yang Shi 
Cc: Rafael J Wysocki 
---
 drivers/acpi/numa/hmat.c | 138 ++-
 include/linux/memory-tiers.h |   2 +
 mm/memory-tiers.c|   2 +-
 3 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 2dee0098f1a9..306a912090f0 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u8 hmat_revision;
 static int hmat_disable __initdata;
@@ -759,6 +760,137 @@ static int hmat_callback(struct notifier_block *self,
return NOTIFY_OK;
 }
 
+static int hmat_adistance_disabled;
+static struct node_hmem_attrs default_dram_attrs;
+
+static void dump_hmem_attrs(struct node_hmem_attrs *attrs)
+{
+   pr_cont("read_latency: %u, write_latency: %u, read_bandwidth: %u, 
write_bandwidth: %u\n",
+   attrs->read_latency, attrs->write_latency,
+   attrs->read_bandwidth, attrs->write_bandwidth);
+}
+
+static void disable_hmat_adistance_algorithm(void)
+{
+   hmat_adistance_disabled = true;
+}
+
+static int hmat_init_default_dram_attrs(void)
+{
+   struct memory_target *target;
+   struct node_hmem_attrs *attrs;
+   int nid, pxm;
+   int nid_dram = NUMA_NO_NODE;
+
+   if (default_dram_attrs.read_latency +
+   default_dram_attrs.write_latency != 0)
+   return 0;
+
+   if (!default_dram_type)
+   return -EIO;
+
+   for_each_node_mask(nid, default_dram_type->nodes) {
+   pxm = node_to_pxm(nid);
+   target = find_mem_target(pxm);
+   if (!target)
+   continue;
+   attrs = &target->hmem_attrs[1];
+   if (nid_dram == NUMA_NO_NODE) {
+   if (attrs->read_latency + attrs->write_latency == 0 ||
+   attrs->read_bandwidth + attrs->write_bandwidth == 
0) {
+   pr_info("hmat: invalid hmem attrs for default 
DRAM node: %d,\n",
+   nid);
+   pr_info("  ");
+   dump_hmem_attrs(attrs);
+   pr_info("  disable hmat based abstract distance 
algorithm.\n");
+   disable_hmat_adistance_algorithm();
+   return -EIO;
+   }
+   nid_dram = nid;
+   default_dram_attrs = *attrs;
+   continue;
+   }
+
+   /*
+* The performance of all default DRAM nodes is expected
+* to be the same (that is, the variation is less than 10%).
+* And it will be used as base to calculate the abstract
+* distance of other memory nodes.
+*/
+   if (abs(attrs->read_latency - default_dram_attrs.read_latency) 
* 10 >
+   default_dram_attrs.read_latency ||
+   abs(attrs->write_latency - 
default_dram_attrs.write_latency) * 10 >
+   default_dram_attrs.write_latency ||
+   abs(attrs->read_bandwidth - 
default_dram_attrs.read_bandwidth) * 10 >
+   default_dram_attrs.read_bandwidth) {
+   pr_info("hmat: hmem attrs for DRAM nodes mismatch.\n");
+   pr_info("  node %d:", nid_dram);
+   dump_hmem_attrs(&default_dram_attrs);
+   pr_info("  node %d:", nid);
+   dump_hmem_attrs(attrs);
+   pr_info("  disable hmat based abstract distance 
algorithm.\n");
+   disable_hmat_adistance_algorithm();
+   return -EIO;
+   }
+   }
+
+   return 0;
+}
+
+static int hmat_calculate_adistance(struct notifier_block *self,
+   unsigned long nid, void *data)
+{
+   static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
+   struct memory_target *target;
+   struct node_hmem_attrs *attrs;
+   int *adist = data;
+   in
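
The remainder of hmat_calculate_adistance() is truncated above.  Schematically,
the scaling described in the changelog boils down to something like the
following (a simplified sketch with equal weighting of the four attributes and
zero-attribute checks elided; not the verbatim patch code):

/*
 * Scale MEMTIER_ADISTANCE_DRAM by the ratio of the target's latency
 * and bandwidth to the default DRAM baseline: higher latency and
 * lower bandwidth both increase the abstract distance.
 */
static int hmat_scale_adistance_sketch(struct node_hmem_attrs *attrs)
{
	s64 adist;

	adist  = div_s64((s64)attrs->read_latency * MEMTIER_ADISTANCE_DRAM,
			 default_dram_attrs.read_latency);
	adist += div_s64((s64)attrs->write_latency * MEMTIER_ADISTANCE_DRAM,
			 default_dram_attrs.write_latency);
	adist += div_s64((s64)default_dram_attrs.read_bandwidth *
			 MEMTIER_ADISTANCE_DRAM, attrs->read_bandwidth);
	adist += div_s64((s64)default_dram_attrs.write_bandwidth *
			 MEMTIER_ADISTANCE_DRAM, attrs->write_bandwidth);

	return (int)div_s64(adist, 4);	/* average the four components */
}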
