Note: Based on "mm/mpol: Use special PROT_NONE to migrate pages" but
        sufficiently different that the signed-off-bys were dropped

Combine our previous _PAGE_NUMA, mpol_misplaced and migrate_misplaced_page()
pieces into an effective migrate on fault scheme.

Note that (on x86) we rely on PROT_NONE pages being !present and avoid
the TLB flush from try_to_unmap(TTU_MIGRATION). This greatly improves the
page-migration performance.

Based-on-work-by: Peter Zijlstra <a.p.zijls...@chello.nl>
Signed-off-by: Mel Gorman <mgor...@suse.de>
---
 include/linux/huge_mm.h |    8 ++++----
 mm/huge_memory.c        |   32 +++++++++++++++++++++++++++++---
 mm/memory.c             |   29 +++++++++++++++++++++++++----
 3 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a13ebb1..406f81c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -160,8 +160,8 @@ static inline struct page *compound_trans_head(struct page 
*page)
        return page;
 }
 
-extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
-                                 pmd_t pmd, pmd_t *pmdp);
+extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct 
*vma,
+                               unsigned long addr, pmd_t pmd, pmd_t *pmdp);
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
@@ -200,8 +200,8 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
        return 0;
 }
 
-static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long 
addr,
-                                       pmd_t pmd, pmd_t *pmdp);
+static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
+                                       unsigned long addr, pmd_t pmd, pmd_t 
*pmdp);
 {
 }
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 92a64d2..1453c30 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -18,6 +18,7 @@
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+#include <linux/migrate.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -1018,16 +1019,39 @@ out:
 }
 
 /* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
-                               pmd_t pmd, pmd_t *pmdp)
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                               unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
-       struct page *page;
+       struct page *page = NULL;
+       unsigned long haddr = addr & HPAGE_PMD_MASK;
+       int target_nid;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
                goto out_unlock;
 
        page = pmd_page(pmd);
+       get_page(page);
+       spin_unlock(&mm->page_table_lock);
+
+       target_nid = mpol_misplaced(page, vma, haddr);
+       if (target_nid == -1)
+               goto clear_pmdnuma;
+
+       /*
+        * Due to lacking code to migrate thp pages, we'll split
+        * (which preserves the special PROT_NONE) and re-take the
+        * fault on the normal pages.
+        */
+       split_huge_page(page);
+       put_page(page);
+       return 0;
+
+clear_pmdnuma:
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(pmd, *pmdp)))
+               goto out_unlock;
+
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmdp, pmd);
        VM_BUG_ON(pmd_numa(*pmdp));
@@ -1035,6 +1059,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned 
long addr,
 
 out_unlock:
        spin_unlock(&mm->page_table_lock);
+       if (page)
+               put_page(page);
        return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 72092d8..fb46ef2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -3436,8 +3437,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, 
struct vm_area_struct *vma,
 int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
 {
-       struct page *page;
+       struct page *page = NULL;
        spinlock_t *ptl;
+       int current_nid, target_nid;
 
        /*
        * The "pte" at this point cannot be used safely without
@@ -3452,14 +3454,33 @@ int do_numa_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
        spin_lock(ptl);
        if (unlikely(!pte_same(*ptep, pte)))
                goto out_unlock;
-       pte = pte_mknonnuma(pte);
-       set_pte_at(mm, addr, ptep, pte);
+
        page = vm_normal_page(vma, addr, pte);
        BUG_ON(!page);
+
+       get_page(page);
+       current_nid = page_to_nid(page);
+       target_nid = mpol_misplaced(page, vma, addr);
+       if (target_nid == -1)
+               goto clear_pmdnuma;
+
+       pte_unmap_unlock(ptep, ptl);
+       migrate_misplaced_page(page, target_nid);
+       page = NULL;
+
+       ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       if (!pte_same(*ptep, pte))
+               goto out_unlock;
+
+clear_pmdnuma:
+       pte = pte_mknonnuma(pte);
+       set_pte_at(mm, addr, ptep, pte);
        update_mmu_cache(vma, addr, ptep);
 
 out_unlock:
        pte_unmap_unlock(ptep, ptl);
+       if (page)
+               put_page(page);
        return 0;
 }
 
@@ -3631,7 +3652,7 @@ retry:
                barrier();
                if (pmd_trans_huge(orig_pmd)) {
                        if (pmd_numa(*pmd))
-                               return do_huge_pmd_numa_page(mm, address,
+                               return do_huge_pmd_numa_page(mm, vma, address,
                                                             orig_pmd, pmd);
 
                        if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) 
{
-- 
1.7.9.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to