From: Zi Yan <z...@nvidia.com>

Add PUD-level TLB flush operations (pudp_clear_flush_young() and the
pudp_clear_flush_young_notify() wrapper) and teach page_vma_mapped_walk()
about PUD-mapped 1GB THPs, so rmap walkers such as page_referenced_one()
can handle them.

Signed-off-by: Zi Yan <z...@nvidia.com>
---
 arch/x86/include/asm/pgtable.h |  3 +++
 arch/x86/mm/pgtable.c          | 13 +++++++++++++
 include/asm-generic/pgtable.h  | 14 ++++++++++++++
 include/linux/mmu_notifier.h   | 13 +++++++++++++
 include/linux/rmap.h           |  1 +
 mm/page_vma_mapped.c           | 33 +++++++++++++++++++++++++++++----
 mm/rmap.c                      | 12 +++++++++---
 7 files changed, 82 insertions(+), 7 deletions(-)
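
Note for reviewers: the new pudp_clear_flush_young_notify() macro added to
include/linux/mmu_notifier.h is equivalent to the plain-C sketch below
(illustrative only; the "_sketch" name is not part of the patch):

	/*
	 * Clear the Accessed bit on a PUD-mapped THP, flush the TLB for
	 * the huge-PUD range if it was set, and also ask secondary MMUs
	 * (registered mmu notifiers, e.g. KVM) to clear and flush their
	 * young state for the same range.
	 */
	static inline int pudp_clear_flush_young_notify_sketch(
			struct vm_area_struct *vma, unsigned long address,
			pud_t *pudp)
	{
		int young = pudp_clear_flush_young(vma, address, pudp);

		young |= mmu_notifier_clear_flush_young(vma->vm_mm, address,
							address + PUD_SIZE);
		return young;
	}

page_referenced_one() calls it in the same spot where the pmd-mapped case
calls pmdp_clear_flush_young_notify(); see the mm/rmap.c hunk below.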

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ae3ac49c32ad..f99ce657d282 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1151,6 +1151,9 @@ extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
 
+#define __HAVE_ARCH_PUDP_CLEAR_YOUNG_FLUSH
+extern int pudp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pud_t *pudp);
 
 #define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0a5008690d7c..0edcfa8007cb 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -643,6 +643,19 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 
        return young;
 }
+int pudp_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pud_t *pudp)
+{
+       int young;
+
+       VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+
+       young = pudp_test_and_clear_young(vma, address, pudp);
+       if (young)
+               flush_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+
+       return young;
+}
 #endif
 
 /**
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 0f626d6177c3..682531e0d55c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -121,6 +121,20 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef __HAVE_ARCH_PUDP_CLEAR_YOUNG_FLUSH
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+extern int pudp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pud_t *pudp);
+#else
+int pudp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pud_t *pudp)
+{
+       BUILD_BUG();
+       return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD  */
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4050ec1c3b45..6850b9e9b2cb 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -353,6 +353,19 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
        __young;                                                        \
 })
 
+#define pudp_clear_flush_young_notify(__vma, __address, __pudp)	\
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = pudp_clear_flush_young(___vma, ___address, __pudp);   \
+       __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
+                                                 ___address,           \
+                                                 ___address +          \
+                                                       PUD_SIZE);      \
+       __young;                                                        \
+})
+
 #define ptep_clear_young_notify(__vma, __address, __ptep)              \
 ({                                                                     \
        int __young;                                                    \
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 988d176472df..2b566736e3c2 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -206,6 +206,7 @@ struct page_vma_mapped_walk {
        struct page *page;
        struct vm_area_struct *vma;
        unsigned long address;
+       pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 11df03e71288..a473553aa9a5 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -141,9 +141,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
        struct page *page = pvmw->page;
        pgd_t *pgd;
        p4d_t *p4d;
-       pud_t *pud;
+       pud_t pude;
        pmd_t pmde;
 
+       if (!pvmw->pte && !pvmw->pmd && pvmw->pud)
+               return not_found(pvmw);
+
        /* The only possible pmd mapping has been handled on last iteration */
        if (pvmw->pmd && !pvmw->pte)
                return not_found(pvmw);
@@ -171,10 +174,31 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
        p4d = p4d_offset(pgd, pvmw->address);
        if (!p4d_present(*p4d))
                return false;
-       pud = pud_offset(p4d, pvmw->address);
-       if (!pud_present(*pud))
+       pvmw->pud = pud_offset(p4d, pvmw->address);
+
+       /*
+        * Make sure the pud value isn't cached in a register by the
+        * compiler and used as a stale value after we've observed a
+        * subsequent update.
+        */
+       pude = READ_ONCE(*pvmw->pud);
+       if (pud_trans_huge(pude)) {
+               pvmw->ptl = pud_lock(mm, pvmw->pud);
+               if (likely(pud_trans_huge(*pvmw->pud))) {
+                       if (pvmw->flags & PVMW_MIGRATION)
+                               return not_found(pvmw);
+                       if (pud_page(*pvmw->pud) != page)
+                               return not_found(pvmw);
+                       return true;
+               } else {
+                       /* THP pud was split under us: handle on pmd level */
+                       spin_unlock(pvmw->ptl);
+                       pvmw->ptl = NULL;
+               }
+       } else if (!pud_present(pude))
                return false;
-       pvmw->pmd = pmd_offset(pud, pvmw->address);
+
+       pvmw->pmd = pmd_offset(pvmw->pud, pvmw->address);
        /*
         * Make sure the pmd value isn't cached in a register by the
         * compiler and used as a stale value after we've observed a
@@ -210,6 +234,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
        } else if (!pmd_present(pmde)) {
                return false;
        }
+
        if (!map_pte(pvmw))
                goto next_pte;
        while (1) {
diff --git a/mm/rmap.c b/mm/rmap.c
index dae66a4329ea..f69d81d4a956 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -789,9 +789,15 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
                                        referenced++;
                        }
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
-                       if (pmdp_clear_flush_young_notify(vma, address,
-                                               pvmw.pmd))
-                               referenced++;
+                       if (pvmw.pmd) {
+                               if (pmdp_clear_flush_young_notify(vma, address,
+                                                       pvmw.pmd))
+                                       referenced++;
+                       } else if (pvmw.pud) {
+                               if (pudp_clear_flush_young_notify(vma, address,
+                                                       pvmw.pud))
+                                       referenced++;
+                       }
                } else {
                        /* unexpected pmd-mapped page? */
                        WARN_ON_ONCE(1);
-- 
2.20.1
