The PMD page table sharing added here is based on (but somewhat
different from) what hugetlbfs does to share/unshare page tables:
on a fault in a shareable DAX mapping, pmd_share() walks the file's
other VMAs looking for one that already has a PMD page table covering
the corresponding range, points this mm's PUD at that table and takes
a reference on the PMD page; on teardown, unshare_huge_pmd() drops
that reference and clears only this mm's PUD entry, leaving the shared
table intact for the remaining users.
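
For illustration only (not part of this patch), a minimal userspace
sketch of the intended scenario: two processes map the same file on a
DAX filesystem so the second one can reuse the first one's PMD page
table. The path /mnt/pmem/data, the 2 * PUD_SIZE mapping length and
the assumption that the file is at least that large are hypothetical.

/*
 * Run this in two processes against the same fsdax file.  Mapping
 * 2 * PUD_SIZE guarantees that at least one fully PUD-aligned,
 * PUD_SIZE-sized range lies inside the VMA, which is what
 * vma_shareable() looks for.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#define PUD_SZ	(1UL << 30)	/* PUD_SIZE on x86-64 with 4K pages */

int main(void)
{
	int fd = open("/mnt/pmem/data", O_RDWR);
	char *p;

	if (fd < 0)
		return 1;

	/* MAP_SHARED mapping of a fsdax file: may_share_pmd() is true */
	p = mmap(NULL, 2 * PUD_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return 1;
	}

	p[0] = 1;		/* fault: __handle_mm_fault() -> pmd_share() */

	munmap(p, 2 * PUD_SZ);	/* zap: unshare_huge_pmd() if still shared */
	close(fd);
	return 0;
}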

Signed-off-by: Larry Bassel <larry.bas...@oracle.com>
---
 include/linux/hugetlb.h |   4 ++
 mm/huge_memory.c        |  37 +++++++++++++++++
 mm/hugetlb.c            |   8 ++--
 mm/memory.c             | 108 +++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 152 insertions(+), 5 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index edf476c..debff55 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -140,6 +140,10 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end);
+unsigned long page_table_shareable(struct vm_area_struct *svma,
+                                  struct vm_area_struct *vma,
+                                  unsigned long addr, pgoff_t idx);
+bool vma_shareable(struct vm_area_struct *vma, unsigned long addr);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
                              int write);
 struct page *follow_huge_pd(struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f8bce9..935874c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1751,6 +1751,33 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
        mm_dec_nr_ptes(mm);
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGE_PMD_SHARE
+static int unshare_huge_pmd(struct mm_struct *mm, unsigned long addr,
+                           pmd_t *pmdp)
+{
+       pgd_t *pgd = pgd_offset(mm, addr);
+       p4d_t *p4d = p4d_offset(pgd, addr);
+       pud_t *pud = pud_offset(p4d, addr);
+
+       WARN_ON(page_count(virt_to_page(pmdp)) == 0);
+       if (page_count(virt_to_page(pmdp)) == 1)
+               return 0;
+
+       pud_clear(pud);
+       put_page(virt_to_page(pmdp));
+       mm_dec_nr_pmds(mm);
+       return 1;
+}
+
+#else
+static int unshare_huge_pmd(struct mm_struct *mm, unsigned long addr,
+                           pmd_t *pmdp)
+{
+       return 0;
+}
+
+#endif
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
@@ -1768,6 +1795,11 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         * pgtable_trans_huge_withdraw after finishing pmdp related
         * operations.
         */
+       if (unshare_huge_pmd(vma->vm_mm, addr, pmd)) {
+               spin_unlock(ptl);
+               return 1;
+       }
+
        orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
                        tlb->fullmm);
        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
@@ -1915,6 +1947,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        if (!ptl)
                return 0;
 
+       if (unshare_huge_pmd(mm, addr, pmd)) {
+               spin_unlock(ptl);
+               return HPAGE_PMD_NR;
+       }
+
        preserve_write = prot_numa && pmd_write(*pmd);
        ret = 1;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a54c9d..1c1ed4e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4653,9 +4653,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 }
 
 #ifdef CONFIG_ARCH_HAS_HUGE_PMD_SHARE
-static unsigned long page_table_shareable(struct vm_area_struct *svma,
-                               struct vm_area_struct *vma,
-                               unsigned long addr, pgoff_t idx)
+unsigned long page_table_shareable(struct vm_area_struct *svma,
+                                  struct vm_area_struct *vma,
+                                  unsigned long addr, pgoff_t idx)
 {
        unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
                                svma->vm_start;
@@ -4678,7 +4678,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
        return saddr;
 }
 
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 {
        unsigned long base = addr & PUD_MASK;
        unsigned long end = base + PUD_SIZE;
diff --git a/mm/memory.c b/mm/memory.c
index ddf20bd..1ca8f75 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3932,6 +3932,109 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
        return 0;
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGE_PMD_SHARE
+static pmd_t *huge_pmd_offset(struct mm_struct *mm,
+                             unsigned long addr, unsigned long sz)
+{
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       pgd = pgd_offset(mm, addr);
+       if (!pgd_present(*pgd))
+               return NULL;
+       p4d = p4d_offset(pgd, addr);
+       if (!p4d_present(*p4d))
+               return NULL;
+
+       pud = pud_offset(p4d, addr);
+       if (sz != PUD_SIZE && pud_none(*pud))
+               return NULL;
+       /* hugepage or swap? */
+       if (pud_huge(*pud) || !pud_present(*pud))
+               return (pmd_t *)pud;
+
+       pmd = pmd_offset(pud, addr);
+       if (sz != PMD_SIZE && pmd_none(*pmd))
+               return NULL;
+       /* hugepage or swap? */
+       if (pmd_huge(*pmd) || !pmd_present(*pmd))
+               return pmd;
+
+       return NULL;
+}
+
+static pmd_t *pmd_share(struct mm_struct *mm, pud_t *pud, unsigned long addr)
+{
+       struct vm_area_struct *vma = find_vma(mm, addr);
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
+       struct vm_area_struct *svma;
+       unsigned long saddr;
+       pmd_t *spmd = NULL;
+       pmd_t *pmd;
+       spinlock_t *ptl;
+
+       if (!vma_shareable(vma, addr))
+               return pmd_alloc(mm, pud, addr);
+
+       i_mmap_lock_write(mapping);
+
+       vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+               if (svma == vma)
+                       continue;
+
+               saddr = page_table_shareable(svma, vma, addr, idx);
+               if (saddr) {
+                       spmd = huge_pmd_offset(svma->vm_mm, saddr,
+                                              vma_mmu_pagesize(svma));
+                       if (spmd) {
+                               get_page(virt_to_page(spmd));
+                               break;
+                       }
+               }
+       }
+
+       if (!spmd)
+               goto out;
+
+       ptl = pmd_lockptr(mm, spmd);
+       spin_lock(ptl);
+
+       if (pud_none(*pud)) {
+               pud_populate(mm, pud,
+                            (pmd_t *)((unsigned long)spmd & PAGE_MASK));
+               mm_inc_nr_pmds(mm);
+       } else {
+               put_page(virt_to_page(spmd));
+       }
+       spin_unlock(ptl);
+out:
+       pmd = pmd_alloc(mm, pud, addr);
+       i_mmap_unlock_write(mapping);
+       return pmd;
+}
+
+static bool may_share_pmd(struct vm_area_struct *vma)
+{
+       if (vma_is_fsdax(vma))
+               return true;
+       return false;
+}
+#else
+static pmd_t *pmd_share(struct mm_struct *mm, pud_t *pud, unsigned long addr)
+{
+       return pmd_alloc(mm, pud, addr);
+}
+
+static bool may_share_pmd(struct vm_area_struct *vma)
+{
+       return false;
+}
+#endif
+
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -3985,7 +4088,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                }
        }
 
-       vmf.pmd = pmd_alloc(mm, vmf.pud, address);
+       if (unlikely(may_share_pmd(vma)))
+               vmf.pmd = pmd_share(mm, vmf.pud, address);
+       else
+               vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;
        if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
-- 
1.8.3.1
