For both swap and page migration, we use bit 2 of the entry to identify
whether the entry is uffd write-protected.  It plays a similar role to
the existing soft-dirty bit in swap entries, but only keeps the uffd-wp
tracking for a specific PTE/PMD.
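For reference, the swap-entry helpers introduced earlier in the series
look roughly like below on x86, where _PAGE_SWP_UFFD_WP aliases
_PAGE_USER (bit 2).  This is a simplified sketch; see the earlier
patches for the exact definitions:

	static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
	{
		return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
	}

	static inline int pte_swp_uffd_wp(pte_t pte)
	{
		return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
	}

	static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
	{
		return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
	}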

One thing worth noting is that when we recover the uffd-wp bit from a
swap/migration entry back into the PTE, we also need to make sure the
_PAGE_RW bit is cleared, otherwise writes will not trap at all even
with the _PAGE_UFFD_WP bit set.
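Concretely, that is why the do_swap_page() and remove_migration_pte()
hunks below pair pte_mkuffd_wp() with an explicit pte_wrprotect():

	if (pte_swp_uffd_wp(vmf->orig_pte)) {
		pte = pte_mkuffd_wp(pte);
		pte = pte_wrprotect(pte);
	}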

Note that this patch removes two lines from "userfaultfd: wp: hook
userfault handler to write protection fault", where we tried to clear
FAULT_FLAG_WRITE from vmf->flags when uffd-wp is set for the VMA.  This
patch keeps the write flag there instead.

Signed-off-by: Peter Xu <pet...@redhat.com>
---
 include/linux/swapops.h | 2 ++
 mm/huge_memory.c        | 3 +++
 mm/memory.c             | 8 ++++++--
 mm/migrate.c            | 7 +++++++
 mm/mprotect.c           | 2 ++
 mm/rmap.c               | 6 ++++++
 6 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4d961668e5fc..0c2923b1cdb7 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -68,6 +68,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 
        if (pte_swp_soft_dirty(pte))
                pte = pte_swp_clear_soft_dirty(pte);
+       if (pte_swp_uffd_wp(pte))
+               pte = pte_swp_clear_uffd_wp(pte);
        arch_entry = __pte_to_swp_entry(pte);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2a3ec62e83b6..682f1427da1a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2171,6 +2171,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                write = is_write_migration_entry(entry);
                young = false;
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
+               uffd_wp = pmd_swp_uffd_wp(old_pmd);
        } else {
                page = pmd_page(old_pmd);
                if (pmd_dirty(old_pmd))
@@ -2203,6 +2204,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
+                       if (uffd_wp)
+                               entry = pte_swp_mkuffd_wp(entry);
                } else {
                        entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
                        entry = maybe_mkwrite(entry, vma);
diff --git a/mm/memory.c b/mm/memory.c
index f5497752d2a3..ac7d659e40fe 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -736,6 +736,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                pte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(*src_pte))
                                        pte = pte_swp_mksoft_dirty(pte);
+                               if (pte_swp_uffd_wp(*src_pte))
+                                       pte = pte_swp_mkuffd_wp(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
                } else if (is_device_private_entry(entry)) {
@@ -2814,8 +2816,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);
-       if (userfaultfd_wp(vma))
-               vmf->flags &= ~FAULT_FLAG_WRITE;
        if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -2825,6 +2825,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        flush_icache_page(vma, page);
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
+       if (pte_swp_uffd_wp(vmf->orig_pte)) {
+               pte = pte_mkuffd_wp(pte);
+               pte = pte_wrprotect(pte);
+       }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
        vmf->orig_pte = pte;
diff --git a/mm/migrate.c b/mm/migrate.c
index f7e4bfdc13b7..963d3dd65cf0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -242,6 +242,11 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                if (is_write_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
 
+               if (pte_swp_uffd_wp(*pvmw.pte)) {
+                       pte = pte_mkuffd_wp(pte);
+                       pte = pte_wrprotect(pte);
+               }
+
                if (unlikely(is_zone_device_page(new))) {
                        if (is_device_private_page(new)) {
                                entry = make_device_private_entry(new, pte_write(pte));
@@ -2265,6 +2270,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pte))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pte))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, addr, ptep, swp_pte);
 
                        /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c37c9aa7a54e..2ce62d806108 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -187,6 +187,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
+                               if (pte_swp_uffd_wp(oldpte))
+                                       newpte = pte_swp_mkuffd_wp(newpte);
                                set_pte_at(mm, addr, pte, newpte);
 
                                pages++;
diff --git a/mm/rmap.c b/mm/rmap.c
index 85b7f9423352..e1cf191db4f3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1463,6 +1463,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
@@ -1555,6 +1557,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
@@ -1621,6 +1625,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /* Invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
-- 
2.17.1