For VMAs that don't want write notifications, PTEs created for read
faults have their write bit set. If the read fault happens after
VM_SOFTDIRTY is cleared, then the PTE's softdirty bit will remain
clear after subsequent writes.

Here's a simple code snippet to demonstrate the bug:

  char* m = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                 MAP_ANONYMOUS | MAP_SHARED, -1, 0);
  system("echo 4 > /proc/$PPID/clear_refs"); /* clear VM_SOFTDIRTY */
  assert(*m == '\0');     /* new PTE allows write access */
  assert(!soft_dirty(m));
  *m = 'x';               /* should dirty the page */
  assert(soft_dirty(m));  /* fails */

With this patch, write notifications are enabled when VM_SOFTDIRTY is
cleared. Furthermore, to avoid unnecessary faults, write
notifications are disabled when VM_SOFTDIRTY is set again.

As a side effect of enabling and disabling write notifications with
care, this patch fixes a bug where vm_page_prot bits set by drivers
were zapped by mprotect. An analogous bug was fixed in mmap by
c9d0bf241451a3ab7d02e1652c22b80cd7d93e8f.

Reported-by: Peter Feiner <pfei...@google.com>
Suggested-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Reviewed-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Signed-off-by: Peter Feiner <pfei...@google.com>

---

v1 -> v2: Instead of checking VM_SOFTDIRTY in the fault handler,
          enable write notifications on vm_page_prot when we clear
          VM_SOFTDIRTY.

v2 -> v3: * Grab the mmap_sem in write mode if any VMAs have
            VM_SOFTDIRTY set. This involved refactoring clear_refs_write
            to make it less unwieldy.

          * In mprotect, don't inadvertently disable write
            notifications on VMAs that have had VM_SOFTDIRTY cleared.

          * The mprotect fix and mmap cleanup that comprised the
            second and third patches in v2 were swallowed by the main
            patch because of vm_page_prot corner case handling.

v3 -> v4: Handle !defined(CONFIG_MEM_SOFT_DIRTY): old patch would have
          enabled write notifications for all VMAs in this case.
---
 fs/proc/task_mmu.c | 113 +++++++++++++++++++++++++++++++++--------------------
 include/linux/mm.h |  14 +++++++
 mm/mmap.c          |  26 ++++++------
 mm/mprotect.c      |   6 +--
 4 files changed, 99 insertions(+), 60 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfc791c..f5e75c6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -785,13 +785,80 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
        return 0;
 }
 
+static int clear_refs(struct mm_struct *mm, enum clear_refs_types type,
+                      int write)
+{
+       int r = 0;
+       struct vm_area_struct *vma;
+       struct clear_refs_private cp = {
+               .type = type,
+       };
+       struct mm_walk clear_refs_walk = {
+               .pmd_entry = clear_refs_pte_range,
+               .mm = mm,
+               .private = &cp,
+       };
+
+       if (write)
+               down_write(&mm->mmap_sem);
+       else
+               down_read(&mm->mmap_sem);
+
+       if (type == CLEAR_REFS_SOFT_DIRTY)
+               mmu_notifier_invalidate_range_start(mm, 0, -1);
+
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               cp.vma = vma;
+               if (is_vm_hugetlb_page(vma))
+                       continue;
+               /*
+                * Writing 1 to /proc/pid/clear_refs affects all pages.
+                *
+                * Writing 2 to /proc/pid/clear_refs only affects
+                * Anonymous pages.
+                *
+                * Writing 3 to /proc/pid/clear_refs only affects file
+                * mapped pages.
+                *
+                * Writing 4 to /proc/pid/clear_refs affects all pages.
+                */
+               if (type == CLEAR_REFS_ANON && vma->vm_file)
+                       continue;
+               if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
+                       continue;
+               if (type == CLEAR_REFS_SOFT_DIRTY &&
+                   (vma->vm_flags & VM_SOFTDIRTY)) {
+                       if (!write) {
+                               r = -EAGAIN;
+                               break;
+                       }
+                       vma->vm_flags &= ~VM_SOFTDIRTY;
+                       vma_enable_writenotify(vma);
+               }
+               walk_page_range(vma->vm_start, vma->vm_end,
+                               &clear_refs_walk);
+       }
+
+       if (type == CLEAR_REFS_SOFT_DIRTY)
+               mmu_notifier_invalidate_range_end(mm, 0, -1);
+
+       if (!r)
+               flush_tlb_mm(mm);
+
+       if (write)
+               up_write(&mm->mmap_sem);
+       else
+               up_read(&mm->mmap_sem);
+
+       return r;
+}
+
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
 {
        struct task_struct *task;
        char buffer[PROC_NUMBUF];
        struct mm_struct *mm;
-       struct vm_area_struct *vma;
        enum clear_refs_types type;
        int itype;
        int rv;
@@ -820,47 +887,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                return -ESRCH;
        mm = get_task_mm(task);
        if (mm) {
-               struct clear_refs_private cp = {
-                       .type = type,
-               };
-               struct mm_walk clear_refs_walk = {
-                       .pmd_entry = clear_refs_pte_range,
-                       .mm = mm,
-                       .private = &cp,
-               };
-               down_read(&mm->mmap_sem);
-               if (type == CLEAR_REFS_SOFT_DIRTY)
-                       mmu_notifier_invalidate_range_start(mm, 0, -1);
-               for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                       cp.vma = vma;
-                       if (is_vm_hugetlb_page(vma))
-                               continue;
-                       /*
-                        * Writing 1 to /proc/pid/clear_refs affects all pages.
-                        *
-                        * Writing 2 to /proc/pid/clear_refs only affects
-                        * Anonymous pages.
-                        *
-                        * Writing 3 to /proc/pid/clear_refs only affects file
-                        * mapped pages.
-                        *
-                        * Writing 4 to /proc/pid/clear_refs affects all pages.
-                        */
-                       if (type == CLEAR_REFS_ANON && vma->vm_file)
-                               continue;
-                       if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
-                               continue;
-                       if (type == CLEAR_REFS_SOFT_DIRTY) {
-                               if (vma->vm_flags & VM_SOFTDIRTY)
-                                       vma->vm_flags &= ~VM_SOFTDIRTY;
-                       }
-                       walk_page_range(vma->vm_start, vma->vm_end,
-                                       &clear_refs_walk);
-               }
-               if (type == CLEAR_REFS_SOFT_DIRTY)
-                       mmu_notifier_invalidate_range_end(mm, 0, -1);
-               flush_tlb_mm(mm);
-               up_read(&mm->mmap_sem);
+               rv = clear_refs(mm, type, 0);
+               if (rv)
+                       clear_refs(mm, type, 1);
                mmput(mm);
        }
        put_task_struct(task);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8981cc8..7979b79 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1946,6 +1946,20 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
 }
 #endif
 
+/* Enable write notifications without blowing away special flags. */
+static inline void vma_enable_writenotify(struct vm_area_struct *vma)
+{
+       pgprot_t newprot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
+       vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, newprot);
+}
+
+/* Disable write notifications without blowing away special flags. */
+static inline void vma_disable_writenotify(struct vm_area_struct *vma)
+{
+       pgprot_t newprot = vm_get_page_prot(vma->vm_flags);
+       vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, newprot);
+}
+
 #ifdef CONFIG_NUMA_BALANCING
 unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
diff --git a/mm/mmap.c b/mm/mmap.c
index c1f2ea4..0031130 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1470,6 +1470,12 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
        if (vma->vm_ops && vma->vm_ops->page_mkwrite)
                return 1;
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+       /* Do we need to track softdirty? */
+       if (!(vm_flags & VM_SOFTDIRTY))
+               return 1;
+#endif
+
        /* The open routine did something to the protections already? */
        if (pgprot_val(vma->vm_page_prot) !=
            pgprot_val(vm_get_page_prot(vm_flags)))
@@ -1610,21 +1616,6 @@ munmap_back:
                        goto free_vma;
        }
 
-       if (vma_wants_writenotify(vma)) {
-               pgprot_t pprot = vma->vm_page_prot;
-
-               /* Can vma->vm_page_prot have changed??
-                *
-                * Answer: Yes, drivers may have changed it in their
-                *         f_op->mmap method.
-                *
-                * Ensures that vmas marked as uncached stay that way.
-                */
-               vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
-               if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
-                       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-       }
-
        vma_link(mm, vma, prev, rb_link, rb_parent);
        /* Once vma denies write, undo our temporary denial count */
        if (file) {
@@ -1658,6 +1649,11 @@ out:
         */
        vma->vm_flags |= VM_SOFTDIRTY;
 
+       if (vma_wants_writenotify(vma))
+               vma_enable_writenotify(vma);
+       else
+               vma_disable_writenotify(vma);
+
        return addr;
 
 unmap_and_free_vma:
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c43d557..2dea043 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -320,12 +320,12 @@ success:
         * held in write mode.
         */
        vma->vm_flags = newflags;
-       vma->vm_page_prot = pgprot_modify(vma->vm_page_prot,
-                                         vm_get_page_prot(newflags));
 
        if (vma_wants_writenotify(vma)) {
-               vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
+               vma_enable_writenotify(vma);
                dirty_accountable = 1;
+       } else {
+               vma_disable_writenotify(vma);
        }
 
        change_protection(vma, start, end, vma->vm_page_prot,
-- 
2.1.0.rc2.206.gedb03e5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to