Commit:

  af2c1401e6f9 ("mm: numa: guarantee that tlb_flush_pending updates are visible 
before page table updates")

added smp_mb__before_spinlock() to set_tlb_flush_pending(). I think we
can solve the same problem without this barrier.

If instead we mandate that mm_tlb_flush_pending() is used while
holding the PTL, we're guaranteed to observe prior
set_tlb_flush_pending() instances.
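
To illustrate, a simplified sketch of the ordering (the left column
stands for the prot_numa protection change; exact call sites elided):

  CPU0 (prot_numa change)               CPU1 (do_huge_pmd_numa_page)

  set_tlb_flush_pending(mm);
  spin_lock(ptl);
  /* make pmd !accessible */
  spin_unlock(ptl);
                                        spin_lock(ptl);
                                        /* observes !accessible pmd */
                                        mm_tlb_flush_pending(mm); /* true */
                                        spin_unlock(ptl);
  flush_tlb_range(...);
  clear_tlb_flush_pending(mm);

CPU1 only checks the flag after observing, under the PTL, the pmd
update made inside CPU0's critical section; its PTL ACQUIRE therefore
pairs with the PTL RELEASE that published that update, and the
tlb_flush_pending store, which cannot pass that RELEASE, must be
visible as well.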

For this to work we need to rework migrate_misplaced_transhuge_page()
a little and move the test up into do_huge_pmd_numa_page().

Cc: Mel Gorman <[email protected]>
Cc: Rik van Riel <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -527,18 +527,16 @@ static inline cpumask_t *mm_cpumask(stru
  */
 static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
 {
-       barrier();
+       /*
+        * Must be called while holding the PTL, so that our PTL acquire is
+        * guaranteed to have observed the store from set_tlb_flush_pending().
+        */
        return mm->tlb_flush_pending;
 }
 static inline void set_tlb_flush_pending(struct mm_struct *mm)
 {
        mm->tlb_flush_pending = true;
-
-       /*
-        * Guarantee that the tlb_flush_pending store does not leak into the
-        * critical section updating the page tables
-        */
-       smp_mb__before_spinlock();
+       barrier();
 }
 /* Clearing is done after a TLB flush, which also provides a barrier. */
 static inline void clear_tlb_flush_pending(struct mm_struct *mm)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1410,6 +1410,7 @@ int do_huge_pmd_numa_page(struct vm_faul
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        int page_nid = -1, this_nid = numa_node_id();
        int target_nid, last_cpupid = -1;
+       bool need_flush = false;
        bool page_locked;
        bool migrated = false;
        bool was_writable;
@@ -1490,10 +1491,29 @@ int do_huge_pmd_numa_page(struct vm_faul
        }
 
        /*
+        * Since we took the NUMA fault, we must have observed the !accessible
+        * bit. Make sure all other CPUs agree with that, to avoid them
+        * modifying the page we're about to migrate.
+        *
+        * Must be done under PTL such that we'll observe the relevant
+        * set_tlb_flush_pending().
+        */
+       if (mm_tlb_flush_pending(mm))
+               need_flush = true;
+
+       /*
         * Migrate the THP to the requested node, returns with page unlocked
         * and access rights restored.
         */
        spin_unlock(vmf->ptl);
+
+       /*
+        * We are not sure a pending tlb flush here is for a huge page
+        * mapping or not. Hence use the tlb range variant
+        */
+       if (need_flush)
+               flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
        migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
                                vmf->pmd, pmd, vmf->address, page, target_nid);
        if (migrated) {
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1935,12 +1935,6 @@ int migrate_misplaced_transhuge_page(str
                put_page(new_page);
                goto out_fail;
        }
-       /*
-        * We are not sure a pending tlb flush here is for a huge page
-        * mapping or not. Hence use the tlb range variant
-        */
-       if (mm_tlb_flush_pending(mm))
-               flush_tlb_range(vma, mmun_start, mmun_end);
 
        /* Prepare a page as a migration target */
        __SetPageLocked(new_page);
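
A small user-space sketch of the lock-ordering guarantee relied on
above. This is not kernel code: a pthread mutex stands in for the PTL
and the variable and function names are illustrative only. If the
reader observes the "page table" update made inside the writer's
critical section, it is guaranteed to also observe the flag that was
stored before that critical section, without any explicit barrier.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static bool tlb_flush_pending;  /* stands in for mm->tlb_flush_pending */
static bool pmd_inaccessible;   /* stands in for the !accessible pmd   */

/* Writer: the prot_numa protection change. */
static void *change_protection_side(void *arg)
{
        (void)arg;
        tlb_flush_pending = true;       /* set_tlb_flush_pending()        */

        pthread_mutex_lock(&ptl);
        pmd_inaccessible = true;        /* page table update under PTL    */
        pthread_mutex_unlock(&ptl);     /* RELEASE publishes both stores  */
        return NULL;
}

/* Reader: the NUMA fault path. */
static void *numa_fault_side(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&ptl);       /* ACQUIRE pairs with the RELEASE */
        if (pmd_inaccessible) {
                /*
                 * Having observed the update made inside the writer's
                 * critical section, we must also observe the earlier
                 * tlb_flush_pending store: it cannot pass the unlock.
                 */
                printf("flush pending seen: %d\n", tlb_flush_pending);
        }
        pthread_mutex_unlock(&ptl);
        return NULL;
}

int main(void)
{
        pthread_t writer, reader;

        pthread_create(&writer, NULL, change_protection_side, NULL);
        pthread_create(&reader, NULL, numa_fault_side, NULL);
        pthread_join(writer, NULL);
        pthread_join(reader, NULL);
        return 0;
}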

