from:"Laurent Dufour"

[PATCH v3 18/20] perf tools: Add support for the SPF perf event

2017-09-08 Thread Laurent Dufour

Add support for the new speculative faults event.

Signed-off-by: Laurent Dufour 
---
 tools/include/uapi/linux/perf_event.h | 1 +
 tools/perf/util/evsel.c   | 1 +
 tools/perf/util/parse-events.c| 4 
 tools/perf/util/parse-events.l| 1 +
 tools/perf/util/python.c  | 1 +
 5 files changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index 2a37ae925d85..8ee9a661018d 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -111,6 +111,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF   = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index d9bd632ed7db..983e8104b523 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -432,6 +432,7 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = {
"alignment-faults",
"emulation-faults",
"dummy",
+   "speculative-faults",
 };
 
 static const char *__perf_evsel__sw_name(u64 config)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f44aeba51d1f..ac2ebf99e965 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -135,6 +135,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
.symbol = "bpf-output",
.alias  = "",
},
+   [PERF_COUNT_SW_SPF] = {
+   .symbol = "speculative-faults",
+   .alias  = "spf",
+   },
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index c42edeac451f..af3e52fb9103 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -289,6 +289,7 @@ alignment-faults{ return 
sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_AL
 emulation-faults   { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
 dummy  { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 bpf-output { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
+speculative-faults|spf { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF); }
 
/*
 * We have to handle the kernel PMU event 
cycles-ct/cycles-t/mem-loads/mem-stores separately.
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index c129e99114ae..12209adb7cb5 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1141,6 +1141,7 @@ static struct {
PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS),
PERF_CONST(COUNT_SW_EMULATION_FAULTS),
PERF_CONST(COUNT_SW_DUMMY),
+   PERF_CONST(COUNT_SW_SPF),
 
PERF_CONST(SAMPLE_IP),
PERF_CONST(SAMPLE_TID),
-- 
2.7.4

[PATCH v3 15/20] mm: Try spin lock in speculative path

2017-09-08 Thread Laurent Dufour

There is a deadlock when a CPU is doing a speculative page fault and
another one is calling do_unmap().

The deadlock occurs because the speculative path tries to spin-lock the
pte while interrupts are disabled. Meanwhile the other CPU, in the
unmap path, has locked the pte and is waiting for all the CPUs to
invalidate the TLB. As the CPU doing the speculative fault has
interrupts disabled, it can't invalidate the TLB, and can't get the lock.

Since we are in a speculative path, we can race with other mm actions.
So let's assume that the lock may not get acquired, and fail the
speculative page fault when it is not.

Here are the stacks captured during the deadlock:

CPU 0
native_flush_tlb_others+0x7c/0x260
flush_tlb_mm_range+0x6a/0x220
tlb_flush_mmu_tlbonly+0x63/0xc0
unmap_page_range+0x897/0x9d0
? unmap_single_vma+0x7d/0xe0
? release_pages+0x2b3/0x360
unmap_single_vma+0x7d/0xe0
unmap_vmas+0x51/0xa0
unmap_region+0xbd/0x130
do_munmap+0x279/0x460
SyS_munmap+0x53/0x70

CPU 1
do_raw_spin_lock+0x14e/0x160
_raw_spin_lock+0x5d/0x80
? pte_map_lock+0x169/0x1b0
pte_map_lock+0x169/0x1b0
handle_pte_fault+0xbf2/0xd80
? trace_hardirqs_on+0xd/0x10
handle_speculative_fault+0x272/0x280
handle_speculative_fault+0x5/0x280
__do_page_fault+0x187/0x580
trace_do_page_fault+0x52/0x260
do_async_page_fault+0x19/0x70
async_page_fault+0x28/0x30

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 mm/memory.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 5e98259c7ac0..18b39f930ce1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2432,7 +2432,8 @@ static bool pte_spinlock(struct vm_fault *vmf)
goto out;
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (unlikely(!spin_trylock(vmf->ptl)))
+   goto out;
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
@@ -2468,8 +2469,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
if (vma_has_changed(vmf))
goto out;
 
-   pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, &ptl);
+   /*
+* Same as pte_offset_map_lock() except that we call
+* spin_trylock() in place of spin_lock() to avoid race with
+* unmap path which may have the lock and wait for this CPU
+* to invalidate TLB but this CPU has irq disabled.
+* Since we are in a speculative path, accept it could fail
+*/
+   ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   pte = pte_offset_map(vmf->pmd, vmf->address);
+   if (unlikely(!spin_trylock(ptl))) {
+   pte_unmap(pte);
+   goto out;
+   }
+
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
goto out;
-- 
2.7.4

[PATCH v3 15/20] mm: Try spin lock in speculative path

2017-09-08 Thread Laurent Dufour

There is a deadlock when a CPU is doing a speculative page fault and
another one is calling do_unmap().

The deadlock occurs because the speculative path tries to spin-lock the
pte while interrupts are disabled. Meanwhile the other CPU, in the
unmap path, has locked the pte and is waiting for all the CPUs to
invalidate the TLB. As the CPU doing the speculative fault has
interrupts disabled, it can't invalidate the TLB, and can't get the lock.

Since we are in a speculative path, we can race with other mm actions.
So let's assume that the lock may not get acquired, and fail the
speculative page fault when it is not.

Here are the stacks captured during the deadlock:

CPU 0
native_flush_tlb_others+0x7c/0x260
flush_tlb_mm_range+0x6a/0x220
tlb_flush_mmu_tlbonly+0x63/0xc0
unmap_page_range+0x897/0x9d0
? unmap_single_vma+0x7d/0xe0
? release_pages+0x2b3/0x360
unmap_single_vma+0x7d/0xe0
unmap_vmas+0x51/0xa0
unmap_region+0xbd/0x130
do_munmap+0x279/0x460
SyS_munmap+0x53/0x70

CPU 1
do_raw_spin_lock+0x14e/0x160
_raw_spin_lock+0x5d/0x80
? pte_map_lock+0x169/0x1b0
pte_map_lock+0x169/0x1b0
handle_pte_fault+0xbf2/0xd80
? trace_hardirqs_on+0xd/0x10
handle_speculative_fault+0x272/0x280
handle_speculative_fault+0x5/0x280
__do_page_fault+0x187/0x580
trace_do_page_fault+0x52/0x260
do_async_page_fault+0x19/0x70
async_page_fault+0x28/0x30

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 5e98259c7ac0..18b39f930ce1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2432,7 +2432,8 @@ static bool pte_spinlock(struct vm_fault *vmf)
goto out;
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (unlikely(!spin_trylock(vmf->ptl)))
+   goto out;
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
@@ -2468,8 +2469,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
if (vma_has_changed(vmf))
goto out;
 
-   pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, &ptl);
+   /*
+* Same as pte_offset_map_lock() except that we call
+* spin_trylock() in place of spin_lock() to avoid race with
+* unmap path which may have the lock and wait for this CPU
+* to invalidate TLB but this CPU has irq disabled.
+* Since we are in a speculative path, accept it could fail
+*/
+   ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   pte = pte_offset_map(vmf->pmd, vmf->address);
+   if (unlikely(!spin_trylock(ptl))) {
+   pte_unmap(pte);
+   goto out;
+   }
+
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
goto out;
-- 
2.7.4

[PATCH v3 14/20] mm: Provide speculative fault infrastructure

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate if the state we started the fault with is still valid, if
not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
PTE and we're done.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Manage the newly introduced pte_spinlock() so that the speculative page
 fault fails if the VMA is touched behind our back]
[Rename vma_is_dead() to vma_has_changed() and declare it here]
[Call p4d_alloc() as it is safe since pgd is valid]
[Call pud_alloc() as it is safe since p4d is valid]
[Set fe.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
[Fix check against huge page, calling pmd_trans_huge()]
[Introduce __HAVE_ARCH_CALL_SPF to declare the SPF handler only when
 architecture is supporting it]
[Use READ_ONCE() when reading VMA's fields in the speculative path]
[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support the
 processing done in vm_normal_page()]
[Check that vma->anon_vma is already set when starting the speculative
 path]
[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
 the processing done in mpol_misplaced()]
[Don't support VMA growing up or down]
[Move check on vm_sequence just before calling handle_pte_fault()]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h |   5 +
 include/linux/pagemap.h|   4 +-
 mm/internal.h  |  14 +++
 mm/memory.c| 249 -
 5 files changed, 266 insertions(+), 8 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index a4e7ca0f3585..6cfdfca4cc2a 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-   return !!(vma->vm_flags & VM_HUGETLB);
+   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2857aaa03f1..966b69f10f57 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -320,6 +320,7 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations 
*/
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+   unsigned int sequence;
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1342,6 +1343,10 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+#ifdef __HAVE_ARCH_CALL_SPF
+extern int handle_speculative_fault(struct mm_struct *mm,
+   unsigned long address, unsigned int flags);
+#endif /* __HAVE_ARCH_CALL_SPF */
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5bbd6780f205..832aa3ec7d00 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -451,8 +451,8 @@ static inline pgoff_t linear_page_index(struct 
vm_area_struct *vma,
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
-   pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
-   pgoff += vma->vm_pgoff;
+   pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+   pgoff += READ_ONCE(vma->vm_pgoff);
return pgoff;
 }
 
diff --git a/mm/internal.h b/mm/inte

[PATCH v3 14/20] mm: Provide speculative fault infrastructure

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra 

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate if the state we started the fault with is still valid, if
not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
PTE and we're done.

Signed-off-by: Peter Zijlstra (Intel) 

[Manage the newly introduced pte_spinlock() so that the speculative page
 fault fails if the VMA is touched behind our back]
[Rename vma_is_dead() to vma_has_changed() and declare it here]
[Call p4d_alloc() as it is safe since pgd is valid]
[Call pud_alloc() as it is safe since p4d is valid]
[Set fe.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
[Fix check against huge page, calling pmd_trans_huge()]
[Introduce __HAVE_ARCH_CALL_SPF to declare the SPF handler only when
 architecture is supporting it]
[Use READ_ONCE() when reading VMA's fields in the speculative path]
[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support the
 processing done in vm_normal_page()]
[Check that vma->anon_vma is already set when starting the speculative
 path]
[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
 the processing done in mpol_misplaced()]
[Don't support VMA growing up or down]
[Move check on vm_sequence just before calling handle_pte_fault()]
Signed-off-by: Laurent Dufour 
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h |   5 +
 include/linux/pagemap.h|   4 +-
 mm/internal.h  |  14 +++
 mm/memory.c| 249 -
 5 files changed, 266 insertions(+), 8 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index a4e7ca0f3585..6cfdfca4cc2a 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-   return !!(vma->vm_flags & VM_HUGETLB);
+   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2857aaa03f1..966b69f10f57 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -320,6 +320,7 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations 
*/
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+   unsigned int sequence;
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1342,6 +1343,10 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+#ifdef __HAVE_ARCH_CALL_SPF
+extern int handle_speculative_fault(struct mm_struct *mm,
+   unsigned long address, unsigned int flags);
+#endif /* __HAVE_ARCH_CALL_SPF */
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5bbd6780f205..832aa3ec7d00 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -451,8 +451,8 @@ static inline pgoff_t linear_page_index(struct 
vm_area_struct *vma,
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
-   pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
-   pgoff += vma->vm_pgoff;
+   pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+   pgoff += READ_ONCE(vma->vm_pgoff);
return pgoff;
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 84360184eafd..4ddadc440c26 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -45,6 +45

[PATCH v3 11/20] mm: Introduce __maybe_mkwrite()

2017-09-08 Thread Laurent Dufour

The current maybe_mkwrite() is getting passed the pointer to the vma
structure to fetch the vm_flags field.

When dealing with the speculative page fault handler, it will be better to
rely on the cached vm_flags value stored in the vm_fault structure.

This patch introduces a __maybe_mkwrite() service which can be called by
passing the value of the vm_flags field.

No functional change is expected for the other callers of
maybe_mkwrite().

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm.h | 9 +++--
 mm/memory.c| 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5fd90ac31317..bb0c87f1c725 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -673,13 +673,18 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t __maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-   if (likely(vma->vm_flags & VM_WRITE))
+   if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
 
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   return __maybe_mkwrite(pte, vma->vm_flags);
+}
+
 int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page);
 int finish_fault(struct vm_fault *vmf);
diff --git a/mm/memory.c b/mm/memory.c
index 4583f354be94..c306f1f64c9e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2408,7 +2408,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2498,8 +2498,8 @@ static int wp_page_copy(struct vm_fault *vmf)
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-   entry = mk_pte(new_page, vma->vm_page_prot);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = mk_pte(new_page, vmf->vma_page_prot);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
/*
 * Clear the pte entry and flush it first, before updating the
 * pte with the new entry. This will avoid a race condition
-- 
2.7.4

[PATCH v3 07/20] mm: Cache some VMA fields in the vm_fault structure

2017-09-08 Thread Laurent Dufour

When handling a speculative page fault, the vma->vm_flags and
vma->vm_page_prot fields are read once the page table lock is released, so
there is no longer any guarantee that these fields will not change behind
our back. They are therefore saved in the vm_fault structure before the VMA
is checked for changes.

This patch also sets the fields in hugetlb_no_page() and
__collapse_huge_page_swapin() even though this is not needed by the callee.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm.h |  6 ++
 mm/hugetlb.c   |  2 ++
 mm/khugepaged.c|  2 ++
 mm/memory.c| 38 --
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46e769a5a7ab..5fd90ac31317 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -350,6 +350,12 @@ struct vm_fault {
 * page table to avoid allocation from
 * atomic context.
 */
+   /*
+* These entries are required when handling speculative page fault.
+* This way the page handling is done using consistent field values.
+*/
+   unsigned long vma_flags;
+   pgprot_t vma_page_prot;
 };
 
 /* page entry size for vm->huge_fault() */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 424b0ef08a60..da82a86a4761 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3683,6 +3683,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
.vma = vma,
.address = address,
.flags = flags,
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
/*
 * Hard to debug if it ends up being
 * used by a callee that assumes
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 56dd994c05d0..0525a0e74535 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -881,6 +881,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct 
*mm,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
};
 
/* we only decide to swapin, if there is enough young ptes */
diff --git a/mm/memory.c b/mm/memory.c
index f250e7c92948..f008042ab24e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2556,7 +2556,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * Don't let another task, with possibly unlocked vma,
 * keep the mlocked page.
 */
-   if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+   if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
lock_page(old_page);/* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
@@ -2590,7 +2590,7 @@ static int wp_page_copy(struct vm_fault *vmf)
  */
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
-   WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+   WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
/*
@@ -2692,7 +2692,7 @@ static int do_wp_page(struct vm_fault *vmf)
 * We should not cow pages in a shared writeable mapping.
 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 */
-   if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
 (VM_WRITE|VM_SHARED))
return wp_pfn_shared(vmf);
 
@@ -2739,7 +2739,7 @@ static int do_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
unlock_page(vmf->page);
-   } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   } else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
@@ -2975,7 +2975,7 @@ int do_swap_page(struct vm_fault *vmf)
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
-   pte = mk_pte(page, vma->vm_page_prot);
+   pte = mk_pte(page, vmf->vma_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -2999,7 +299

[PATCH v3 11/20] mm: Introduce __maybe_mkwrite()

2017-09-08 Thread Laurent Dufour

The current maybe_mkwrite() is getting passed the pointer to the vma
structure to fetch the vm_flags field.

When dealing with the speculative page fault handler, it will be better to
rely on the cached vm_flags value stored in the vm_fault structure.

This patch introduces a __maybe_mkwrite() service which can be called by
passing the value of the vm_flags field.

No functional change is expected for the other callers of
maybe_mkwrite().

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h | 9 +++--
 mm/memory.c| 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5fd90ac31317..bb0c87f1c725 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -673,13 +673,18 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t __maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-   if (likely(vma->vm_flags & VM_WRITE))
+   if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
 
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   return __maybe_mkwrite(pte, vma->vm_flags);
+}
+
 int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page);
 int finish_fault(struct vm_fault *vmf);
diff --git a/mm/memory.c b/mm/memory.c
index 4583f354be94..c306f1f64c9e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2408,7 +2408,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2498,8 +2498,8 @@ static int wp_page_copy(struct vm_fault *vmf)
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-   entry = mk_pte(new_page, vma->vm_page_prot);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = mk_pte(new_page, vmf->vma_page_prot);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
/*
 * Clear the pte entry and flush it first, before updating the
 * pte with the new entry. This will avoid a race condition
-- 
2.7.4

[PATCH v3 07/20] mm: Cache some VMA fields in the vm_fault structure

2017-09-08 Thread Laurent Dufour

When handling a speculative page fault, the vma->vm_flags and
vma->vm_page_prot fields are read once the page table lock is released, so
there is no longer any guarantee that these fields will not change behind
our back. They are therefore saved in the vm_fault structure before the VMA
is checked for changes.

This patch also sets the fields in hugetlb_no_page() and
__collapse_huge_page_swapin() even though this is not needed by the callee.

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  6 ++
 mm/hugetlb.c   |  2 ++
 mm/khugepaged.c|  2 ++
 mm/memory.c| 38 --
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46e769a5a7ab..5fd90ac31317 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -350,6 +350,12 @@ struct vm_fault {
 * page table to avoid allocation from
 * atomic context.
 */
+   /*
+* These entries are required when handling speculative page fault.
+* This way the page handling is done using consistent field values.
+*/
+   unsigned long vma_flags;
+   pgprot_t vma_page_prot;
 };
 
 /* page entry size for vm->huge_fault() */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 424b0ef08a60..da82a86a4761 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3683,6 +3683,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
.vma = vma,
.address = address,
.flags = flags,
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
/*
 * Hard to debug if it ends up being
 * used by a callee that assumes
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 56dd994c05d0..0525a0e74535 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -881,6 +881,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct 
*mm,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
};
 
/* we only decide to swapin, if there is enough young ptes */
diff --git a/mm/memory.c b/mm/memory.c
index f250e7c92948..f008042ab24e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2556,7 +2556,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * Don't let another task, with possibly unlocked vma,
 * keep the mlocked page.
 */
-   if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+   if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
lock_page(old_page);/* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
@@ -2590,7 +2590,7 @@ static int wp_page_copy(struct vm_fault *vmf)
  */
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
-   WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+   WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
/*
@@ -2692,7 +2692,7 @@ static int do_wp_page(struct vm_fault *vmf)
 * We should not cow pages in a shared writeable mapping.
 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 */
-   if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
 (VM_WRITE|VM_SHARED))
return wp_pfn_shared(vmf);
 
@@ -2739,7 +2739,7 @@ static int do_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
unlock_page(vmf->page);
-   } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   } else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
@@ -2975,7 +2975,7 @@ int do_swap_page(struct vm_fault *vmf)
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
-   pte = mk_pte(page, vma->vm_page_prot);
+   pte = mk_pte(page, vmf->vma_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -2999,7 +2999,7 @@ int do_swap_page(struct vm_

[PATCH v3 19/20] x86/mm: Add speculative pagefault handling

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

Try a speculative fault before acquiring mmap_sem, if it returns with
VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
traditional fault.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
 handle_speculative_fault()]
[Retry with the usual fault path when VM_FAULT_ERROR is returned by
 handle_speculative_fault(). This allows the signal to be delivered]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 arch/x86/include/asm/pgtable_types.h |  7 +++
 arch/x86/mm/fault.c  | 19 +++
 2 files changed, 26 insertions(+)

diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index f1492473f10e..aadc8ecb91fb 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -257,6 +257,13 @@ enum page_cache_mode {
 #define PGD_IDENT_ATTR  0x001  /* PRESENT (no other attributes) */
 #endif
 
+/*
+ * Advertise that we call the Speculative Page Fault handler.
+ */
+#ifdef CONFIG_X86_64
+#define __HAVE_ARCH_CALL_SPF
+#endif
+
 #ifdef CONFIG_X86_32
 # include 
 #else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b836a7274e12..652af5524f42 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef __HAVE_ARCH_CALL_SPF
+   if (error_code & PF_USER) {
+   fault = handle_speculative_fault(mm, address, flags);
+
+   /*
+* We also check against VM_FAULT_ERROR because we have to
+* raise a signal by calling later mm_fault_error() which
+* requires the vma pointer to be set. So in that case,
+* we fall through the normal path.
+*/
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1,
+ regs, address);
+   goto done;
+   }
+   }
+#endif /* __HAVE_ARCH_CALL_SPF */
+
/*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in
@@ -1474,6 +1492,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
return;
}
 
+done:
/*
 * Major/minor page fault accounting. If any of the events
 * returned VM_FAULT_MAJOR, we account it as a major fault.
-- 
2.7.4

[PATCH v3 17/20] perf: Add a speculative page fault sw event

2017-09-08 Thread Laurent Dufour

Add a new software event to count successful speculative page faults.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 140ae638cfd6..101e509ee39b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -111,6 +111,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF   = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
-- 
2.7.4

[PATCH v3 19/20] x86/mm: Add speculative pagefault handling

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra 

Try a speculative fault before acquiring mmap_sem. If it returns
VM_FAULT_RETRY, continue with the mmap_sem acquisition and do the
traditional fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
 handle_speculative_fault()]
[Retry with usual fault path in the case VM_FAULT_ERROR is returned by
 handle_speculative_fault(). This allows the signal to be delivered]
Signed-off-by: Laurent Dufour 
---
 arch/x86/include/asm/pgtable_types.h |  7 +++
 arch/x86/mm/fault.c  | 19 +++
 2 files changed, 26 insertions(+)

diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index f1492473f10e..aadc8ecb91fb 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -257,6 +257,13 @@ enum page_cache_mode {
 #define PGD_IDENT_ATTR  0x001  /* PRESENT (no other attributes) */
 #endif
 
+/*
+ * Advertise that we call the Speculative Page Fault handler.
+ */
+#ifdef CONFIG_X86_64
+#define __HAVE_ARCH_CALL_SPF
+#endif
+
 #ifdef CONFIG_X86_32
 # include <asm/pgtable_32_types.h>
 #else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b836a7274e12..652af5524f42 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef __HAVE_ARCH_CALL_SPF
+   if (error_code & PF_USER) {
+   fault = handle_speculative_fault(mm, address, flags);
+
+   /*
+* We also check against VM_FAULT_ERROR because we have to
+* raise a signal by calling later mm_fault_error() which
+* requires the vma pointer to be set. So in that case,
+* we fall through the normal path.
+*/
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1,
+ regs, address);
+   goto done;
+   }
+   }
+#endif /* __HAVE_ARCH_CALL_SPF */
+
/*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in
@@ -1474,6 +1492,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
return;
}
 
+done:
/*
 * Major/minor page fault accounting. If any of the events
 * returned VM_FAULT_MAJOR, we account it as a major fault.
-- 
2.7.4

[PATCH v3 17/20] perf: Add a speculative page fault sw event

2017-09-08 Thread Laurent Dufour

Add a new software event to count successful speculative page faults.

Signed-off-by: Laurent Dufour 
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 140ae638cfd6..101e509ee39b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -111,6 +111,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF   = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
-- 
2.7.4

[PATCH v3 08/20] mm: Protect SPF handler against anon_vma changes

2017-09-08 Thread Laurent Dufour

The speculative page fault handler must be protected against anon_vma
changes. This is because page_add_new_anon_rmap() is called during the
speculative path.

In addition, don't try a speculative page fault if the VMA doesn't have
an anon_vma structure allocated, because its allocation should be
protected by the mmap_sem.

In __vma_adjust() when importer->anon_vma is set, there is no need to
protect against speculative page faults since speculative page fault
is aborted if the vma->anon_vma is not set.

When calling page_add_new_anon_rmap(), vma->anon_vma is necessarily
valid since we checked for it when locking the pte, and the anon_vma is
removed once the pte is unlocked. So even if the speculative page
fault handler is running concurrently with do_unmap(), the change will
be detected when locking the pte: the pte is locked in unmap_region() -
through unmap_vmas() - and the anon_vma is unlinked later, and we check
the vma sequence counter, which is updated in unmap_page_range() before
locking the pte and then again in free_pgtables().

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 mm/memory.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index f008042ab24e..401b13cbfc3c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -617,7 +617,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * Hide vma from rmap and truncate_pagecache before freeing
 * pgtables
 */
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
 
if (is_vm_hugetlb_page(vma)) {
@@ -631,7 +633,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
   && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
-- 
2.7.4

[PATCH v3 08/20] mm: Protect SPF handler against anon_vma changes

2017-09-08 Thread Laurent Dufour

The speculative page fault handler must be protected against anon_vma
changes. This is because page_add_new_anon_rmap() is called during the
speculative path.

In addition, don't try a speculative page fault if the VMA doesn't have
an anon_vma structure allocated, because its allocation should be
protected by the mmap_sem.

In __vma_adjust() when importer->anon_vma is set, there is no need to
protect against speculative page faults since speculative page fault
is aborted if the vma->anon_vma is not set.

When calling page_add_new_anon_rmap(), vma->anon_vma is necessarily
valid since we checked for it when locking the pte, and the anon_vma is
removed once the pte is unlocked. So even if the speculative page
fault handler is running concurrently with do_unmap(), the change will
be detected when locking the pte: the pte is locked in unmap_region() -
through unmap_vmas() - and the anon_vma is unlinked later, and we check
the vma sequence counter, which is updated in unmap_page_range() before
locking the pte and then again in free_pgtables().

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index f008042ab24e..401b13cbfc3c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -617,7 +617,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * Hide vma from rmap and truncate_pagecache before freeing
 * pgtables
 */
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
 
if (is_vm_hugetlb_page(vma)) {
@@ -631,7 +633,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
   && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
-- 
2.7.4

[PATCH v3 10/20] mm: Introduce __lru_cache_add_active_or_unevictable

2017-09-08 Thread Laurent Dufour

The speculative page fault handler, which runs without holding the
mmap_sem, calls lru_cache_add_active_or_unevictable(), but the vm_flags
value is not guaranteed to remain constant.
Introduce __lru_cache_add_active_or_unevictable(), which takes the vma
flags value as a parameter instead of the vma pointer.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/swap.h | 11 +--
 mm/memory.c  |  8 
 mm/swap.c| 12 ++--
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8a807292037f..9b4dbb98af89 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -323,8 +323,15 @@ extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-   struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+   unsigned long vma_flags);
+
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+   struct vm_area_struct *vma)
+{
+   return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
+
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/mm/memory.c b/mm/memory.c
index a4982917c16e..4583f354be94 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2509,7 +2509,7 @@ static int wp_page_copy(struct vm_fault *vmf)
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(new_page, vma);
+   __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
 * We call the notify macro here because, when using secondary
 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2998,7 +2998,7 @@ int do_swap_page(struct vm_fault *vmf)
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
}
 
swap_free(entry);
@@ -3144,7 +3144,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3396,7 +3396,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/swap.c b/mm/swap.c
index 9295ae960d66..b084bb16d769 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -470,21 +470,21 @@ void add_page_to_unevictable_list(struct page *page)
 }
 
 /**
- * lru_cache_add_active_or_unevictable
- * @page:  the page to be added to LRU
- * @vma:   vma in which page is mapped for determining reclaimability
+ * __lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma_flags:  vma in which page is mapped for determining reclaimability
  *
  * Place @page on the active or unevictable LRU list, depending on its
  * evictability.  Note that if the page is not evictable, it goes
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+  unsigned long vma_flags)
 {
VM_BUG_ON_PAGE(PageLRU(page), page);
 
-   if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+   if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
SetPageActive(page);
lru_cache_add(page);
return;
-- 
2.7.4

[PATCH v3 10/20] mm: Introduce __lru_cache_add_active_or_unevictable

2017-09-08 Thread Laurent Dufour

The speculative page fault handler, which runs without holding the
mmap_sem, calls lru_cache_add_active_or_unevictable(), but the vm_flags
value is not guaranteed to remain constant.
Introduce __lru_cache_add_active_or_unevictable(), which takes the vma
flags value as a parameter instead of the vma pointer.

Signed-off-by: Laurent Dufour 
---
 include/linux/swap.h | 11 +--
 mm/memory.c  |  8 
 mm/swap.c| 12 ++--
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8a807292037f..9b4dbb98af89 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -323,8 +323,15 @@ extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-   struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+   unsigned long vma_flags);
+
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+   struct vm_area_struct *vma)
+{
+   return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
+
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/mm/memory.c b/mm/memory.c
index a4982917c16e..4583f354be94 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2509,7 +2509,7 @@ static int wp_page_copy(struct vm_fault *vmf)
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(new_page, vma);
+   __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
 * We call the notify macro here because, when using secondary
 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2998,7 +2998,7 @@ int do_swap_page(struct vm_fault *vmf)
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
}
 
swap_free(entry);
@@ -3144,7 +3144,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3396,7 +3396,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/swap.c b/mm/swap.c
index 9295ae960d66..b084bb16d769 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -470,21 +470,21 @@ void add_page_to_unevictable_list(struct page *page)
 }
 
 /**
- * lru_cache_add_active_or_unevictable
- * @page:  the page to be added to LRU
- * @vma:   vma in which page is mapped for determining reclaimability
+ * __lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma_flags:  vma in which page is mapped for determining reclaimability
  *
  * Place @page on the active or unevictable LRU list, depending on its
  * evictability.  Note that if the page is not evictable, it goes
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+  unsigned long vma_flags)
 {
VM_BUG_ON_PAGE(PageLRU(page), page);
 
-   if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+   if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
SetPageActive(page);
lru_cache_add(page);
return;
-- 
2.7.4

[PATCH v3 02/20] mm: Prepare for FAULT_FLAG_SPECULATIVE

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

When speculating faults (without holding mmap_sem) we need to validate
that the vma against which we loaded pages is still valid when we're
ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock() which can fail in case we find the VMA changed
since we started the fault.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Port to 4.12 kernel]
[Remove the comment about the fault_env structure which has been
 implemented as the vm_fault structure in the kernel]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm.h |  1 +
 mm/memory.c| 55 ++
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 68be41b31ad0..46e769a5a7ab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -291,6 +291,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER0x40/* The fault originated in 
userspace */
 #define FAULT_FLAG_REMOTE  0x80/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction 
fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding 
mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
diff --git a/mm/memory.c b/mm/memory.c
index 30bccfa00630..13c8c3c8b5e4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2408,6 +2408,12 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, 
&vmf->ptl);
+   return true;
+}
+
 /*
  * Handle the case of a page which we actually need to copy to a new page.
  *
@@ -2435,6 +2441,7 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+   int ret = VM_FAULT_OOM;
 
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2462,7 +2469,11 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
-   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   mem_cgroup_cancel_charge(new_page, memcg, false);
+   ret = VM_FAULT_RETRY;
+   goto oom_free_new;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2550,7 +2561,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 oom:
if (old_page)
put_page(old_page);
-   return VM_FAULT_OOM;
+   return ret;
 }
 
 /**
@@ -2571,8 +2582,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-  &vmf->ptl);
+   if (!pte_map_lock(vmf))
+   return VM_FAULT_RETRY;
/*
 * We might have raced with another page fault while we released the
 * pte_offset_map_lock.
@@ -2690,8 +2701,11 @@ static int do_wp_page(struct vm_fault *vmf)
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
lock_page(vmf->page);
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   unlock_page(vmf->page);
+   put_page(vmf->page);
+   return VM_FAULT_RETRY;
+   }
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2868,8 +2882,10 @@ int do_swap_page(struct vm_fault *vmf)
 * Back out if somebody else faulted in this pte
 * while we released the pte lock.
 */
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   ret

[PATCH v3 02/20] mm: Prepare for FAULT_FLAG_SPECULATIVE

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra 

When speculating faults (without holding mmap_sem) we need to validate
that the vma against which we loaded pages is still valid when we're
ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock() which can fail in case we find the VMA changed
since we started the fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Remove the comment about the fault_env structure which has been
 implemented as the vm_fault structure in the kernel]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  1 +
 mm/memory.c| 55 ++
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 68be41b31ad0..46e769a5a7ab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -291,6 +291,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER0x40/* The fault originated in 
userspace */
 #define FAULT_FLAG_REMOTE  0x80/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction 
fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding 
mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
diff --git a/mm/memory.c b/mm/memory.c
index 30bccfa00630..13c8c3c8b5e4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2408,6 +2408,12 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, 
&vmf->ptl);
+   return true;
+}
+
 /*
  * Handle the case of a page which we actually need to copy to a new page.
  *
@@ -2435,6 +2441,7 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+   int ret = VM_FAULT_OOM;
 
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2462,7 +2469,11 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
-   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   mem_cgroup_cancel_charge(new_page, memcg, false);
+   ret = VM_FAULT_RETRY;
+   goto oom_free_new;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2550,7 +2561,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 oom:
if (old_page)
put_page(old_page);
-   return VM_FAULT_OOM;
+   return ret;
 }
 
 /**
@@ -2571,8 +2582,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-  &vmf->ptl);
+   if (!pte_map_lock(vmf))
+   return VM_FAULT_RETRY;
/*
 * We might have raced with another page fault while we released the
 * pte_offset_map_lock.
@@ -2690,8 +2701,11 @@ static int do_wp_page(struct vm_fault *vmf)
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
lock_page(vmf->page);
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   unlock_page(vmf->page);
+   put_page(vmf->page);
+   return VM_FAULT_RETRY;
+   }
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2868,8 +2882,10 @@ int do_swap_page(struct vm_fault *vmf)
 * Back out if somebody else faulted in this pte
 * while we released the pte lock.
 */
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   return VM_FAULT_RETRY;
+   }
if (likely(pte_same(*vmf->pte, vmf->o

[PATCH v3 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

50/
[5] https://lwn.net/Articles/725607/

Laurent Dufour (14):
  mm: Introduce pte_spinlock for FAULT_FLAG_SPECULATIVE
  mm: Protect VMA modifications using VMA sequence count
  mm: Cache some VMA fields in the vm_fault structure
  mm: Protect SPF handler against anon_vma changes
  mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()
  mm: Introduce __lru_cache_add_active_or_unevictable
  mm: Introduce __maybe_mkwrite()
  mm: Introduce __vm_normal_page()
  mm: Introduce __page_add_new_anon_rmap()
  mm: Try spin lock in speculative path
  mm: Adding speculative page fault failure trace events
  perf: Add a speculative page fault sw event
  perf tools: Add support for the SPF perf event
  powerpc/mm: Add speculative page fault

Peter Zijlstra (6):
  mm: Dont assume page-table invariance during faults
  mm: Prepare for FAULT_FLAG_SPECULATIVE
  mm: VMA sequence count
  mm: RCU free VMAs
  mm: Provide speculative fault infrastructure
  x86/mm: Add speculative pagefault handling

 arch/powerpc/include/asm/book3s/64/pgtable.h |   5 +
 arch/powerpc/mm/fault.c  |  15 +
 arch/x86/include/asm/pgtable_types.h |   7 +
 arch/x86/mm/fault.c  |  19 ++
 fs/proc/task_mmu.c   |   5 +-
 fs/userfaultfd.c |  17 +-
 include/linux/hugetlb_inline.h   |   2 +-
 include/linux/migrate.h  |   4 +-
 include/linux/mm.h   |  28 +-
 include/linux/mm_types.h |   3 +
 include/linux/pagemap.h  |   4 +-
 include/linux/rmap.h |  12 +-
 include/linux/swap.h |  11 +-
 include/trace/events/pagefault.h |  87 +
 include/uapi/linux/perf_event.h  |   1 +
 kernel/fork.c|   1 +
 mm/hugetlb.c |   2 +
 mm/init-mm.c |   1 +
 mm/internal.h|  19 ++
 mm/khugepaged.c  |   5 +
 mm/madvise.c |   6 +-
 mm/memory.c  | 478 ++-
 mm/mempolicy.c   |  51 ++-
 mm/migrate.c |   4 +-
 mm/mlock.c   |  13 +-
 mm/mmap.c| 138 ++--
 mm/mprotect.c|   4 +-
 mm/mremap.c  |   7 +
 mm/rmap.c|   5 +-
 mm/swap.c|  12 +-
 tools/include/uapi/linux/perf_event.h|   1 +
 tools/perf/util/evsel.c  |   1 +
 tools/perf/util/parse-events.c   |   4 +
 tools/perf/util/parse-events.l   |   1 +
 tools/perf/util/python.c |   1 +
 35 files changed, 796 insertions(+), 178 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

-- 
2.7.4

[PATCH v3 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

50/
[5] https://lwn.net/Articles/725607/

Laurent Dufour (14):
  mm: Introduce pte_spinlock for FAULT_FLAG_SPECULATIVE
  mm: Protect VMA modifications using VMA sequence count
  mm: Cache some VMA fields in the vm_fault structure
  mm: Protect SPF handler against anon_vma changes
  mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()
  mm: Introduce __lru_cache_add_active_or_unevictable
  mm: Introduce __maybe_mkwrite()
  mm: Introduce __vm_normal_page()
  mm: Introduce __page_add_new_anon_rmap()
  mm: Try spin lock in speculative path
  mm: Adding speculative page fault failure trace events
  perf: Add a speculative page fault sw event
  perf tools: Add support for the SPF perf event
  powerpc/mm: Add speculative page fault

Peter Zijlstra (6):
  mm: Dont assume page-table invariance during faults
  mm: Prepare for FAULT_FLAG_SPECULATIVE
  mm: VMA sequence count
  mm: RCU free VMAs
  mm: Provide speculative fault infrastructure
  x86/mm: Add speculative pagefault handling

 arch/powerpc/include/asm/book3s/64/pgtable.h |   5 +
 arch/powerpc/mm/fault.c  |  15 +
 arch/x86/include/asm/pgtable_types.h |   7 +
 arch/x86/mm/fault.c  |  19 ++
 fs/proc/task_mmu.c   |   5 +-
 fs/userfaultfd.c |  17 +-
 include/linux/hugetlb_inline.h   |   2 +-
 include/linux/migrate.h  |   4 +-
 include/linux/mm.h   |  28 +-
 include/linux/mm_types.h |   3 +
 include/linux/pagemap.h  |   4 +-
 include/linux/rmap.h |  12 +-
 include/linux/swap.h |  11 +-
 include/trace/events/pagefault.h |  87 +
 include/uapi/linux/perf_event.h  |   1 +
 kernel/fork.c|   1 +
 mm/hugetlb.c |   2 +
 mm/init-mm.c |   1 +
 mm/internal.h|  19 ++
 mm/khugepaged.c  |   5 +
 mm/madvise.c |   6 +-
 mm/memory.c  | 478 ++-
 mm/mempolicy.c   |  51 ++-
 mm/migrate.c |   4 +-
 mm/mlock.c   |  13 +-
 mm/mmap.c| 138 ++--
 mm/mprotect.c|   4 +-
 mm/mremap.c  |   7 +
 mm/rmap.c|   5 +-
 mm/swap.c|  12 +-
 tools/include/uapi/linux/perf_event.h|   1 +
 tools/perf/util/evsel.c  |   1 +
 tools/perf/util/parse-events.c   |   4 +
 tools/perf/util/parse-events.l   |   1 +
 tools/perf/util/python.c |   1 +
 35 files changed, 796 insertions(+), 178 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

-- 
2.7.4

Re: [PATCH v3 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

On 08/09/2017 19:32, Laurent Dufour wrote:
> This is a port on kernel 4.13 of the work done by Peter Zijlstra to
> handle page fault without holding the mm semaphore [1].

Sorry for the noise, I had trouble sending the whole series through this
email. I will try again.

Cheers,
Laurent.

Re: [PATCH v3 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

On 08/09/2017 19:32, Laurent Dufour wrote:
> This is a port on kernel 4.13 of the work done by Peter Zijlstra to
> handle page fault without holding the mm semaphore [1].

Sorry for the noise, I had trouble sending the whole series through this
email. I will try again.

Cheers,
Laurent.

[PATCH v3 01/20] mm: Dont assume page-table invariance during faults

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra 

One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the reliance on the pte pointer.

Signed-off-by: Peter Zijlstra (Intel) 
---
 mm/memory.c | 29 -
 1 file changed, 29 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index ec4e15494901..30bccfa00630 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2270,30 +2270,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry which was
- * read non-atomically.  Before making any commitment, on those architectures
- * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
- * parts, do_swap_page must check under lock before unmapping the pte and
- * proceeding (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-   pte_t *page_table, pte_t orig_pte)
-{
-   int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-   if (sizeof(pte_t) > sizeof(unsigned long)) {
-   spinlock_t *ptl = pte_lockptr(mm, pmd);
-   spin_lock(ptl);
-   same = pte_same(*page_table, orig_pte);
-   spin_unlock(ptl);
-   }
-#endif
-   pte_unmap(page_table);
-   return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)
 {
debug_dma_assert_idle(src);
@@ -2854,11 +2830,6 @@ int do_swap_page(struct vm_fault *vmf)
 
if (vma_readahead)
page = swap_readahead_detect(vmf, _ra);
-   if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
-   if (page)
-   put_page(page);
-   goto out;
-   }
 
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
-- 
2.7.4

[PATCH v3 01/20] mm: Dont assume page-table invariance during faults

2017-09-08 Thread Laurent Dufour

From: Peter Zijlstra 

One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the reliance on the pte pointer.

Signed-off-by: Peter Zijlstra (Intel) 
---
 mm/memory.c | 29 -
 1 file changed, 29 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index ec4e15494901..30bccfa00630 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2270,30 +2270,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry which was
- * read non-atomically.  Before making any commitment, on those architectures
- * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
- * parts, do_swap_page must check under lock before unmapping the pte and
- * proceeding (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-   pte_t *page_table, pte_t orig_pte)
-{
-   int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-   if (sizeof(pte_t) > sizeof(unsigned long)) {
-   spinlock_t *ptl = pte_lockptr(mm, pmd);
-   spin_lock(ptl);
-   same = pte_same(*page_table, orig_pte);
-   spin_unlock(ptl);
-   }
-#endif
-   pte_unmap(page_table);
-   return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)
 {
debug_dma_assert_idle(src);
@@ -2854,11 +2830,6 @@ int do_swap_page(struct vm_fault *vmf)
 
if (vma_readahead)
page = swap_readahead_detect(vmf, _ra);
-   if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
-   if (page)
-   put_page(page);
-   goto out;
-   }
 
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
-- 
2.7.4

[PATCH v3 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

50/
[5] https://lwn.net/Articles/725607/

Laurent Dufour (14):
  mm: Introduce pte_spinlock for FAULT_FLAG_SPECULATIVE
  mm: Protect VMA modifications using VMA sequence count
  mm: Cache some VMA fields in the vm_fault structure
  mm: Protect SPF handler against anon_vma changes
  mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()
  mm: Introduce __lru_cache_add_active_or_unevictable
  mm: Introduce __maybe_mkwrite()
  mm: Introduce __vm_normal_page()
  mm: Introduce __page_add_new_anon_rmap()
  mm: Try spin lock in speculative path
  mm: Adding speculative page fault failure trace events
  perf: Add a speculative page fault sw event
  perf tools: Add support for the SPF perf event
  powerpc/mm: Add speculative page fault

Peter Zijlstra (6):
  mm: Dont assume page-table invariance during faults
  mm: Prepare for FAULT_FLAG_SPECULATIVE
  mm: VMA sequence count
  mm: RCU free VMAs
  mm: Provide speculative fault infrastructure
  x86/mm: Add speculative pagefault handling

 arch/powerpc/include/asm/book3s/64/pgtable.h |   5 +
 arch/powerpc/mm/fault.c  |  15 +
 arch/x86/include/asm/pgtable_types.h |   7 +
 arch/x86/mm/fault.c  |  19 ++
 fs/proc/task_mmu.c   |   5 +-
 fs/userfaultfd.c |  17 +-
 include/linux/hugetlb_inline.h   |   2 +-
 include/linux/migrate.h  |   4 +-
 include/linux/mm.h   |  28 +-
 include/linux/mm_types.h |   3 +
 include/linux/pagemap.h  |   4 +-
 include/linux/rmap.h |  12 +-
 include/linux/swap.h |  11 +-
 include/trace/events/pagefault.h |  87 +
 include/uapi/linux/perf_event.h  |   1 +
 kernel/fork.c|   1 +
 mm/hugetlb.c |   2 +
 mm/init-mm.c |   1 +
 mm/internal.h|  19 ++
 mm/khugepaged.c  |   5 +
 mm/madvise.c |   6 +-
 mm/memory.c  | 478 ++-
 mm/mempolicy.c   |  51 ++-
 mm/migrate.c |   4 +-
 mm/mlock.c   |  13 +-
 mm/mmap.c| 138 ++--
 mm/mprotect.c|   4 +-
 mm/mremap.c  |   7 +
 mm/rmap.c|   5 +-
 mm/swap.c|  12 +-
 tools/include/uapi/linux/perf_event.h|   1 +
 tools/perf/util/evsel.c  |   1 +
 tools/perf/util/parse-events.c   |   4 +
 tools/perf/util/parse-events.l   |   1 +
 tools/perf/util/python.c |   1 +
 35 files changed, 796 insertions(+), 178 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

-- 
2.7.4

[PATCH v3 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

50/
[5] https://lwn.net/Articles/725607/

Laurent Dufour (14):
  mm: Introduce pte_spinlock for FAULT_FLAG_SPECULATIVE
  mm: Protect VMA modifications using VMA sequence count
  mm: Cache some VMA fields in the vm_fault structure
  mm: Protect SPF handler against anon_vma changes
  mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()
  mm: Introduce __lru_cache_add_active_or_unevictable
  mm: Introduce __maybe_mkwrite()
  mm: Introduce __vm_normal_page()
  mm: Introduce __page_add_new_anon_rmap()
  mm: Try spin lock in speculative path
  mm: Adding speculative page fault failure trace events
  perf: Add a speculative page fault sw event
  perf tools: Add support for the SPF perf event
  powerpc/mm: Add speculative page fault

Peter Zijlstra (6):
  mm: Dont assume page-table invariance during faults
  mm: Prepare for FAULT_FLAG_SPECULATIVE
  mm: VMA sequence count
  mm: RCU free VMAs
  mm: Provide speculative fault infrastructure
  x86/mm: Add speculative pagefault handling

 arch/powerpc/include/asm/book3s/64/pgtable.h |   5 +
 arch/powerpc/mm/fault.c  |  15 +
 arch/x86/include/asm/pgtable_types.h |   7 +
 arch/x86/mm/fault.c  |  19 ++
 fs/proc/task_mmu.c   |   5 +-
 fs/userfaultfd.c |  17 +-
 include/linux/hugetlb_inline.h   |   2 +-
 include/linux/migrate.h  |   4 +-
 include/linux/mm.h   |  28 +-
 include/linux/mm_types.h |   3 +
 include/linux/pagemap.h  |   4 +-
 include/linux/rmap.h |  12 +-
 include/linux/swap.h |  11 +-
 include/trace/events/pagefault.h |  87 +
 include/uapi/linux/perf_event.h  |   1 +
 kernel/fork.c|   1 +
 mm/hugetlb.c |   2 +
 mm/init-mm.c |   1 +
 mm/internal.h|  19 ++
 mm/khugepaged.c  |   5 +
 mm/madvise.c |   6 +-
 mm/memory.c  | 478 ++-
 mm/mempolicy.c   |  51 ++-
 mm/migrate.c |   4 +-
 mm/mlock.c   |  13 +-
 mm/mmap.c| 138 ++--
 mm/mprotect.c|   4 +-
 mm/mremap.c  |   7 +
 mm/rmap.c|   5 +-
 mm/swap.c|  12 +-
 tools/include/uapi/linux/perf_event.h|   1 +
 tools/perf/util/evsel.c  |   1 +
 tools/perf/util/parse-events.c   |   4 +
 tools/perf/util/parse-events.l   |   1 +
 tools/perf/util/python.c |   1 +
 35 files changed, 796 insertions(+), 178 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

-- 
2.7.4

Re: [PATCH v2 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

On 21/08/2017 04:26, Sergey Senozhatsky wrote:
> Hello,
> 
> On (08/18/17 00:04), Laurent Dufour wrote:
>> This is a port on kernel 4.13 of the work done by Peter Zijlstra to
>> handle page fault without holding the mm semaphore [1].
>>
>> The idea is to try to handle user space page faults without holding the
>> mmap_sem. This should allow better concurrency for massively threaded
>> processes since the page fault handler will not wait for other threads' memory
>> layout changes to be done, assuming that the change is done in another part
>> of the process's memory space. This type of page fault is named speculative
>> page fault. If the speculative page fault fails because a concurrent change is
>> detected or because the underlying PMD or PTE tables are not yet allocated, its
>> processing is aborted and a classic page fault is then tried.
>>
>> The speculative page fault (SPF) has to look for the VMA matching the fault
>> address without holding the mmap_sem, so the VMA list is now managed using
>> SRCU allowing lockless walking. The only impact would be the deferred file
>> dereferencing in the case of a file mapping, since the file pointer is
>> released once the SRCU cleaning is done.  This patch relies on the change
>> done recently by Paul McKenney in SRCU which now runs a callback per CPU
>> instead of per SRCU structure [1].
>>
>> The VMA's attributes checked during the speculative page fault processing
>> have to be protected against parallel changes. This is done by using a per
>> VMA sequence lock. This sequence lock allows the speculative page fault
>> handler to fast check for parallel changes in progress and to abort the
>> speculative page fault in that case.
>>
>> Once the VMA is found, the speculative page fault handler would check for
>> the VMA's attributes to verify that the page fault has to be handled
>> correctly or not. Thus the VMA is protected through a sequence lock which
>> allows fast detection of concurrent VMA changes. If such a change is
>> detected, the speculative page fault is aborted and a *classic* page fault
>> is tried.  VMA sequence locks are added when VMA attributes which are
>> checked during the page fault are modified.
>>
>> When the PTE is fetched, the VMA is checked to see if it has been changed,
>> so once the page table is locked, the VMA is valid, so any other changes
>> leading to touching this PTE will need to lock the page table, so no
>> parallel change is possible at this time.
> 
> [ 2311.315400] ==
> [ 2311.315401] WARNING: possible circular locking dependency detected
> [ 2311.315403] 4.13.0-rc5-next-20170817-dbg-00039-gaf11d7500492-dirty #1743 
> Not tainted
> [ 2311.315404] --
> [ 2311.315406] khugepaged/43 is trying to acquire lock:
> [ 2311.315407]  (>i_mmap_rwsem){}, at: [] 
> rmap_walk_file+0x5a/0x147
> [ 2311.315415] 
>but task is already holding lock:
> [ 2311.315416]  (fs_reclaim){+.+.}, at: [] 
> fs_reclaim_acquire+0x12/0x35
> [ 2311.315420] 
>which lock already depends on the new lock.
> 
> [ 2311.315422] 
>the existing dependency chain (in reverse order) is:
> [ 2311.315423] 
>-> #3 (fs_reclaim){+.+.}:
> [ 2311.315427]fs_reclaim_acquire+0x32/0x35
> [ 2311.315429]__alloc_pages_nodemask+0x8d/0x217
> [ 2311.315432]pte_alloc_one+0x13/0x5e
> [ 2311.315434]__pte_alloc+0x1f/0x83
> [ 2311.315436]move_page_tables+0x2c9/0x5ac
> [ 2311.315438]move_vma.isra.25+0xff/0x2a2
> [ 2311.315439]SyS_mremap+0x41b/0x49e
> [ 2311.315442]entry_SYSCALL_64_fastpath+0x18/0xad
> [ 2311.315443] 
>-> #2 (>vm_sequence/1){+.+.}:
> [ 2311.315449]write_seqcount_begin_nested+0x1b/0x1d
> [ 2311.315451]__vma_adjust+0x1b7/0x5d6
> [ 2311.315453]__split_vma+0x142/0x1a3
> [ 2311.315454]do_munmap+0x128/0x2af
> [ 2311.315455]vm_munmap+0x5a/0x73
> [ 2311.315458]elf_map+0xb1/0xce
> [ 2311.315459]load_elf_binary+0x8e0/0x1348
> [ 2311.315462]search_binary_handler+0x70/0x1f3
> [ 2311.315464]load_script+0x1a6/0x1b5
> [ 2311.315466]search_binary_handler+0x70/0x1f3
> [ 2311.315468]do_execveat_common+0x461/0x691
> [ 2311.315471]kernel_init+0x5a/0xf0
> [ 2311.315472]ret_from_fork+0x27/0x40
> [ 2311.315473] 
>-> #1 (>vm_sequence){+.+.}:
> [ 2311.315478]write_seqcount_begin_nested+0x1b/0x1d
> [ 2311.315480]__vma_adjust+0x19c/0x5d

Re: [PATCH v2 00/20] Speculative page faults

2017-09-08 Thread Laurent Dufour

On 21/08/2017 04:26, Sergey Senozhatsky wrote:
> Hello,
> 
> On (08/18/17 00:04), Laurent Dufour wrote:
>> This is a port on kernel 4.13 of the work done by Peter Zijlstra to
>> handle page fault without holding the mm semaphore [1].
>>
>> The idea is to try to handle user space page faults without holding the
>> mmap_sem. This should allow better concurrency for massively threaded
>> processes since the page fault handler will not wait for other threads' memory
>> layout changes to be done, assuming that the change is done in another part
>> of the process's memory space. This type of page fault is named speculative
>> page fault. If the speculative page fault fails because a concurrent change is
>> detected or because the underlying PMD or PTE tables are not yet allocated, its
>> processing is aborted and a classic page fault is then tried.
>>
>> The speculative page fault (SPF) has to look for the VMA matching the fault
>> address without holding the mmap_sem, so the VMA list is now managed using
>> SRCU allowing lockless walking. The only impact would be the deferred file
>> dereferencing in the case of a file mapping, since the file pointer is
>> released once the SRCU cleaning is done.  This patch relies on the change
>> done recently by Paul McKenney in SRCU which now runs a callback per CPU
>> instead of per SRCU structure [1].
>>
>> The VMA's attributes checked during the speculative page fault processing
>> have to be protected against parallel changes. This is done by using a per
>> VMA sequence lock. This sequence lock allows the speculative page fault
>> handler to fast check for parallel changes in progress and to abort the
>> speculative page fault in that case.
>>
>> Once the VMA is found, the speculative page fault handler would check for
>> the VMA's attributes to verify that the page fault has to be handled
>> correctly or not. Thus the VMA is protected through a sequence lock which
>> allows fast detection of concurrent VMA changes. If such a change is
>> detected, the speculative page fault is aborted and a *classic* page fault
>> is tried.  VMA sequence locks are added when VMA attributes which are
>> checked during the page fault are modified.
>>
>> When the PTE is fetched, the VMA is checked to see if it has been changed,
>> so once the page table is locked, the VMA is valid, so any other changes
>> leading to touching this PTE will need to lock the page table, so no
>> parallel change is possible at this time.
> 
> [ 2311.315400] ==
> [ 2311.315401] WARNING: possible circular locking dependency detected
> [ 2311.315403] 4.13.0-rc5-next-20170817-dbg-00039-gaf11d7500492-dirty #1743 
> Not tainted
> [ 2311.315404] --
> [ 2311.315406] khugepaged/43 is trying to acquire lock:
> [ 2311.315407]  (>i_mmap_rwsem){}, at: [] 
> rmap_walk_file+0x5a/0x147
> [ 2311.315415] 
>but task is already holding lock:
> [ 2311.315416]  (fs_reclaim){+.+.}, at: [] 
> fs_reclaim_acquire+0x12/0x35
> [ 2311.315420] 
>which lock already depends on the new lock.
> 
> [ 2311.315422] 
>the existing dependency chain (in reverse order) is:
> [ 2311.315423] 
>-> #3 (fs_reclaim){+.+.}:
> [ 2311.315427]fs_reclaim_acquire+0x32/0x35
> [ 2311.315429]__alloc_pages_nodemask+0x8d/0x217
> [ 2311.315432]pte_alloc_one+0x13/0x5e
> [ 2311.315434]__pte_alloc+0x1f/0x83
> [ 2311.315436]move_page_tables+0x2c9/0x5ac
> [ 2311.315438]move_vma.isra.25+0xff/0x2a2
> [ 2311.315439]SyS_mremap+0x41b/0x49e
> [ 2311.315442]entry_SYSCALL_64_fastpath+0x18/0xad
> [ 2311.315443] 
>-> #2 (>vm_sequence/1){+.+.}:
> [ 2311.315449]write_seqcount_begin_nested+0x1b/0x1d
> [ 2311.315451]__vma_adjust+0x1b7/0x5d6
> [ 2311.315453]__split_vma+0x142/0x1a3
> [ 2311.315454]do_munmap+0x128/0x2af
> [ 2311.315455]vm_munmap+0x5a/0x73
> [ 2311.315458]elf_map+0xb1/0xce
> [ 2311.315459]load_elf_binary+0x8e0/0x1348
> [ 2311.315462]search_binary_handler+0x70/0x1f3
> [ 2311.315464]load_script+0x1a6/0x1b5
> [ 2311.315466]search_binary_handler+0x70/0x1f3
> [ 2311.315468]do_execveat_common+0x461/0x691
> [ 2311.315471]kernel_init+0x5a/0xf0
> [ 2311.315472]ret_from_fork+0x27/0x40
> [ 2311.315473] 
>-> #1 (>vm_sequence){+.+.}:
> [ 2311.315478]write_seqcount_begin_nested+0x1b/0x1d
> [ 2311.315480]__vma_adjust+0x19c/0x5d

Re: [PATCH] powerpc/mm: Fix missing mmap_sem release

2017-09-08 Thread Laurent Dufour

On 07/09/2017 22:51, Davidlohr Bueso wrote:
> On Thu, 07 Sep 2017, Laurent Dufour wrote:
> 
>> The commit b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()") reviewed
>> the way the error path is managed in __do_page_fault() but it was a bit too
>> aggressive when handling a case by returning without releasing the mmap_sem.
>>
>> By the way, replacing current->mm->mmap_sem by mm->mmap_sem as mm is set to
>> current->mm.
>>
>> Fixes: b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()")
>> Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
>> Signed-off-by: Laurent Dufour <laurent@free.fr>
>> ---
>> arch/powerpc/mm/fault.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>> index 4797d08581ce..f799ccf37d27 100644
>> --- a/arch/powerpc/mm/fault.c
>> +++ b/arch/powerpc/mm/fault.c
> 
> But... here:
> 
> /*
>  * If we need to retry the mmap_sem has already been released,
>  * and if there is a fatal signal pending there is no guarantee
>  * that we made any progress. Handle this case first.
>  */
> 
>> @@ -521,10 +521,11 @@ static int __do_page_fault(struct pt_regs *regs,
>> unsigned long address,
>>  * User mode? Just return to handle the fatal exception otherwise
>>  * return to bad_page_fault
>>  */
>> +up_read(&mm->mmap_sem);
>> return is_user ? 0 : SIGBUS;
>> }
> 
> Per the above comment, for that case handle_mm_fault()
> has already released mmap_sem. The same occurs in x86,
> for example.

Oops, my bad.

Please forget about this stupid patch...

Re: [PATCH] powerpc/mm: Fix missing mmap_sem release

2017-09-08 Thread Laurent Dufour

On 07/09/2017 22:51, Davidlohr Bueso wrote:
> On Thu, 07 Sep 2017, Laurent Dufour wrote:
> 
>> The commit b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()") reviewed
>> the way the error path is managed in __do_page_fault() but it was a bit too
>> aggressive when handling a case by returning without releasing the mmap_sem.
>>
>> By the way, replacing current->mm->mmap_sem by mm->mmap_sem as mm is set to
>> current->mm.
>>
>> Fixes: b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()")
>> Cc: Benjamin Herrenschmidt 
>> Signed-off-by: Laurent Dufour 
>> ---
>> arch/powerpc/mm/fault.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>> index 4797d08581ce..f799ccf37d27 100644
>> --- a/arch/powerpc/mm/fault.c
>> +++ b/arch/powerpc/mm/fault.c
> 
> But... here:
> 
> /*
>  * If we need to retry the mmap_sem has already been released,
>  * and if there is a fatal signal pending there is no guarantee
>  * that we made any progress. Handle this case first.
>  */
> 
>> @@ -521,10 +521,11 @@ static int __do_page_fault(struct pt_regs *regs,
>> unsigned long address,
>>  * User mode? Just return to handle the fatal exception otherwise
>>  * return to bad_page_fault
>>  */
>> +up_read(&mm->mmap_sem);
>> return is_user ? 0 : SIGBUS;
>> }
> 
> Per the above comment, for that case handle_mm_fault()
> has already released mmap_sem. The same occurs in x86,
> for example.

Oops, my bad.

Please forget about this stupid patch...

[PATCH] powerpc/mm: Fix missing mmap_sem release

2017-09-07 Thread Laurent Dufour

The commit b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()") reviewed
the way the error path is managed in __do_page_fault() but it was a bit too
aggressive when handling a case by returning without releasing the mmap_sem.

By the way, replacing current->mm->mmap_sem by mm->mmap_sem as mm is set to
current->mm.

Fixes: b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()")
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Signed-off-by: Laurent Dufour <laurent@free.fr>
---
 arch/powerpc/mm/fault.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4797d08581ce..f799ccf37d27 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -521,10 +521,11 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 * User mode? Just return to handle the fatal exception 
otherwise
 * return to bad_page_fault
 */
+   up_read(&mm->mmap_sem);
return is_user ? 0 : SIGBUS;
}
 
-   up_read(&current->mm->mmap_sem);
+   up_read(&mm->mmap_sem);
 
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
-- 
2.7.4

[PATCH] powerpc/mm: Fix missing mmap_sem release

2017-09-07 Thread Laurent Dufour

The commit b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()") reviewed
the way the error path is managed in __do_page_fault() but it was a bit too
aggressive when handling a case by returning without releasing the mmap_sem.

By the way, replacing current->mm->mmap_sem by mm->mmap_sem as mm is set to
current->mm.

Fixes: b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()")
Cc: Benjamin Herrenschmidt 
Signed-off-by: Laurent Dufour 
---
 arch/powerpc/mm/fault.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4797d08581ce..f799ccf37d27 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -521,10 +521,11 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 * User mode? Just return to handle the fatal exception 
otherwise
 * return to bad_page_fault
 */
+   up_read(&mm->mmap_sem);
return is_user ? 0 : SIGBUS;
}
 
-   up_read(&current->mm->mmap_sem);
+   up_read(&mm->mmap_sem);
 
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
-- 
2.7.4

Re: [HMM-v25 10/19] mm/memcontrol: support MEMORY_DEVICE_PRIVATE v4

2017-09-05 Thread Laurent Dufour

On 17/08/2017 02:05, Jérôme Glisse wrote:
> HMM pages (private or public device pages) are ZONE_DEVICE pages and
> thus need special handling when it comes to lru or refcount. This
> patch makes sure that memcontrol properly handles those when it faces
> them. Those pages are used like regular pages in a process address
> space either as anonymous pages or as file-backed pages. So from the memcg
> point of view we want to handle them like regular pages for now at
> least.
> 
> Changed since v3:
>   - remove public support and move those chunk to separate patch
> Changed since v2:
>   - s/host/public
> Changed since v1:
>   - s/public/host
>   - add comments explaining how device memory behave and why
> 
> Signed-off-by: Jérôme Glisse 
> Acked-by: Balbir Singh 
> Cc: Johannes Weiner 
> Cc: Michal Hocko 
> Cc: Vladimir Davydov 
> Cc: cgro...@vger.kernel.org
> ---
>  kernel/memremap.c |  1 +
>  mm/memcontrol.c   | 52 
>  2 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 398630c1fba3..f42d7483e886 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -492,6 +492,7 @@ void put_zone_device_private_page(struct page *page)
>   __ClearPageWaiters(page);
> 
>   page->mapping = NULL;
> + mem_cgroup_uncharge(page);
> 
>   page->pgmap->page_free(page, page->pgmap->data);
>   } else if (!count)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 604fb3ca8028..977d1cf3493a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4407,12 +4407,13 @@ enum mc_target_type {
>   MC_TARGET_NONE = 0,
>   MC_TARGET_PAGE,
>   MC_TARGET_SWAP,
> + MC_TARGET_DEVICE,
>  };
> 
>  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
>   unsigned long addr, pte_t ptent)
>  {
> - struct page *page = vm_normal_page(vma, addr, ptent);
> + struct page *page = _vm_normal_page(vma, addr, ptent, true);

Hi Jérôme,

As _vm_normal_page() is only defined later, in patch 18, this patch would
break bisectability.

Cheers,
Laurent.

> 
>   if (!page || !page_mapped(page))
>   return NULL;
> @@ -4429,7 +4430,7 @@ static struct page *mc_handle_present_pte(struct 
> vm_area_struct *vma,
>   return page;
>  }
> 
> -#ifdef CONFIG_SWAP
> +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
>  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
>   pte_t ptent, swp_entry_t *entry)
>  {
> @@ -4438,6 +4439,23 @@ static struct page *mc_handle_swap_pte(struct 
> vm_area_struct *vma,
> 
>   if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
>   return NULL;
> +
> + /*
> +  * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
> +  * a device and because they are not accessible by CPU they are store
> +  * as special swap entry in the CPU page table.
> +  */
> + if (is_device_private_entry(ent)) {
> + page = device_private_entry_to_page(ent);
> + /*
> +  * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
> +  * a refcount of 1 when free (unlike normal page)
> +  */
> + if (!page_ref_add_unless(page, 1, 1))
> + return NULL;
> + return page;
> + }
> +
>   /*
>* Because lookup_swap_cache() updates some statistics counter,
>* we call find_get_page() with swapper_space directly.
> @@ -4598,6 +4616,12 @@ static int mem_cgroup_move_account(struct page *page,
>   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>   * target for charge migration. if @target is not NULL, the entry is 
> stored
>   * in target->ent.
> + *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is 
> MEMORY_DEVICE_PRIVATE
> + * (so ZONE_DEVICE page and thus not on the lru). For now we such page is
> + * charge like a regular page would be as for all intent and purposes it 
> is
> + * just special memory taking the place of a regular page.
> + *
> + * See Documentations/vm/hmm.txt and include/linux/hmm.h
>   *
>   * Called with pte lock held.
>   */
> @@ -4626,6 +4650,8 @@ static enum mc_target_type get_mctgt_type(struct 
> vm_area_struct *vma,
>*/
>   if (page->mem_cgroup == mc.from) {
>   ret = MC_TARGET_PAGE;
> + if (is_device_private_page(page))
> + ret = MC_TARGET_DEVICE;
>   if (target)
>   target->page = page;
>   }
> @@ -4693,6 +4719,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t 
> *pmd,
> 
>   ptl = pmd_trans_huge_lock(pmd, vma);
>   if (ptl) {
>

Re: [HMM-v25 10/19] mm/memcontrol: support MEMORY_DEVICE_PRIVATE v4

2017-09-05 Thread Laurent Dufour

On 17/08/2017 02:05, Jérôme Glisse wrote:
> HMM pages (private or public device pages) are ZONE_DEVICE pages and
> thus need special handling when it comes to lru or refcount. This
> patch makes sure that memcontrol properly handles those when it faces
> them. Those pages are used like regular pages in a process address
> space either as anonymous pages or as file-backed pages. So from the memcg
> point of view we want to handle them like regular pages for now at
> least.
> 
> Changed since v3:
>   - remove public support and move those chunk to separate patch
> Changed since v2:
>   - s/host/public
> Changed since v1:
>   - s/public/host
>   - add comments explaining how device memory behave and why
> 
> Signed-off-by: Jérôme Glisse 
> Acked-by: Balbir Singh 
> Cc: Johannes Weiner 
> Cc: Michal Hocko 
> Cc: Vladimir Davydov 
> Cc: cgro...@vger.kernel.org
> ---
>  kernel/memremap.c |  1 +
>  mm/memcontrol.c   | 52 
>  2 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 398630c1fba3..f42d7483e886 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -492,6 +492,7 @@ void put_zone_device_private_page(struct page *page)
>   __ClearPageWaiters(page);
> 
>   page->mapping = NULL;
> + mem_cgroup_uncharge(page);
> 
>   page->pgmap->page_free(page, page->pgmap->data);
>   } else if (!count)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 604fb3ca8028..977d1cf3493a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4407,12 +4407,13 @@ enum mc_target_type {
>   MC_TARGET_NONE = 0,
>   MC_TARGET_PAGE,
>   MC_TARGET_SWAP,
> + MC_TARGET_DEVICE,
>  };
> 
>  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
>   unsigned long addr, pte_t ptent)
>  {
> - struct page *page = vm_normal_page(vma, addr, ptent);
> + struct page *page = _vm_normal_page(vma, addr, ptent, true);

Hi Jérôme,

As _vm_normal_page() is only defined later, in patch 18, this patch would
break bisectability.

Cheers,
Laurent.

> 
>   if (!page || !page_mapped(page))
>   return NULL;
> @@ -4429,7 +4430,7 @@ static struct page *mc_handle_present_pte(struct 
> vm_area_struct *vma,
>   return page;
>  }
> 
> -#ifdef CONFIG_SWAP
> +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
>  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
>   pte_t ptent, swp_entry_t *entry)
>  {
> @@ -4438,6 +4439,23 @@ static struct page *mc_handle_swap_pte(struct 
> vm_area_struct *vma,
> 
>   if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
>   return NULL;
> +
> + /*
> +  * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
> +  * a device and because they are not accessible by CPU they are store
> +  * as special swap entry in the CPU page table.
> +  */
> + if (is_device_private_entry(ent)) {
> + page = device_private_entry_to_page(ent);
> + /*
> +  * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
> +  * a refcount of 1 when free (unlike normal page)
> +  */
> + if (!page_ref_add_unless(page, 1, 1))
> + return NULL;
> + return page;
> + }
> +
>   /*
>* Because lookup_swap_cache() updates some statistics counter,
>* we call find_get_page() with swapper_space directly.
> @@ -4598,6 +4616,12 @@ static int mem_cgroup_move_account(struct page *page,
>   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>   * target for charge migration. if @target is not NULL, the entry is 
> stored
>   * in target->ent.
> + *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is 
> MEMORY_DEVICE_PRIVATE
> + * (so ZONE_DEVICE page and thus not on the lru). For now we such page is
> + * charge like a regular page would be as for all intent and purposes it 
> is
> + * just special memory taking the place of a regular page.
> + *
> + * See Documentations/vm/hmm.txt and include/linux/hmm.h
>   *
>   * Called with pte lock held.
>   */
> @@ -4626,6 +4650,8 @@ static enum mc_target_type get_mctgt_type(struct 
> vm_area_struct *vma,
>*/
>   if (page->mem_cgroup == mc.from) {
>   ret = MC_TARGET_PAGE;
> + if (is_device_private_page(page))
> + ret = MC_TARGET_DEVICE;
>   if (target)
>   target->page = page;
>   }
> @@ -4693,6 +4719,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t 
> *pmd,
> 
>   ptl = pmd_trans_huge_lock(pmd, vma);
>   if (ptl) {
> + /*
> +  * Note their can not be MC_TARGET_DEVICE for now as we do not
> +

Re: [PATCH] mm: Fix mem_cgroup_oom_disable() call missing

2017-09-05 Thread Laurent Dufour

On 05/09/2017 17:46, Kirill A. Shutemov wrote:
> On Tue, Sep 05, 2017 at 05:30:39PM +0200, Laurent Dufour wrote:
>> Seen while reading the code, in handle_mm_fault(), in the case
>> arch_vma_access_permitted() is failing the call to mem_cgroup_oom_disable()
>> is not made.
>>
>> To fix that, move the call to mem_cgroup_oom_enable() after calling
>> arch_vma_access_permitted() as it should not have entered the memcg OOM.
>>
>> Fixes: bae473a423f6 ("mm: introduce fault_env")
>> Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
> 
> Ouch. Sorry for this.
> 
> Acked-by: Kirill A. Shutemov <kir...@shutemov.name>
> 
> Cc: stable@ is needed too.

Andrew, should I resend it with stable in copy?

> 
> It's strange we haven't seen reports of warning from
> mem_cgroup_oom_enable().

AFAIU, arch_vma_access_permitted() is only defined for x86 and it fails
only in the case of a protection key mismatch, a feature that is not much
used for now...

Cheers,
Laurent.

Re: [PATCH] mm: Fix mem_cgroup_oom_disable() call missing

2017-09-05 Thread Laurent Dufour

On 05/09/2017 17:46, Kirill A. Shutemov wrote:
> On Tue, Sep 05, 2017 at 05:30:39PM +0200, Laurent Dufour wrote:
>> Seen while reading the code, in handle_mm_fault(), in the case
>> arch_vma_access_permitted() is failing the call to mem_cgroup_oom_disable()
>> is not made.
>>
>> To fix that, move the call to mem_cgroup_oom_enable() after calling
>> arch_vma_access_permitted() as it should not have entered the memcg OOM.
>>
>> Fixes: bae473a423f6 ("mm: introduce fault_env")
>> Signed-off-by: Laurent Dufour 
> 
> Ouch. Sorry for this.
> 
> Acked-by: Kirill A. Shutemov 
> 
> Cc: stable@ is needed too.

Andrew, should I resend it with stable in copy?

> 
> It's strange we haven't seen reports of warning from
> mem_cgroup_oom_enable().

AFAIU, arch_vma_access_permitted() is only defined for x86 and it fails
only in the case of a protection key mismatch, a feature that is not much
used for now...

Cheers,
Laurent.

[PATCH] mm: Fix mem_cgroup_oom_disable() call missing

2017-09-05 Thread Laurent Dufour

Seen while reading the code: in handle_mm_fault(), when
arch_vma_access_permitted() fails, the call to mem_cgroup_oom_disable()
is not made.

To fix that, move the call to mem_cgroup_oom_enable() after calling
arch_vma_access_permitted() as it should not have entered the memcg OOM.

Fixes: bae473a423f6 ("mm: introduce fault_env")
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 mm/memory.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 56e48e4593cb..274547075486 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3888,6 +3888,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
 
+   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+   flags & FAULT_FLAG_INSTRUCTION,
+   flags & FAULT_FLAG_REMOTE))
+   return VM_FAULT_SIGSEGV;
+
/*
 * Enable the memcg OOM handling for faults triggered in user
 * space.  Kernel faults are handled more gracefully.
@@ -3895,11 +3900,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
 
-   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
-   flags & FAULT_FLAG_INSTRUCTION,
-   flags & FAULT_FLAG_REMOTE))
-   return VM_FAULT_SIGSEGV;
-
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
-- 
2.7.4

[PATCH] mm: Fix mem_cgroup_oom_disable() call missing

2017-09-05 Thread Laurent Dufour

Seen while reading the code: in handle_mm_fault(), when
arch_vma_access_permitted() fails, the call to mem_cgroup_oom_disable()
is not made.

To fix that, move the call to mem_cgroup_oom_enable() after calling
arch_vma_access_permitted() as it should not have entered the memcg OOM.

Fixes: bae473a423f6 ("mm: introduce fault_env")
Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 56e48e4593cb..274547075486 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3888,6 +3888,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
 
+   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+   flags & FAULT_FLAG_INSTRUCTION,
+   flags & FAULT_FLAG_REMOTE))
+   return VM_FAULT_SIGSEGV;
+
/*
 * Enable the memcg OOM handling for faults triggered in user
 * space.  Kernel faults are handled more gracefully.
@@ -3895,11 +3900,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
 
-   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
-   flags & FAULT_FLAG_INSTRUCTION,
-   flags & FAULT_FLAG_REMOTE))
-   return VM_FAULT_SIGSEGV;
-
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
-- 
2.7.4

[PATCH] x86/mm: Fix fault error path using unsafe vma pointer

2017-09-04 Thread Laurent Dufour

The commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
generation code") passes down a vma pointer to the error path, but that is
done once the mmap_sem is released when calling mm_fault_error() from
__do_page_fault().

This is dangerous as the pointed-to vma structure is no longer safe to use
once the mmap_sem has been released. As only the protection key value is
required in the error processing, we could just pass down this value.

This patch fixes this by passing a pointer to a protection key value down
to the fault signal generation code. The use of a pointer allows keeping
the check generating a warning message in fill_sig_info_pkey() when the vma
was not known. If the pointer is valid, the protection value can be
accessed by dereferencing the pointer.

Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
generation code")
Cc: Dave Hansen <dave.han...@linux.intel.com>
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 arch/x86/mm/fault.c | 47 ---
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..c18e737c5f9b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, 
unsigned long addr)
  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
  *  faulted on a pte with its pkey=4.
  */
-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
-   struct vm_area_struct *vma)
+static void fill_sig_info_pkey(int si_code, siginfo_t *info, int *pkey)
 {
/* This is effectively an #ifdef */
if (!boot_cpu_has(X86_FEATURE_OSPKE))
@@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 * valid VMA, so we should never reach this without a
 * valid VMA.
 */
-   if (!vma) {
+   if (!pkey) {
WARN_ONCE(1, "PKU fault with no VMA passed in");
info->si_pkey = 0;
return;
@@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t 
*info,
 * absolutely guranteed to be 100% accurate because of
 * the race explained above.
 */
-   info->si_pkey = vma_pkey(vma);
+   info->si_pkey = *pkey;
 }
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-struct task_struct *tsk, struct vm_area_struct *vma,
-int fault)
+struct task_struct *tsk, int *pkey, int fault)
 {
unsigned lsb = 0;
siginfo_t info;
@@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned 
long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
 
-   fill_sig_info_pkey(si_code, &info, vma);
+   fill_sig_info_pkey(si_code, &info, pkey);
 
force_sig_info(si_signo, &info, tsk);
 }
@@ -758,8 +756,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
-   /* No context means no VMA to pass down */
-   struct vm_area_struct *vma = NULL;
 
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -784,7 +780,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address,
-tsk, vma, 0);
+tsk, NULL, 0);
}
 
/*
@@ -893,8 +889,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long 
error_code,
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, struct vm_area_struct *vma,
-  int si_code)
+  unsigned long address, int *pkey, int si_code)
 {
struct task_struct *tsk = current;
 
@@ -942,7 +937,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
tsk->thread.error_code  = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
 
-   force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
+   force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
 
return;
}
@@ -955,9 +950,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-unsigned long address, struct vm_area_struct *vma)
+unsigned long address, int *pkey)
 {
-   __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
+   __bad_area

[PATCH] x86/mm: Fix fault error path using unsafe vma pointer

2017-09-04 Thread Laurent Dufour

The commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
generation code") passes down a vma pointer to the error path, but that is
done once the mmap_sem is released when calling mm_fault_error() from
__do_page_fault().

This is dangerous as the pointed-to vma structure is no longer safe to use
once the mmap_sem has been released. As only the protection key value is
required in the error processing, we could just pass down this value.

This patch fixes this by passing a pointer to a protection key value down
to the fault signal generation code. The use of a pointer allows keeping
the check generating a warning message in fill_sig_info_pkey() when the vma
was not known. If the pointer is valid, the protection value can be
accessed by dereferencing the pointer.

Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
generation code")
Cc: Dave Hansen 
Signed-off-by: Laurent Dufour 
---
 arch/x86/mm/fault.c | 47 ---
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..c18e737c5f9b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, 
unsigned long addr)
  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
  *  faulted on a pte with its pkey=4.
  */
-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
-   struct vm_area_struct *vma)
+static void fill_sig_info_pkey(int si_code, siginfo_t *info, int *pkey)
 {
/* This is effectively an #ifdef */
if (!boot_cpu_has(X86_FEATURE_OSPKE))
@@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 * valid VMA, so we should never reach this without a
 * valid VMA.
 */
-   if (!vma) {
+   if (!pkey) {
WARN_ONCE(1, "PKU fault with no VMA passed in");
info->si_pkey = 0;
return;
@@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t 
*info,
 * absolutely guranteed to be 100% accurate because of
 * the race explained above.
 */
-   info->si_pkey = vma_pkey(vma);
+   info->si_pkey = *pkey;
 }
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-struct task_struct *tsk, struct vm_area_struct *vma,
-int fault)
+struct task_struct *tsk, int *pkey, int fault)
 {
unsigned lsb = 0;
siginfo_t info;
@@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned 
long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
 
-   fill_sig_info_pkey(si_code, &info, vma);
+   fill_sig_info_pkey(si_code, &info, pkey);
 
force_sig_info(si_signo, &info, tsk);
 }
@@ -758,8 +756,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
-   /* No context means no VMA to pass down */
-   struct vm_area_struct *vma = NULL;
 
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -784,7 +780,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address,
-tsk, vma, 0);
+tsk, NULL, 0);
}
 
/*
@@ -893,8 +889,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long 
error_code,
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, struct vm_area_struct *vma,
-  int si_code)
+  unsigned long address, int *pkey, int si_code)
 {
struct task_struct *tsk = current;
 
@@ -942,7 +937,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
tsk->thread.error_code  = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
 
-   force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
+   force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
 
return;
}
@@ -955,9 +950,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-unsigned long address, struct vm_area_struct *vma)
+unsigned long address, int *pkey)
 {
-   __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
+   __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
 }
 
 static void
@

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-30 Thread Laurent Dufour

On 30/08/2017 07:03, Anshuman Khandual wrote:
> On 08/29/2017 07:15 PM, Peter Zijlstra wrote:
>> On Tue, Aug 29, 2017 at 03:18:25PM +0200, Laurent Dufour wrote:
>>> On 29/08/2017 14:04, Peter Zijlstra wrote:
>>>> On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
>>>>> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
>>>>>>> +
>>>>>>> +   if (unlikely(!vma->anon_vma))
>>>>>>> +   goto unlock;
>>>>>>
>>>>>> It deserves a comment.
>>>>>
>>>>> You're right I'll add it in the next version.
>>>>> For the record, the root cause is that __anon_vma_prepare() requires the
>>>>> mmap_sem to be held because vm_next and vm_prev must be safe.
>>>>
>>>> But should that test not be:
>>>>
>>>>if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
>>>>goto unlock;
>>>>
>>>> Because !anon vmas will never have ->anon_vma set and you don't want to
>>>> exclude those.
>>>
>>> Yes in the case we later allow non anonymous vmas to be handled.
>>> Currently only anonymous vmas are supported so the check is good enough,
>>> isn't it ?
>>
>> That wasn't at all clear from reading the code. This makes it clear
>> ->anon_vma is only ever looked at for anonymous.
>>
>> And like Kirill says, we _really_ should start allowing some (if not
>> all) vm_ops. Large file based mappings aren't particularly rare.
>>
>> I'm not sure we want to introduce a white-list or just bite the bullet
>> and audit all ->fault() implementations. But either works and isn't
>> terribly difficult, auditing all is more work though.
> 
> filemap_fault() is used as vma-vm_ops->fault() for most of the file
> systems. Changing it can enable speculative fault support for all of
> them. It will still exclude other driver based vma-vm_ops->fault()
> implementation. AFAICS, __lock_page_or_retry() function can drop
> mm->mmap_sem if the page could not be locked right away. As suggested
> by Peterz, making it understand FAULT_FLAG_SPECULATIVE should be good
> enough. The patch is lightly tested for file mappings on top of this
> series.

Hi Anshuman,

This sounds pretty good, except for the FAULT_FLAG_RETRY_NOWAIT case I
mentioned in another mail.

The next step would be to find a way to discriminate between the vm_fault()
functions. Any idea ?

Thanks,
Laurent.

> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..08f3042 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1181,6 +1181,18 @@ int __lock_page_killable(struct page *__page)
>  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
>  unsigned int flags)
>  {
> +   if (flags & FAULT_FLAG_SPECULATIVE) {
> +   if (flags & FAULT_FLAG_KILLABLE) {
> +   int ret;
> +
> +   ret = __lock_page_killable(page);
> +   if (ret)
> +   return 0;
> +   } else
> +   __lock_page(page);
> +   return 1;
> +   }
> +
> if (flags & FAULT_FLAG_ALLOW_RETRY) {
> /*
>  * CAUTION! In this case, mmap_sem is not released
> diff --git a/mm/memory.c b/mm/memory.c
> index 549d235..02347f3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3836,8 +3836,6 @@ static int handle_pte_fault(struct vm_fault *vmf)
> if (!vmf->pte) {
> if (vma_is_anonymous(vmf->vma))
> return do_anonymous_page(vmf);
> -   else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
> -   return VM_FAULT_RETRY;
> else
> return do_fault(vmf);
> }
> @@ -4012,17 +4010,7 @@ int handle_speculative_fault(struct mm_struct *mm, 
> unsigned long address,
> goto unlock;
> }
> 
> -   /*
> -* Can't call vm_ops service has we don't know what they would do
> -* with the VMA.
> -* This include huge page from hugetlbfs.
> -*/
> -   if (vma->vm_ops) {
> -   trace_spf_vma_notsup(_RET_IP_, vma, address);
> -   goto unlock;
> -   }
> -
> -   if (unlikely(!vma->anon_vma)) {
> +   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> trace_spf_vma_notsup(_RET_IP_, vma, address);
> goto unlock;
> }
>

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-30 Thread Laurent Dufour

On 30/08/2017 07:03, Anshuman Khandual wrote:
> On 08/29/2017 07:15 PM, Peter Zijlstra wrote:
>> On Tue, Aug 29, 2017 at 03:18:25PM +0200, Laurent Dufour wrote:
>>> On 29/08/2017 14:04, Peter Zijlstra wrote:
>>>> On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
>>>>> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
>>>>>>> +
>>>>>>> +   if (unlikely(!vma->anon_vma))
>>>>>>> +   goto unlock;
>>>>>>
>>>>>> It deserves a comment.
>>>>>
>>>>> You're right I'll add it in the next version.
>>>>> For the record, the root cause is that __anon_vma_prepare() requires the
>>>>> mmap_sem to be held because vm_next and vm_prev must be safe.
>>>>
>>>> But should that test not be:
>>>>
>>>>if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
>>>>goto unlock;
>>>>
>>>> Because !anon vmas will never have ->anon_vma set and you don't want to
>>>> exclude those.
>>>
>>> Yes in the case we later allow non anonymous vmas to be handled.
>>> Currently only anonymous vmas are supported so the check is good enough,
>>> isn't it ?
>>
>> That wasn't at all clear from reading the code. This makes it clear
>> ->anon_vma is only ever looked at for anonymous.
>>
>> And like Kirill says, we _really_ should start allowing some (if not
>> all) vm_ops. Large file based mappings aren't particularly rare.
>>
>> I'm not sure we want to introduce a white-list or just bite the bullet
>> and audit all ->fault() implementations. But either works and isn't
>> terribly difficult, auditing all is more work though.
> 
> filemap_fault() is used as vma-vm_ops->fault() for most of the file
> systems. Changing it can enable speculative fault support for all of
> them. It will still exclude other driver based vma-vm_ops->fault()
> implementation. AFAICS, __lock_page_or_retry() function can drop
> mm->mmap_sem if the page could not be locked right away. As suggested
> by Peterz, making it understand FAULT_FLAG_SPECULATIVE should be good
> enough. The patch is lightly tested for file mappings on top of this
> series.

Hi Anshuman,

This sounds pretty good, except for the FAULT_FLAG_RETRY_NOWAIT case I
mentioned in another mail.

The next step would be to find a way to discriminate between the vm_fault()
functions. Any idea ?

Thanks,
Laurent.

> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..08f3042 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1181,6 +1181,18 @@ int __lock_page_killable(struct page *__page)
>  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
>  unsigned int flags)
>  {
> +   if (flags & FAULT_FLAG_SPECULATIVE) {
> +   if (flags & FAULT_FLAG_KILLABLE) {
> +   int ret;
> +
> +   ret = __lock_page_killable(page);
> +   if (ret)
> +   return 0;
> +   } else
> +   __lock_page(page);
> +   return 1;
> +   }
> +
> if (flags & FAULT_FLAG_ALLOW_RETRY) {
> /*
>  * CAUTION! In this case, mmap_sem is not released
> diff --git a/mm/memory.c b/mm/memory.c
> index 549d235..02347f3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3836,8 +3836,6 @@ static int handle_pte_fault(struct vm_fault *vmf)
> if (!vmf->pte) {
> if (vma_is_anonymous(vmf->vma))
> return do_anonymous_page(vmf);
> -   else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
> -   return VM_FAULT_RETRY;
> else
> return do_fault(vmf);
> }
> @@ -4012,17 +4010,7 @@ int handle_speculative_fault(struct mm_struct *mm, 
> unsigned long address,
> goto unlock;
> }
> 
> -   /*
> -* Can't call vm_ops service has we don't know what they would do
> -* with the VMA.
> -* This include huge page from hugetlbfs.
> -*/
> -   if (vma->vm_ops) {
> -   trace_spf_vma_notsup(_RET_IP_, vma, address);
> -   goto unlock;
> -   }
> -
> -   if (unlikely(!vma->anon_vma)) {
> +   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> trace_spf_vma_notsup(_RET_IP_, vma, address);
> goto unlock;
> }
>

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-30 Thread Laurent Dufour

On 30/08/2017 07:58, Peter Zijlstra wrote:
> On Wed, Aug 30, 2017 at 10:33:50AM +0530, Anshuman Khandual wrote:
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index a497024..08f3042 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -1181,6 +1181,18 @@ int __lock_page_killable(struct page *__page)
>>  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
>>  unsigned int flags)
>>  {
>> +   if (flags & FAULT_FLAG_SPECULATIVE) {
>> +   if (flags & FAULT_FLAG_KILLABLE) {
>> +   int ret;
>> +
>> +   ret = __lock_page_killable(page);
>> +   if (ret)
>> +   return 0;
>> +   } else
>> +   __lock_page(page);
>> +   return 1;
>> +   }
>> +
>> if (flags & FAULT_FLAG_ALLOW_RETRY) {
>> /*
>>  * CAUTION! In this case, mmap_sem is not released
> 
> Yeah, that looks right.

Hum, I'm wondering if FAULT_FLAG_RETRY_NOWAIT should be forced in the
speculative path in that case to match the semantics of
__lock_page_or_retry().

> 
>> @@ -4012,17 +4010,7 @@ int handle_speculative_fault(struct mm_struct *mm, 
>> unsigned long address,
>> goto unlock;
>> }
>>
>> +   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
>> trace_spf_vma_notsup(_RET_IP_, vma, address);
>> goto unlock;
>> }
> 
> As riel pointed out on IRC slightly later, private file maps also need
> ->anon_vma and those actually have ->vm_ops IIRC so the condition needs
> to be slightly more complicated.

Yes, I read the code again and came to the same conclusion.

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-30 Thread Laurent Dufour

On 30/08/2017 07:58, Peter Zijlstra wrote:
> On Wed, Aug 30, 2017 at 10:33:50AM +0530, Anshuman Khandual wrote:
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index a497024..08f3042 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -1181,6 +1181,18 @@ int __lock_page_killable(struct page *__page)
>>  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
>>  unsigned int flags)
>>  {
>> +   if (flags & FAULT_FLAG_SPECULATIVE) {
>> +   if (flags & FAULT_FLAG_KILLABLE) {
>> +   int ret;
>> +
>> +   ret = __lock_page_killable(page);
>> +   if (ret)
>> +   return 0;
>> +   } else
>> +   __lock_page(page);
>> +   return 1;
>> +   }
>> +
>> if (flags & FAULT_FLAG_ALLOW_RETRY) {
>> /*
>>  * CAUTION! In this case, mmap_sem is not released
> 
> Yeah, that looks right.

Hum, I'm wondering if FAULT_FLAG_RETRY_NOWAIT should be forced in the
speculative path in that case to match the semantics of
__lock_page_or_retry().

> 
>> @@ -4012,17 +4010,7 @@ int handle_speculative_fault(struct mm_struct *mm, 
>> unsigned long address,
>> goto unlock;
>> }
>>
>> +   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
>> trace_spf_vma_notsup(_RET_IP_, vma, address);
>> goto unlock;
>> }
> 
> As riel pointed out on IRC slightly later, private file maps also need
> ->anon_vma and those actually have ->vm_ops IIRC so the condition needs
> to be slightly more complicated.

Yes, I read the code again and came to the same conclusion.

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-30 Thread Laurent Dufour

On 27/08/2017 02:18, Kirill A. Shutemov wrote:
> On Fri, Aug 18, 2017 at 12:05:13AM +0200, Laurent Dufour wrote:
>> +/*
>> + * vm_normal_page() adds some processing which should be done while
>> + * hodling the mmap_sem.
>> + */
>> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
>> + unsigned int flags)
>> +{
>> +struct vm_fault vmf = {
>> +.address = address,
>> +};
>> +pgd_t *pgd;
>> +p4d_t *p4d;
>> +pud_t *pud;
>> +pmd_t *pmd;
>> +int dead, seq, idx, ret = VM_FAULT_RETRY;
>> +struct vm_area_struct *vma;
>> +struct mempolicy *pol;
>> +
>> +/* Clear flags that may lead to release the mmap_sem to retry */
>> +flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
>> +flags |= FAULT_FLAG_SPECULATIVE;
>> +
>> +idx = srcu_read_lock(&vma_srcu);
>> +vma = find_vma_srcu(mm, address);
>> +if (!vma)
>> +goto unlock;
>> +
>> +/*
>> + * Validate the VMA found by the lockless lookup.
>> + */
>> +dead = RB_EMPTY_NODE(&vma->vm_rb);
>> +seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> 
>> seqlock,vma_rb_erase() */
>> +if ((seq & 1) || dead)
>> +goto unlock;
>> +
>> +/*
>> + * Can't call vm_ops service has we don't know what they would do
>> + * with the VMA.
>> + * This include huge page from hugetlbfs.
>> + */
>> +if (vma->vm_ops)
>> +goto unlock;
> 
> I think we need to have a way to white-list safe ->vm_ops.
> 
>> +
>> +if (unlikely(!vma->anon_vma))
>> +goto unlock;
> 
> It deserves a comment.
> 
>> +
>> +vmf.vma_flags = READ_ONCE(vma->vm_flags);
>> +vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
>> +
>> +/* Can't call userland page fault handler in the speculative path */
>> +if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
>> +goto unlock;
>> +
>> +/*
>> + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
>> + * are not compatible with the speculative page fault processing.
>> + */
>> +pol = __get_vma_policy(vma, address);
>> +if (!pol)
>> +pol = get_task_policy(current);
>> +if (pol && pol->mode == MPOL_INTERLEAVE)
>> +goto unlock;
>> +
>> +if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
>> +/*
>> + * This could be detected by the check address against VMA's
>> + * boundaries but we want to trace it as not supported instead
>> + * of changed.
>> + */
>> +goto unlock;
>> +
>> +if (address < READ_ONCE(vma->vm_start)
>> +|| READ_ONCE(vma->vm_end) <= address)
>> +goto unlock;
>> +
>> +/*
>> + * The three following checks are copied from access_error from
>> + * arch/x86/mm/fault.c
>> + */
>> +if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>> +   flags & FAULT_FLAG_INSTRUCTION,
>> +   flags & FAULT_FLAG_REMOTE))
>> +goto unlock;
>> +
>> +/* This is one is required to check that the VMA has write access set */
>> +if (flags & FAULT_FLAG_WRITE) {
>> +if (unlikely(!(vmf.vma_flags & VM_WRITE)))
>> +goto unlock;
>> +} else {
>> +if (unlikely(!(vmf.vma_flags & (VM_READ | VM_EXEC | VM_WRITE))))
>> +goto unlock;
>> +}
>> +
>> +/*
>> + * Do a speculative lookup of the PTE entry.
>> + */
>> +local_irq_disable();
>> +pgd = pgd_offset(mm, address);
>> +if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>> +goto out_walk;
>> +
>> +p4d = p4d_alloc(mm, pgd, address);
>> +if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
>> +goto out_walk;
>> +
>> +pud = pud_alloc(mm, p4d, address);
>> +if (pud_none(*pud) || unlikely(pud_bad(*pud)))
>> +goto out_walk;
>> +
>> +pmd = pmd_offset(pud, address);
>> +if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
>> +goto out_walk;
>> +
>> +/*
>> + * The above does not allocate/instantia

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-30 Thread Laurent Dufour

On 27/08/2017 02:18, Kirill A. Shutemov wrote:
> On Fri, Aug 18, 2017 at 12:05:13AM +0200, Laurent Dufour wrote:
>> +/*
>> + * vm_normal_page() adds some processing which should be done while
>> + * hodling the mmap_sem.
>> + */
>> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
>> + unsigned int flags)
>> +{
>> +struct vm_fault vmf = {
>> +.address = address,
>> +};
>> +pgd_t *pgd;
>> +p4d_t *p4d;
>> +pud_t *pud;
>> +pmd_t *pmd;
>> +int dead, seq, idx, ret = VM_FAULT_RETRY;
>> +struct vm_area_struct *vma;
>> +struct mempolicy *pol;
>> +
>> +/* Clear flags that may lead to release the mmap_sem to retry */
>> +flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
>> +flags |= FAULT_FLAG_SPECULATIVE;
>> +
>> +idx = srcu_read_lock(&vma_srcu);
>> +vma = find_vma_srcu(mm, address);
>> +if (!vma)
>> +goto unlock;
>> +
>> +/*
>> + * Validate the VMA found by the lockless lookup.
>> + */
>> +dead = RB_EMPTY_NODE(&vma->vm_rb);
>> +seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> 
>> seqlock,vma_rb_erase() */
>> +if ((seq & 1) || dead)
>> +goto unlock;
>> +
>> +/*
>> + * Can't call vm_ops service has we don't know what they would do
>> + * with the VMA.
>> + * This include huge page from hugetlbfs.
>> + */
>> +if (vma->vm_ops)
>> +goto unlock;
> 
> I think we need to have a way to white-list safe ->vm_ops.
> 
>> +
>> +if (unlikely(!vma->anon_vma))
>> +goto unlock;
> 
> It deserves a comment.
> 
>> +
>> +vmf.vma_flags = READ_ONCE(vma->vm_flags);
>> +vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
>> +
>> +/* Can't call userland page fault handler in the speculative path */
>> +if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
>> +goto unlock;
>> +
>> +/*
>> + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
>> + * are not compatible with the speculative page fault processing.
>> + */
>> +pol = __get_vma_policy(vma, address);
>> +if (!pol)
>> +pol = get_task_policy(current);
>> +if (pol && pol->mode == MPOL_INTERLEAVE)
>> +goto unlock;
>> +
>> +if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
>> +/*
>> + * This could be detected by the check address against VMA's
>> + * boundaries but we want to trace it as not supported instead
>> + * of changed.
>> + */
>> +goto unlock;
>> +
>> +if (address < READ_ONCE(vma->vm_start)
>> +|| READ_ONCE(vma->vm_end) <= address)
>> +goto unlock;
>> +
>> +/*
>> + * The three following checks are copied from access_error from
>> + * arch/x86/mm/fault.c
>> + */
>> +if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>> +   flags & FAULT_FLAG_INSTRUCTION,
>> +   flags & FAULT_FLAG_REMOTE))
>> +goto unlock;
>> +
>> +/* This is one is required to check that the VMA has write access set */
>> +if (flags & FAULT_FLAG_WRITE) {
>> +if (unlikely(!(vmf.vma_flags & VM_WRITE)))
>> +goto unlock;
>> +} else {
>> +if (unlikely(!(vmf.vma_flags & (VM_READ | VM_EXEC | VM_WRITE))))
>> +goto unlock;
>> +}
>> +
>> +/*
>> + * Do a speculative lookup of the PTE entry.
>> + */
>> +local_irq_disable();
>> +pgd = pgd_offset(mm, address);
>> +if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>> +goto out_walk;
>> +
>> +p4d = p4d_alloc(mm, pgd, address);
>> +if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
>> +goto out_walk;
>> +
>> +pud = pud_alloc(mm, p4d, address);
>> +if (pud_none(*pud) || unlikely(pud_bad(*pud)))
>> +goto out_walk;
>> +
>> +pmd = pmd_offset(pud, address);
>> +if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
>> +goto out_walk;
>> +
>> +/*
>> + * The above does not allocate/instantia

Re: [PATCH v2 20/20] powerpc/mm: Add speculative page fault

2017-08-29 Thread Laurent Dufour

On 21/08/2017 08:58, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> This patch enable the speculative page fault on the PowerPC
>> architecture.
>>
>> This will try a speculative page fault without holding the mmap_sem,
>> if it returns with WM_FAULT_RETRY, the mmap_sem is acquired and the
> 
> s/WM_FAULT_RETRY/VM_FAULT_RETRY/

Good catch ;)

>> traditional page fault processing is done.
>>
>> Support is only provide for BOOK3S_64 currently because:
>> - require CONFIG_PPC_STD_MMU because checks done in
>>   set_access_flags_filter()
> 
> What checks are done in set_access_flags_filter() ? We are just
> adding the code block in do_page_fault().

set_access_flags_filter() checks vm_flags & VM_EXEC, which may be changed
behind our back, leading to a spurious WARN being displayed.
That being said, I focused on BOOK3S as it is meaningful for large
systems, and I didn't have time to check embedded systems.

> 
>> - require BOOK3S because we can't support for book3e_hugetlb_preload()
>>   called by update_mmu_cache()
>>
>> Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
>>  arch/powerpc/mm/fault.c  | 30 
>> +++-
>>  2 files changed, 34 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
>> b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index 818a58fc3f4f..897f8b9f67e6 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -313,6 +313,11 @@ extern unsigned long pci_io_base;
>>  /* Advertise support for _PAGE_SPECIAL */
>>  #define __HAVE_ARCH_PTE_SPECIAL
>>  
>> +/* Advertise that we call the Speculative Page Fault handler */
>> +#if defined(CONFIG_PPC_BOOK3S_64)
>> +#define __HAVE_ARCH_CALL_SPF
>> +#endif
>> +
>>  #ifndef __ASSEMBLY__
>>  
>>  /*
>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>> index 4c422632047b..7b3cc4c30eab 100644
>> --- a/arch/powerpc/mm/fault.c
>> +++ b/arch/powerpc/mm/fault.c
>> @@ -291,9 +291,36 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
>> address,
>>  if (is_write && is_user)
>>  store_update_sp = store_updates_sp(regs);
>>  
>> -if (is_user)
>> +if (is_user) {
>>  flags |= FAULT_FLAG_USER;
>>  
>> +#if defined(__HAVE_ARCH_CALL_SPF)
>> +/* let's try a speculative page fault without grabbing the
>> + * mmap_sem.
>> + */
>> +
>> +/*
>> + * flags is set later based on the VMA's flags, for the common
>> + * speculative service, we need some flags to be set.
>> + */
>> +if (is_write)
>> +flags |= FAULT_FLAG_WRITE;
>> +
>> +fault = handle_speculative_fault(mm, address, flags);
>> +if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
>> +perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
>> +  regs, address);
>> +goto done;
> 
> Why we should retry with classical page fault on VM_FAULT_ERROR ?
> We should always return VM_FAULT_RETRY in case there is a clear
> collision some where which requires retry with classical method
> and return VM_FAULT_ERROR in cases where we know that it cannot
> be retried and fail for good. Should not handle_speculative_fault()
> be changed to accommodate this ?

There is no need to change handle_speculative_fault(), it should return
VM_FAULT_RETRY when a retry is required. If VM_FAULT_ERROR is returned, we
should be able to jump to the block dealing with VM_FAULT_ERROR and calling
mm_fault_error().


> 
>> +}
>> +
>> +/*
>> + * Resetting flags since the following code assumes
>> + * FAULT_FLAG_WRITE is not set.
>> + */
>> +flags &= ~FAULT_FLAG_WRITE;
>> +#endif /* defined(__HAVE_ARCH_CALL_SPF) */
> 
> Setting and resetting of FAULT_FLAG_WRITE seems confusing. Why you
> say that some flags need to be set for handle_speculative_fault()
> function. Could you elaborate on this ?

FAULT_FLAG_WRITE is required to handle write access. In the case we retry
with the classical path, the flag is reset and will be set later if
!is_exec and is_write.

Re: [PATCH v2 20/20] powerpc/mm: Add speculative page fault

2017-08-29 Thread Laurent Dufour

On 21/08/2017 08:58, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> This patch enable the speculative page fault on the PowerPC
>> architecture.
>>
>> This will try a speculative page fault without holding the mmap_sem,
>> if it returns with WM_FAULT_RETRY, the mmap_sem is acquired and the
> 
> s/WM_FAULT_RETRY/VM_FAULT_RETRY/

Good catch ;)

>> traditional page fault processing is done.
>>
>> Support is only provide for BOOK3S_64 currently because:
>> - require CONFIG_PPC_STD_MMU because checks done in
>>   set_access_flags_filter()
> 
> What checks are done in set_access_flags_filter() ? We are just
> adding the code block in do_page_fault().

set_access_flags_filter() is checking for vm_flags & VM_EXEC which may be
changed behind our back, leading to a spurious WARN being displayed.
This being said, I focused on BOOK3S as this is meaningful for large
systems, and I didn't get time to check for embedded systems.

> 
>> - require BOOK3S because we can't support for book3e_hugetlb_preload()
>>   called by update_mmu_cache()
>>
>> Signed-off-by: Laurent Dufour 
>> ---
>>  arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
>>  arch/powerpc/mm/fault.c  | 30 
>> +++-
>>  2 files changed, 34 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
>> b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index 818a58fc3f4f..897f8b9f67e6 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -313,6 +313,11 @@ extern unsigned long pci_io_base;
>>  /* Advertise support for _PAGE_SPECIAL */
>>  #define __HAVE_ARCH_PTE_SPECIAL
>>  
>> +/* Advertise that we call the Speculative Page Fault handler */
>> +#if defined(CONFIG_PPC_BOOK3S_64)
>> +#define __HAVE_ARCH_CALL_SPF
>> +#endif
>> +
>>  #ifndef __ASSEMBLY__
>>  
>>  /*
>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>> index 4c422632047b..7b3cc4c30eab 100644
>> --- a/arch/powerpc/mm/fault.c
>> +++ b/arch/powerpc/mm/fault.c
>> @@ -291,9 +291,36 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
>> address,
>>  if (is_write && is_user)
>>  store_update_sp = store_updates_sp(regs);
>>  
>> -if (is_user)
>> +if (is_user) {
>>  flags |= FAULT_FLAG_USER;
>>  
>> +#if defined(__HAVE_ARCH_CALL_SPF)
>> +/* let's try a speculative page fault without grabbing the
>> + * mmap_sem.
>> + */
>> +
>> +/*
>> + * flags is set later based on the VMA's flags, for the common
>> + * speculative service, we need some flags to be set.
>> + */
>> +if (is_write)
>> +flags |= FAULT_FLAG_WRITE;
>> +
>> +fault = handle_speculative_fault(mm, address, flags);
>> +if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
>> +perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
>> +  regs, address);
>> +goto done;
> 
> Why we should retry with classical page fault on VM_FAULT_ERROR ?
> We should always return VM_FAULT_RETRY in case there is a clear
> collision some where which requires retry with classical method
> and return VM_FAULT_ERROR in cases where we know that it cannot
> be retried and fail for good. Should not handle_speculative_fault()
> be changed to accommodate this ?

There is no need to change handle_speculative_fault(), it should return
VM_FAULT_RETRY when a retry is required. If VM_FAULT_ERROR is returned, we
should be able to jump to the block dealing with VM_FAULT_ERROR and calling
mm_fault_error().


> 
>> +}
>> +
>> +/*
>> + * Resetting flags since the following code assumes
>> + * FAULT_FLAG_WRITE is not set.
>> + */
>> +flags &= ~FAULT_FLAG_WRITE;
>> +#endif /* defined(__HAVE_ARCH_CALL_SPF) */
> 
> Setting and resetting of FAULT_FLAG_WRITE seems confusing. Why you
> say that some flags need to be set for handle_speculative_fault()
> function. Could you elaborate on this ?

FAULT_FLAG_WRITE is required to handle write access. In the case we retry
with the classical path, the flag is reset and will be set later if
!is_exec and is_write.

Re: [PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-29 Thread Laurent Dufour

On 29/08/2017 16:50, Laurent Dufour wrote:
> On 21/08/2017 09:29, Anshuman Khandual wrote:
>> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>>> From: Peter Zijlstra <pet...@infradead.org>
>>>
>>> Try a speculative fault before acquiring mmap_sem, if it returns with
>>> VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
>>> traditional fault.
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
>>>
>>> [Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
>>>  handle_speculative_fault()]
>>> [Retry with usual fault path in the case VM_ERROR is returned by
>>>  handle_speculative_fault(). This allows signal to be delivered]
>>> Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
>>> ---
>>>  arch/x86/include/asm/pgtable_types.h |  7 +++
>>>  arch/x86/mm/fault.c  | 19 +++
>>>  2 files changed, 26 insertions(+)
>>>
>>> diff --git a/arch/x86/include/asm/pgtable_types.h 
>>> b/arch/x86/include/asm/pgtable_types.h
>>> index bf9638e1ee42..4fd2693a037e 100644
>>> --- a/arch/x86/include/asm/pgtable_types.h
>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>> @@ -234,6 +234,13 @@ enum page_cache_mode {
>>>  #define PGD_IDENT_ATTR  0x001  /* PRESENT (no other 
>>> attributes) */
>>>  #endif
>>>  
>>> +/*
>>> + * Advertise that we call the Speculative Page Fault handler.
>>> + */
>>> +#ifdef CONFIG_X86_64
>>> +#define __HAVE_ARCH_CALL_SPF
>>> +#endif
>>> +
>>>  #ifdef CONFIG_X86_32
>>>  # include <asm/pgtable_32_types.h>
>>>  #else
>>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>>> index 2a1fa10c6a98..4c070b9a4362 100644
>>> --- a/arch/x86/mm/fault.c
>>> +++ b/arch/x86/mm/fault.c
>>> @@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
>>> error_code,
>>> if (error_code & PF_INSTR)
>>> flags |= FAULT_FLAG_INSTRUCTION;
>>>  
>>> +#ifdef __HAVE_ARCH_CALL_SPF
>>> +   if (error_code & PF_USER) {
>>> +   fault = handle_speculative_fault(mm, address, flags);
>>> +
>>> +   /*
>>> +* We also check against VM_FAULT_ERROR because we have to
>>> +* raise a signal by calling later mm_fault_error() which
>>> +* requires the vma pointer to be set. So in that case,
>>> +* we fall through the normal path.
>>
>> Cant mm_fault_error() be called inside handle_speculative_fault() ?
>> Falling through the normal page fault path again just to raise a
>> signal seems overkill. Looking into mm_fault_error(), it seems they
>> are different for x86 and powerpc.
>>
>> X86:
>>
>> mm_fault_error(struct pt_regs *regs, unsigned long error_code,
>>unsigned long address, struct vm_area_struct *vma,
>>unsigned int fault)
>>
>> powerpc:
>>
>> mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
>>
>> Even in case of X86, I guess we would have reference to the faulting
>> VMA (after the SRCU search) which can be used to call this function
>> directly.
> 
> Yes I think this is doable in the case of x86.

Actually, this is not doable as the vma pointer is not returned by
handle_speculative_fault() and it is not possible to return it because
once srcu_read_unlock() is called, the pointer is no longer safe.

Re: [PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-29 Thread Laurent Dufour

On 29/08/2017 16:50, Laurent Dufour wrote:
> On 21/08/2017 09:29, Anshuman Khandual wrote:
>> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>>> From: Peter Zijlstra 
>>>
>>> Try a speculative fault before acquiring mmap_sem, if it returns with
>>> VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
>>> traditional fault.
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) 
>>>
>>> [Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
>>>  handle_speculative_fault()]
>>> [Retry with usual fault path in the case VM_ERROR is returned by
>>>  handle_speculative_fault(). This allows signal to be delivered]
>>> Signed-off-by: Laurent Dufour 
>>> ---
>>>  arch/x86/include/asm/pgtable_types.h |  7 +++
>>>  arch/x86/mm/fault.c  | 19 +++
>>>  2 files changed, 26 insertions(+)
>>>
>>> diff --git a/arch/x86/include/asm/pgtable_types.h 
>>> b/arch/x86/include/asm/pgtable_types.h
>>> index bf9638e1ee42..4fd2693a037e 100644
>>> --- a/arch/x86/include/asm/pgtable_types.h
>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>> @@ -234,6 +234,13 @@ enum page_cache_mode {
>>>  #define PGD_IDENT_ATTR  0x001  /* PRESENT (no other 
>>> attributes) */
>>>  #endif
>>>  
>>> +/*
>>> + * Advertise that we call the Speculative Page Fault handler.
>>> + */
>>> +#ifdef CONFIG_X86_64
>>> +#define __HAVE_ARCH_CALL_SPF
>>> +#endif
>>> +
>>>  #ifdef CONFIG_X86_32
>>>  # include <asm/pgtable_32_types.h>
>>>  #else
>>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>>> index 2a1fa10c6a98..4c070b9a4362 100644
>>> --- a/arch/x86/mm/fault.c
>>> +++ b/arch/x86/mm/fault.c
>>> @@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
>>> error_code,
>>> if (error_code & PF_INSTR)
>>> flags |= FAULT_FLAG_INSTRUCTION;
>>>  
>>> +#ifdef __HAVE_ARCH_CALL_SPF
>>> +   if (error_code & PF_USER) {
>>> +   fault = handle_speculative_fault(mm, address, flags);
>>> +
>>> +   /*
>>> +* We also check against VM_FAULT_ERROR because we have to
>>> +* raise a signal by calling later mm_fault_error() which
>>> +* requires the vma pointer to be set. So in that case,
>>> +* we fall through the normal path.
>>
>> Cant mm_fault_error() be called inside handle_speculative_fault() ?
>> Falling through the normal page fault path again just to raise a
>> signal seems overkill. Looking into mm_fault_error(), it seems they
>> are different for x86 and powerpc.
>>
>> X86:
>>
>> mm_fault_error(struct pt_regs *regs, unsigned long error_code,
>>unsigned long address, struct vm_area_struct *vma,
>>unsigned int fault)
>>
>> powerpc:
>>
>> mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
>>
>> Even in case of X86, I guess we would have reference to the faulting
>> VMA (after the SRCU search) which can be used to call this function
>> directly.
> 
> Yes I think this is doable in the case of x86.

Actually, this is not doable as the vma pointer is not returned by
handle_speculative_fault() and it is not possible to return it because
once srcu_read_unlock() is called, the pointer is no longer safe.

Re: [PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-29 Thread Laurent Dufour

On 21/08/2017 09:29, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> From: Peter Zijlstra <pet...@infradead.org>
>>
>> Try a speculative fault before acquiring mmap_sem, if it returns with
>> VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
>> traditional fault.
>>
>> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
>>
>> [Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
>>  handle_speculative_fault()]
>> [Retry with usual fault path in the case VM_ERROR is returned by
>>  handle_speculative_fault(). This allows signal to be delivered]
>> Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
>> ---
>>  arch/x86/include/asm/pgtable_types.h |  7 +++
>>  arch/x86/mm/fault.c  | 19 +++
>>  2 files changed, 26 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/pgtable_types.h 
>> b/arch/x86/include/asm/pgtable_types.h
>> index bf9638e1ee42..4fd2693a037e 100644
>> --- a/arch/x86/include/asm/pgtable_types.h
>> +++ b/arch/x86/include/asm/pgtable_types.h
>> @@ -234,6 +234,13 @@ enum page_cache_mode {
>>  #define PGD_IDENT_ATTR   0x001  /* PRESENT (no other 
>> attributes) */
>>  #endif
>>  
>> +/*
>> + * Advertise that we call the Speculative Page Fault handler.
>> + */
>> +#ifdef CONFIG_X86_64
>> +#define __HAVE_ARCH_CALL_SPF
>> +#endif
>> +
>>  #ifdef CONFIG_X86_32
>>  # include <asm/pgtable_32_types.h>
>>  #else
>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>> index 2a1fa10c6a98..4c070b9a4362 100644
>> --- a/arch/x86/mm/fault.c
>> +++ b/arch/x86/mm/fault.c
>> @@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
>> error_code,
>>  if (error_code & PF_INSTR)
>>  flags |= FAULT_FLAG_INSTRUCTION;
>>  
>> +#ifdef __HAVE_ARCH_CALL_SPF
>> +if (error_code & PF_USER) {
>> +fault = handle_speculative_fault(mm, address, flags);
>> +
>> +/*
>> + * We also check against VM_FAULT_ERROR because we have to
>> + * raise a signal by calling later mm_fault_error() which
>> + * requires the vma pointer to be set. So in that case,
>> + * we fall through the normal path.
> 
> Cant mm_fault_error() be called inside handle_speculative_fault() ?
> Falling through the normal page fault path again just to raise a
> signal seems overkill. Looking into mm_fault_error(), it seems they
> are different for x86 and powerpc.
> 
> X86:
> 
> mm_fault_error(struct pt_regs *regs, unsigned long error_code,
>unsigned long address, struct vm_area_struct *vma,
>unsigned int fault)
> 
> powerpc:
> 
> mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
> 
> Even in case of X86, I guess we would have reference to the faulting
> VMA (after the SRCU search) which can be used to call this function
> directly.

Yes I think this is doable in the case of x86.

Re: [PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-29 Thread Laurent Dufour

On 21/08/2017 09:29, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> From: Peter Zijlstra 
>>
>> Try a speculative fault before acquiring mmap_sem, if it returns with
>> VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
>> traditional fault.
>>
>> Signed-off-by: Peter Zijlstra (Intel) 
>>
>> [Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
>>  handle_speculative_fault()]
>> [Retry with usual fault path in the case VM_ERROR is returned by
>>  handle_speculative_fault(). This allows signal to be delivered]
>> Signed-off-by: Laurent Dufour 
>> ---
>>  arch/x86/include/asm/pgtable_types.h |  7 +++
>>  arch/x86/mm/fault.c  | 19 +++
>>  2 files changed, 26 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/pgtable_types.h 
>> b/arch/x86/include/asm/pgtable_types.h
>> index bf9638e1ee42..4fd2693a037e 100644
>> --- a/arch/x86/include/asm/pgtable_types.h
>> +++ b/arch/x86/include/asm/pgtable_types.h
>> @@ -234,6 +234,13 @@ enum page_cache_mode {
>>  #define PGD_IDENT_ATTR   0x001  /* PRESENT (no other 
>> attributes) */
>>  #endif
>>  
>> +/*
>> + * Advertise that we call the Speculative Page Fault handler.
>> + */
>> +#ifdef CONFIG_X86_64
>> +#define __HAVE_ARCH_CALL_SPF
>> +#endif
>> +
>>  #ifdef CONFIG_X86_32
>>  # include <asm/pgtable_32_types.h>
>>  #else
>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>> index 2a1fa10c6a98..4c070b9a4362 100644
>> --- a/arch/x86/mm/fault.c
>> +++ b/arch/x86/mm/fault.c
>> @@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
>> error_code,
>>  if (error_code & PF_INSTR)
>>  flags |= FAULT_FLAG_INSTRUCTION;
>>  
>> +#ifdef __HAVE_ARCH_CALL_SPF
>> +if (error_code & PF_USER) {
>> +fault = handle_speculative_fault(mm, address, flags);
>> +
>> +/*
>> + * We also check against VM_FAULT_ERROR because we have to
>> + * raise a signal by calling later mm_fault_error() which
>> + * requires the vma pointer to be set. So in that case,
>> + * we fall through the normal path.
> 
> Cant mm_fault_error() be called inside handle_speculative_fault() ?
> Falling through the normal page fault path again just to raise a
> signal seems overkill. Looking into mm_fault_error(), it seems they
> are different for x86 and powerpc.
> 
> X86:
> 
> mm_fault_error(struct pt_regs *regs, unsigned long error_code,
>unsigned long address, struct vm_area_struct *vma,
>unsigned int fault)
> 
> powerpc:
> 
> mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
> 
> Even in case of X86, I guess we would have reference to the faulting
> VMA (after the SRCU search) which can be used to call this function
> directly.

Yes I think this is doable in the case of x86.

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Laurent Dufour

On 29/08/2017 14:04, Peter Zijlstra wrote:
> On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
>> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
>>>> +
>>>> +  if (unlikely(!vma->anon_vma))
>>>> +  goto unlock;
>>>
>>> It deserves a comment.
>>
>> You're right I'll add it in the next version.
>> For the record, the root cause is that __anon_vma_prepare() requires the
>> mmap_sem to be held because vm_next and vm_prev must be safe.
> 
> But should that test not be:
> 
>   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
>   goto unlock;
> 
> Because !anon vmas will never have ->anon_vma set and you don't want to
> exclude those.

Yes in the case we later allow non anonymous vmas to be handled.
Currently only anonymous vmas are supported so the check is good enough,
isn't it ?

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Laurent Dufour

On 29/08/2017 14:04, Peter Zijlstra wrote:
> On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
>> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
>>>> +
>>>> +  if (unlikely(!vma->anon_vma))
>>>> +  goto unlock;
>>>
>>> It deserves a comment.
>>
>> You're right I'll add it in the next version.
>> For the record, the root cause is that __anon_vma_prepare() requires the
>> mmap_sem to be held because vm_next and vm_prev must be safe.
> 
> But should that test not be:
> 
>   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
>   goto unlock;
> 
> Because !anon vmas will never have ->anon_vma set and you don't want to
> exclude those.

Yes in the case we later allow non anonymous vmas to be handled.
Currently only anonymous vmas are supported so the check is good enough,
isn't it ?

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Laurent Dufour

On 27/08/2017 02:18, Kirill A. Shutemov wrote:
> On Fri, Aug 18, 2017 at 12:05:13AM +0200, Laurent Dufour wrote:
>> +/*
>> + * vm_normal_page() adds some processing which should be done while
>> + * holding the mmap_sem.
>> + */
>> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
>> + unsigned int flags)
>> +{
>> +struct vm_fault vmf = {
>> +.address = address,
>> +};
>> +pgd_t *pgd;
>> +p4d_t *p4d;
>> +pud_t *pud;
>> +pmd_t *pmd;
>> +int dead, seq, idx, ret = VM_FAULT_RETRY;
>> +struct vm_area_struct *vma;
>> +struct mempolicy *pol;
>> +
>> +/* Clear flags that may lead to release the mmap_sem to retry */
>> +flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
>> +flags |= FAULT_FLAG_SPECULATIVE;
>> +
>> +idx = srcu_read_lock(&vma_srcu);
>> +vma = find_vma_srcu(mm, address);
>> +if (!vma)
>> +goto unlock;
>> +
>> +/*
>> + * Validate the VMA found by the lockless lookup.
>> + */
>> +dead = RB_EMPTY_NODE(&vma->vm_rb);
>> +seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> 
>> seqlock,vma_rb_erase() */
>> +if ((seq & 1) || dead)
>> +goto unlock;
>> +
>> +/*
>> + * Can't call vm_ops service as we don't know what they would do
>> + * with the VMA.
>> + * This include huge page from hugetlbfs.
>> + */
>> +if (vma->vm_ops)
>> +goto unlock;
> 
> I think we need to have a way to white-list safe ->vm_ops.

Hi Kirill,
Yes this would be a good optimization done in a next step.

>> +
>> +if (unlikely(!vma->anon_vma))
>> +goto unlock;
> 
> It deserves a comment.

You're right I'll add it in the next version.
For the record, the root cause is that __anon_vma_prepare() requires the
mmap_sem to be held because vm_next and vm_prev must be safe.


>> +
>> +vmf.vma_flags = READ_ONCE(vma->vm_flags);
>> +vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
>> +
>> +/* Can't call userland page fault handler in the speculative path */
>> +if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
>> +goto unlock;
>> +
>> +/*
>> + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
>> + * are not compatible with the speculative page fault processing.
>> + */
>> +pol = __get_vma_policy(vma, address);
>> +if (!pol)
>> +pol = get_task_policy(current);
>> +if (pol && pol->mode == MPOL_INTERLEAVE)
>> +goto unlock;
>> +
>> +if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
>> +/*
>> + * This could be detected by the check address against VMA's
>> + * boundaries but we want to trace it as not supported instead
>> + * of changed.
>> + */
>> +goto unlock;
>> +
>> +if (address < READ_ONCE(vma->vm_start)
>> +|| READ_ONCE(vma->vm_end) <= address)
>> +goto unlock;
>> +
>> +/*
>> + * The three following checks are copied from access_error from
>> + * arch/x86/mm/fault.c
>> + */
>> +if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>> +   flags & FAULT_FLAG_INSTRUCTION,
>> +   flags & FAULT_FLAG_REMOTE))
>> +goto unlock;
>> +
>> +/* This is one is required to check that the VMA has write access set */
>> +if (flags & FAULT_FLAG_WRITE) {
>> +if (unlikely(!(vmf.vma_flags & VM_WRITE)))
>> +goto unlock;
>> +} else {
>> +if (unlikely(!(vmf.vma_flags & (VM_READ | VM_EXEC | VM_WRITE))))
>> +goto unlock;
>> +}
>> +
>> +/*
>> + * Do a speculative lookup of the PTE entry.
>> + */
>> +local_irq_disable();
>> +pgd = pgd_offset(mm, address);
>> +if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>> +goto out_walk;
>> +
>> +p4d = p4d_alloc(mm, pgd, address);
>> +if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
>> +goto out_walk;
>> +
>> +pud = pud_alloc(mm, p4d, address);
>> +if (pud_none(*pud) || unlikely(pud_bad(*pud)))
>> +goto out_walk;
>>

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Laurent Dufour

On 27/08/2017 02:18, Kirill A. Shutemov wrote:
> On Fri, Aug 18, 2017 at 12:05:13AM +0200, Laurent Dufour wrote:
>> +/*
>> + * vm_normal_page() adds some processing which should be done while
>> + * holding the mmap_sem.
>> + */
>> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
>> + unsigned int flags)
>> +{
>> +struct vm_fault vmf = {
>> +.address = address,
>> +};
>> +pgd_t *pgd;
>> +p4d_t *p4d;
>> +pud_t *pud;
>> +pmd_t *pmd;
>> +int dead, seq, idx, ret = VM_FAULT_RETRY;
>> +struct vm_area_struct *vma;
>> +struct mempolicy *pol;
>> +
>> +/* Clear flags that may lead to release the mmap_sem to retry */
>> +flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
>> +flags |= FAULT_FLAG_SPECULATIVE;
>> +
>> +idx = srcu_read_lock(&vma_srcu);
>> +vma = find_vma_srcu(mm, address);
>> +if (!vma)
>> +goto unlock;
>> +
>> +/*
>> + * Validate the VMA found by the lockless lookup.
>> + */
>> +dead = RB_EMPTY_NODE(&vma->vm_rb);
>> +seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> 
>> seqlock,vma_rb_erase() */
>> +if ((seq & 1) || dead)
>> +goto unlock;
>> +
>> +/*
>> + * Can't call vm_ops service as we don't know what they would do
>> + * with the VMA.
>> + * This include huge page from hugetlbfs.
>> + */
>> +if (vma->vm_ops)
>> +goto unlock;
> 
> I think we need to have a way to white-list safe ->vm_ops.

Hi Kirill,
Yes this would be a good optimization done in a next step.

>> +
>> +if (unlikely(!vma->anon_vma))
>> +goto unlock;
> 
> It deserves a comment.

You're right I'll add it in the next version.
For the record, the root cause is that __anon_vma_prepare() requires the
mmap_sem to be held because vm_next and vm_prev must be safe.


>> +
>> +vmf.vma_flags = READ_ONCE(vma->vm_flags);
>> +vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
>> +
>> +/* Can't call userland page fault handler in the speculative path */
>> +if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
>> +goto unlock;
>> +
>> +/*
>> + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
>> + * are not compatible with the speculative page fault processing.
>> + */
>> +pol = __get_vma_policy(vma, address);
>> +if (!pol)
>> +pol = get_task_policy(current);
>> +if (pol && pol->mode == MPOL_INTERLEAVE)
>> +goto unlock;
>> +
>> +if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
>> +/*
>> + * This could be detected by the check address against VMA's
>> + * boundaries but we want to trace it as not supported instead
>> + * of changed.
>> + */
>> +goto unlock;
>> +
>> +if (address < READ_ONCE(vma->vm_start)
>> +|| READ_ONCE(vma->vm_end) <= address)
>> +goto unlock;
>> +
>> +/*
>> + * The three following checks are copied from access_error from
>> + * arch/x86/mm/fault.c
>> + */
>> +if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>> +   flags & FAULT_FLAG_INSTRUCTION,
>> +   flags & FAULT_FLAG_REMOTE))
>> +goto unlock;
>> +
>> +/* This is one is required to check that the VMA has write access set */
>> +if (flags & FAULT_FLAG_WRITE) {
>> +if (unlikely(!(vmf.vma_flags & VM_WRITE)))
>> +goto unlock;
>> +} else {
>> +if (unlikely(!(vmf.vma_flags & (VM_READ | VM_EXEC | VM_WRITE))))
>> +goto unlock;
>> +}
>> +
>> +/*
>> + * Do a speculative lookup of the PTE entry.
>> + */
>> +local_irq_disable();
>> +pgd = pgd_offset(mm, address);
>> +if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>> +goto out_walk;
>> +
>> +p4d = p4d_alloc(mm, pgd, address);
>> +if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
>> +goto out_walk;
>> +
>> +pud = pud_alloc(mm, p4d, address);
>> +if (pud_none(*pud) || unlikely(pud_bad(*pud)))
>> +goto out_walk;
>>

Re: [PATCH v2 00/20] Speculative page faults

2017-08-25 Thread Laurent Dufour

On 21/08/2017 08:28, Anshuman Khandual wrote:
> On 08/18/2017 03:34 AM, Laurent Dufour wrote:
>> This is a port on kernel 4.13 of the work done by Peter Zijlstra to
>> handle page fault without holding the mm semaphore [1].
>>
>> The idea is to try to handle user space page faults without holding the
>> mmap_sem. This should allow better concurrency for massively threaded
>> process since the page fault handler will not wait for other threads memory
>> layout change to be done, assuming that this change is done in another part
>> of the process's memory space. This type page fault is named speculative
>> page fault. If the speculative page fault fails because of a concurrency is
>> detected or because underlying PMD or PTE tables are not yet allocating, it
>> is failing its processing and a classic page fault is then tried.
>>
>> The speculative page fault (SPF) has to look for the VMA matching the fault
>> address without holding the mmap_sem, so the VMA list is now managed using
>> SRCU allowing lockless walking. The only impact would be the deferred file
>> derefencing in the case of a file mapping, since the file pointer is
>> released once the SRCU cleaning is done.  This patch relies on the change
>> done recently by Paul McKenney in SRCU which now runs a callback per CPU
>> instead of per SRCU structure [2].
>>
>> The VMA's attributes checked during the speculative page fault processing
>> have to be protected against parallel changes. This is done by using a per
>> VMA sequence lock. This sequence lock allows the speculative page fault
>> handler to fast check for parallel changes in progress and to abort the
>> speculative page fault in that case.
>>
>> Once the VMA is found, the speculative page fault handler would check for
>> the VMA's attributes to verify that the page fault has to be handled
>> correctly or not. Thus the VMA is protected through a sequence lock which
>> allows fast detection of concurrent VMA changes. If such a change is
>> detected, the speculative page fault is aborted and a *classic* page fault
>> is tried.  VMA sequence locks are added when VMA attributes which are
>> checked during the page fault are modified.
>>
>> When the PTE is fetched, the VMA is checked to see if it has been changed,
>> so once the page table is locked, the VMA is valid, so any other changes
>> leading to touching this PTE will need to lock the page table, so no
>> parallel change is possible at this time.
>>
>> Compared to the Peter's initial work, this series introduces a spin_trylock
>> when dealing with speculative page fault. This is required to avoid dead
>> lock when handling a page fault while a TLB invalidate is requested by an
>> other CPU holding the PTE. Another change due to a lock dependency issue
>> with mapping->i_mmap_rwsem.
>>
>> In addition some VMA field values which are used once the PTE is unlocked
>> at the end the page fault path are saved into the vm_fault structure to
>> used the values matching the VMA at the time the PTE was locked.
>>
>> This series builds on top of v4.13-rc5 and is functional on x86 and
>> PowerPC.
>>
>> Tests have been made using a large commercial in-memory database on a
>> PowerPC system with 752 CPU using RFC v5. The results are very encouraging
>> since the loading of the 2TB database was faster by 14% with the
>> speculative page fault.
>>
> 
> You specifically mention loading as most of the page faults will
> happen at that time and then the working set will settle down with
> very less page faults there after ? That means unless there is
> another wave of page faults we wont notice performance improvement
> during the runtime.

I only captured performance statistics during the database loading; after
that, since the database was not stimulated, no page faults were generated.
Further tests will be made while the database is running, but I don't have
the framework to do so right now.

> 
>> Using ebizzy test [3], which spreads a lot of threads, the result are good
>> when running on both a large or a small system. When using kernbench, the
> 
> The performance improvements are greater as there is a lot of creation
> and destruction of anon mappings which generates constant flow of page
> faults to be handled.
> 
>> result are quite similar which expected as not so much multi threaded
>> processes are involved. But there is no performance degradation neither
>> which is good.
> 
> If we compile with 'make -j N' there would be a lot of threads but I
> guess the problem is SPF does not support handling file mapping IIUC
> whi

Re: [PATCH v2 00/20] Speculative page faults

2017-08-25 Thread Laurent Dufour

On 21/08/2017 08:28, Anshuman Khandual wrote:
> On 08/18/2017 03:34 AM, Laurent Dufour wrote:
>> This is a port on kernel 4.13 of the work done by Peter Zijlstra to
>> handle page fault without holding the mm semaphore [1].
>>
>> The idea is to try to handle user space page faults without holding the
>> mmap_sem. This should allow better concurrency for massively threaded
>> process since the page fault handler will not wait for other threads memory
>> layout change to be done, assuming that this change is done in another part
>> of the process's memory space. This type page fault is named speculative
>> page fault. If the speculative page fault fails because of a concurrency is
>> detected or because underlying PMD or PTE tables are not yet allocating, it
>> is failing its processing and a classic page fault is then tried.
>>
>> The speculative page fault (SPF) has to look for the VMA matching the fault
>> address without holding the mmap_sem, so the VMA list is now managed using
>> SRCU allowing lockless walking. The only impact would be the deferred file
>> dereferencing in the case of a file mapping, since the file pointer is
>> released once the SRCU cleaning is done.  This patch relies on the change
>> done recently by Paul McKenney in SRCU which now runs a callback per CPU
>> instead of per SRCU structure [1].
>>
>> The VMA's attributes checked during the speculative page fault processing
>> have to be protected against parallel changes. This is done by using a per
>> VMA sequence lock. This sequence lock allows the speculative page fault
>> handler to fast check for parallel changes in progress and to abort the
>> speculative page fault in that case.
>>
>> Once the VMA is found, the speculative page fault handler would check for
>> the VMA's attributes to verify that the page fault has to be handled
>> correctly or not. Thus the VMA is protected through a sequence lock which
>> allows fast detection of concurrent VMA changes. If such a change is
>> detected, the speculative page fault is aborted and a *classic* page fault
>> is tried.  VMA sequence locks are added when VMA attributes which are
>> checked during the page fault are modified.
>>
>> When the PTE is fetched, the VMA is checked to see if it has been changed,
>> so once the page table is locked, the VMA is valid, so any other changes
>> leading to touching this PTE will need to lock the page table, so no
>> parallel change is possible at this time.
>>
>> Compared to the Peter's initial work, this series introduces a spin_trylock
>> when dealing with speculative page fault. This is required to avoid dead
>> lock when handling a page fault while a TLB invalidate is requested by an
>> other CPU holding the PTE. Another change due to a lock dependency issue
>> with mapping->i_mmap_rwsem.
>>
>> In addition some VMA field values which are used once the PTE is unlocked
>> at the end of the page fault path are saved into the vm_fault structure to
>> use the values matching the VMA at the time the PTE was locked.
>>
>> This series builds on top of v4.13-rc5 and is functional on x86 and
>> PowerPC.
>>
>> Tests have been made using a large commercial in-memory database on a
>> PowerPC system with 752 CPU using RFC v5. The results are very encouraging
>> since the loading of the 2TB database was faster by 14% with the
>> speculative page fault.
>>
> 
> You specifically mention loading as most of the page faults will
> happen at that time and then the working set will settle down with
> very less page faults there after ? That means unless there is
> another wave of page faults we wont notice performance improvement
> during the runtime.

I just captured performance statistics during the database loading; then,
since the database was not stimulated, there were no page faults generated.
Further tests will be made while the database is running but I didn't have
the framework to do so right now.

> 
>> Using ebizzy test [3], which spreads a lot of threads, the result are good
>> when running on both a large or a small system. When using kernbench, the
> 
> The performance improvements are greater as there is a lot of creation
> and destruction of anon mappings which generates constant flow of page
> faults to be handled.
> 
>> result are quite similar which expected as not so much multi threaded
>> processes are involved. But there is no performance degradation neither
>> which is good.
> 
> If we compile with 'make -j N' there would be a lot of threads but I
> guess the problem is SPF does not support handling file mapping IIUC
> whi

Re: [PATCH v2 18/20] perf tools: Add support for the SPF perf event

2017-08-25 Thread Laurent Dufour

On 21/08/2017 10:48, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> Add support for the new speculative faults event.
>>
>> Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
>> ---
>>  tools/include/uapi/linux/perf_event.h | 1 +
>>  tools/perf/util/evsel.c   | 1 +
>>  tools/perf/util/parse-events.c| 4 
>>  tools/perf/util/parse-events.l| 1 +
>>  tools/perf/util/python.c  | 1 +
>>  5 files changed, 8 insertions(+)
>>
>> diff --git a/tools/include/uapi/linux/perf_event.h 
>> b/tools/include/uapi/linux/perf_event.h
>> index b1c0b187acfe..3043ec0988e9 100644
>> --- a/tools/include/uapi/linux/perf_event.h
>> +++ b/tools/include/uapi/linux/perf_event.h
>> @@ -111,6 +111,7 @@ enum perf_sw_ids {
>>  PERF_COUNT_SW_EMULATION_FAULTS  = 8,
>>  PERF_COUNT_SW_DUMMY = 9,
>>  PERF_COUNT_SW_BPF_OUTPUT= 10,
>> +PERF_COUNT_SW_SPF_DONE  = 11,
> 
> Right, just one event for the success case. 'DONE' is redundant, only
> 'SPF' should be fine IMHO.
> 

Fair enough, I'll rename it PERF_COUNT_SW_SPF.

Thanks,
Laurent.

Re: [PATCH v2 18/20] perf tools: Add support for the SPF perf event

2017-08-25 Thread Laurent Dufour

On 21/08/2017 10:48, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> Add support for the new speculative faults event.
>>
>> Signed-off-by: Laurent Dufour 
>> ---
>>  tools/include/uapi/linux/perf_event.h | 1 +
>>  tools/perf/util/evsel.c   | 1 +
>>  tools/perf/util/parse-events.c| 4 
>>  tools/perf/util/parse-events.l| 1 +
>>  tools/perf/util/python.c  | 1 +
>>  5 files changed, 8 insertions(+)
>>
>> diff --git a/tools/include/uapi/linux/perf_event.h 
>> b/tools/include/uapi/linux/perf_event.h
>> index b1c0b187acfe..3043ec0988e9 100644
>> --- a/tools/include/uapi/linux/perf_event.h
>> +++ b/tools/include/uapi/linux/perf_event.h
>> @@ -111,6 +111,7 @@ enum perf_sw_ids {
>>  PERF_COUNT_SW_EMULATION_FAULTS  = 8,
>>  PERF_COUNT_SW_DUMMY = 9,
>>  PERF_COUNT_SW_BPF_OUTPUT= 10,
>> +PERF_COUNT_SW_SPF_DONE  = 11,
> 
> Right, just one event for the success case. 'DONE' is redundant, only
> 'SPF' should be fine IMHO.
> 

Fair enough, I'll rename it PERF_COUNT_SW_SPF.

Thanks,
Laurent.

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-25 Thread Laurent Dufour

On 20/08/2017 14:11, Sergey Senozhatsky wrote:
> On (08/18/17 00:05), Laurent Dufour wrote:
> [..]
>> +/*
>> + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
>> + * are not compatible with the speculative page fault processing.
>> + */
>> +pol = __get_vma_policy(vma, address);
>> +if (!pol)
>> +pol = get_task_policy(current);
>> +if (pol && pol->mode == MPOL_INTERLEAVE)
>> +goto unlock;
> 
> include/linux/mempolicy.h defines
> 
> struct mempolicy *get_task_policy(struct task_struct *p);
> struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
>   unsigned long addr);
> 
> only for CONFIG_NUMA configs.

Thanks Sergey, I'll add #ifdef around this block.

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-25 Thread Laurent Dufour

On 20/08/2017 14:11, Sergey Senozhatsky wrote:
> On (08/18/17 00:05), Laurent Dufour wrote:
> [..]
>> +/*
>> + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
>> + * are not compatible with the speculative page fault processing.
>> + */
>> +pol = __get_vma_policy(vma, address);
>> +if (!pol)
>> +pol = get_task_policy(current);
>> +if (pol && pol->mode == MPOL_INTERLEAVE)
>> +goto unlock;
> 
> include/linux/mempolicy.h defines
> 
> struct mempolicy *get_task_policy(struct task_struct *p);
> struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
>   unsigned long addr);
> 
> only for CONFIG_NUMA configs.

Thanks Sergey, I'll add #ifdef around this block.

[PATCH v2 02/20] mm: Prepare for FAULT_FLAG_SPECULATIVE

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

When speculating faults (without holding mmap_sem) we need to validate
that the vma against which we loaded pages is still valid when we're
ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock() which can fail in case we find the VMA changed
since we started the fault.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Port to 4.12 kernel]
[Remove the comment about the fault_env structure which has been
 implemented as the vm_fault structure in the kernel]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm.h |  1 +
 mm/memory.c| 55 ++
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..8763ec96dc78 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -286,6 +286,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER0x40/* The fault originated in 
userspace */
 #define FAULT_FLAG_REMOTE  0x80/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction 
fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding 
mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
diff --git a/mm/memory.c b/mm/memory.c
index 36609c082256..3ed1b00ca841 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2269,6 +2269,12 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, 
&vmf->ptl);
+   return true;
+}
+
 /*
  * Handle the case of a page which we actually need to copy to a new page.
  *
@@ -2296,6 +2302,7 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+   int ret = VM_FAULT_OOM;
 
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2323,7 +2330,11 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
-   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   mem_cgroup_cancel_charge(new_page, memcg, false);
+   ret = VM_FAULT_RETRY;
+   goto oom_free_new;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2411,7 +2422,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 oom:
if (old_page)
put_page(old_page);
-   return VM_FAULT_OOM;
+   return ret;
 }
 
 /**
@@ -2432,8 +2443,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-  &vmf->ptl);
+   if (!pte_map_lock(vmf))
+   return VM_FAULT_RETRY;
/*
 * We might have raced with another page fault while we released the
 * pte_offset_map_lock.
@@ -2551,8 +2562,11 @@ static int do_wp_page(struct vm_fault *vmf)
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
lock_page(vmf->page);
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   unlock_page(vmf->page);
+   put_page(vmf->page);
+   return VM_FAULT_RETRY;
+   }
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2710,8 +2724,10 @@ int do_swap_page(struct vm_fault *vmf)
 * Back out if somebody else faulted in this pte
 * while we released the pte lock.
 */
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   ret

[PATCH v2 02/20] mm: Prepare for FAULT_FLAG_SPECULATIVE

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra 

When speculating faults (without holding mmap_sem) we need to validate
that the vma against which we loaded pages is still valid when we're
ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock() which can fail in case we find the VMA changed
since we started the fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Remove the comment about the fault_env structure which has been
 implemented as the vm_fault structure in the kernel]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  1 +
 mm/memory.c| 55 ++
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..8763ec96dc78 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -286,6 +286,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER0x40/* The fault originated in 
userspace */
 #define FAULT_FLAG_REMOTE  0x80/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction 
fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding 
mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
diff --git a/mm/memory.c b/mm/memory.c
index 36609c082256..3ed1b00ca841 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2269,6 +2269,12 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, 
&vmf->ptl);
+   return true;
+}
+
 /*
  * Handle the case of a page which we actually need to copy to a new page.
  *
@@ -2296,6 +2302,7 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+   int ret = VM_FAULT_OOM;
 
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2323,7 +2330,11 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
-   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   mem_cgroup_cancel_charge(new_page, memcg, false);
+   ret = VM_FAULT_RETRY;
+   goto oom_free_new;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2411,7 +2422,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 oom:
if (old_page)
put_page(old_page);
-   return VM_FAULT_OOM;
+   return ret;
 }
 
 /**
@@ -2432,8 +2443,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-  &vmf->ptl);
+   if (!pte_map_lock(vmf))
+   return VM_FAULT_RETRY;
/*
 * We might have raced with another page fault while we released the
 * pte_offset_map_lock.
@@ -2551,8 +2562,11 @@ static int do_wp_page(struct vm_fault *vmf)
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
lock_page(vmf->page);
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   unlock_page(vmf->page);
+   put_page(vmf->page);
+   return VM_FAULT_RETRY;
+   }
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2710,8 +2724,10 @@ int do_swap_page(struct vm_fault *vmf)
 * Back out if somebody else faulted in this pte
 * while we released the pte lock.
 */
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   return VM_FAULT_RETRY;
+   }
if (likely(pte_same(*vmf->pte, vmf->o

[PATCH v2 04/20] mm: VMA sequence count

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence
counts such that we can easily test if a VMA is changed.

The unmap_page_range() one allows us to make assumptions about
page-tables; when we find the seqcount hasn't changed we can assume
page-tables are still valid.

The flip side is that we cannot distinguish between a vma_adjust() and
the unmap_page_range() -- where with the former we could have
re-checked the vma bounds against the address.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Port to 4.12 kernel]
[Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm_types.h |  1 +
 mm/memory.c  |  2 ++
 mm/mmap.c| 21 ++---
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cadee0a3508..642aad26b32f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -342,6 +342,7 @@ struct vm_area_struct {
struct mempolicy *vm_policy;/* NUMA policy for the VMA */
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+   seqcount_t vm_sequence;
 } __randomize_layout;
 
 struct core_thread {
diff --git a/mm/memory.c b/mm/memory.c
index fa598889eb0e..4a2736fe2ef6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1408,6 +1408,7 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long next;
 
BUG_ON(addr >= end);
+   write_seqcount_begin(&vma->vm_sequence);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -1417,6 +1418,7 @@ void unmap_page_range(struct mmu_gather *tlb,
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
+   write_seqcount_end(&vma->vm_sequence);
 }
 
 
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..140b22136cb7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -557,6 +557,8 @@ void __vma_link_rb(struct mm_struct *mm, struct 
vm_area_struct *vma,
else
mm->highest_vm_end = vm_end_gap(vma);
 
+   seqcount_init(&vma->vm_sequence);
+
/*
 * vma->vm_prev wasn't known when we followed the rbtree to find the
 * correct insertion point for that vma. As a result, we could not
@@ -798,6 +800,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
}
}
 
+   write_seqcount_begin(&vma->vm_sequence);
+   if (next && next != vma)
+   write_seqcount_begin_nested(&next->vm_sequence,
+   SINGLE_DEPTH_NESTING);
+
anon_vma = vma->anon_vma;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
@@ -902,6 +909,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
mm->map_count--;
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
+   write_seqcount_end(&next->vm_sequence);
/*
 * In mprotect's case 6 (see comments on vma_merge),
 * we must remove another next too. It would clutter
@@ -931,11 +939,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned 
long start,
if (remove_next == 2) {
remove_next = 1;
end = next->vm_end;
+   write_seqcount_end(&vma->vm_sequence);
goto again;
-   }
-   else if (next)
+   } else if (next) {
+   if (next != vma)
+   write_seqcount_begin_nested(&next->vm_sequence,
+   
SINGLE_DEPTH_NESTING);
vma_gap_update(next);
-   else {
+   } else {
/*
 * If remove_next == 2 we obviously can't
 * reach this path.
@@ -961,6 +972,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
if (insert && file)
uprobe_mmap(insert);
 
+   if (next && next != vma)
+   write_seqcount_end(&next->vm_sequence);
+   write_seqcount_end(&vma->vm_sequence);
+
validate_mm(mm);
 
return 0;
-- 
2.7.4

[PATCH v2 04/20] mm: VMA sequence count

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra 

Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence
counts such that we can easily test if a VMA is changed.

The unmap_page_range() one allows us to make assumptions about
page-tables; when we find the seqcount hasn't changed we can assume
page-tables are still valid.

The flip side is that we cannot distinguish between a vma_adjust() and
the unmap_page_range() -- where with the former we could have
re-checked the vma bounds against the address.

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm_types.h |  1 +
 mm/memory.c  |  2 ++
 mm/mmap.c| 21 ++---
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cadee0a3508..642aad26b32f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -342,6 +342,7 @@ struct vm_area_struct {
struct mempolicy *vm_policy;/* NUMA policy for the VMA */
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+   seqcount_t vm_sequence;
 } __randomize_layout;
 
 struct core_thread {
diff --git a/mm/memory.c b/mm/memory.c
index fa598889eb0e..4a2736fe2ef6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1408,6 +1408,7 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long next;
 
BUG_ON(addr >= end);
+   write_seqcount_begin(&vma->vm_sequence);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -1417,6 +1418,7 @@ void unmap_page_range(struct mmu_gather *tlb,
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
+   write_seqcount_end(&vma->vm_sequence);
 }
 
 
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..140b22136cb7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -557,6 +557,8 @@ void __vma_link_rb(struct mm_struct *mm, struct 
vm_area_struct *vma,
else
mm->highest_vm_end = vm_end_gap(vma);
 
+   seqcount_init(&vma->vm_sequence);
+
/*
 * vma->vm_prev wasn't known when we followed the rbtree to find the
 * correct insertion point for that vma. As a result, we could not
@@ -798,6 +800,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
}
}
 
+   write_seqcount_begin(&vma->vm_sequence);
+   if (next && next != vma)
+   write_seqcount_begin_nested(&next->vm_sequence,
+   SINGLE_DEPTH_NESTING);
+
anon_vma = vma->anon_vma;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
@@ -902,6 +909,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
mm->map_count--;
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
+   write_seqcount_end(&next->vm_sequence);
/*
 * In mprotect's case 6 (see comments on vma_merge),
 * we must remove another next too. It would clutter
@@ -931,11 +939,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned 
long start,
if (remove_next == 2) {
remove_next = 1;
end = next->vm_end;
+   write_seqcount_end(&vma->vm_sequence);
goto again;
-   }
-   else if (next)
+   } else if (next) {
+   if (next != vma)
+   write_seqcount_begin_nested(&next->vm_sequence,
+   
SINGLE_DEPTH_NESTING);
vma_gap_update(next);
-   else {
+   } else {
/*
 * If remove_next == 2 we obviously can't
 * reach this path.
@@ -961,6 +972,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
if (insert && file)
uprobe_mmap(insert);
 
+   if (next && next != vma)
+   write_seqcount_end(&next->vm_sequence);
+   write_seqcount_end(&vma->vm_sequence);
+
validate_mm(mm);
 
return 0;
-- 
2.7.4

[PATCH v2 06/20] mm: RCU free VMAs

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

Manage the VMAs with SRCU such that we can do a lockless VMA lookup.

We put the fput(vma->vm_file) in the SRCU callback, this keeps files
valid during speculative faults, this is possible due to the delayed
fput work by Al Viro -- do we need srcu_barrier() in unmount
someplace?

We guard the mm_rb tree with a seqlock (this could be a seqcount but
we'd have to disable preemption around the write side in order to make
the retry loop in __read_seqcount_begin() work) such that we can know
if the rb tree walk was correct. We cannot trust the result of a
lockless tree walk in the face of concurrent tree rotations; although
we can trust on the termination of such walks -- tree rotations
guarantee the end result is a tree again after all.

Furthermore, we rely on the WMB implied by the
write_seqlock/count_begin() to separate the VMA initialization and the
publishing stores, analogous to the RELEASE in rcu_assign_pointer().
We also rely on the RMB from read_seqretry() to separate the vma load
from further loads like the smp_read_barrier_depends() in regular
RCU.

We must not touch the vmacache while doing SRCU lookups as that is not
properly serialized against changes. We update gap information after
publishing the VMA, but A) we don't use that and B) the seqlock
read side would fix that anyhow.

We clear vma->vm_rb for nodes removed from the vma tree such that we
can easily detect such 'dead' nodes, we rely on the WMB from
write_sequnlock() to separate the tree removal and clearing the node.

Provide find_vma_srcu() which wraps the required magic.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Remove the warnings in description about the SRCU global lock which
 has been removed now]
[Rename vma_is_dead() to vma_has_changed() and move its adding to the next
 patch]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm_types.h |   2 +
 kernel/fork.c|   1 +
 mm/init-mm.c |   1 +
 mm/internal.h|   5 +++
 mm/mmap.c| 100 +++
 5 files changed, 83 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 642aad26b32f..f3851b250fde 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -343,6 +343,7 @@ struct vm_area_struct {
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
seqcount_t vm_sequence;
+   struct rcu_head vm_rcu_head;
 } __randomize_layout;
 
 struct core_thread {
@@ -360,6 +361,7 @@ struct kioctx_table;
 struct mm_struct {
struct vm_area_struct *mmap;/* list of VMAs */
struct rb_root mm_rb;
+   seqlock_t mm_seq;
u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index e075b7780421..f28aa54c668c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -791,6 +791,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+   seqlock_init(&mm->mm_seq);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f00f34..2b1fa061684f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -16,6 +16,7 @@
 
 struct mm_struct init_mm = {
.mm_rb  = RB_ROOT,
+   .mm_seq = __SEQLOCK_UNLOCKED(init_mm.mm_seq),
.pgd= swapper_pg_dir,
.mm_users   = ATOMIC_INIT(2),
.mm_count   = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..736540f15936 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,11 @@ void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 
+extern struct srcu_struct vma_srcu;
+
+extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,
+   unsigned long addr);
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index b480043e38fb..34a7f1bdffe4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -159,6 +159,23 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
 }
 
+DEFINE_SRCU(vma_srcu);
+
+static void __free_vma(struct rcu_head *head)
+{
+   struct vm_area_struct *vma =
+   container_of(head, struct vm_area_struct, vm_rcu_head);
+
+   if (vma->vm_file)
+   fput(vma->vm_file);
+   kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void free_vma(struct vm_area_struct *vma)
+{
+   call_srcu(&vma_srcu, &vma->vm_rcu_head, __free_vma);
+}
+
 /*
  * Close a vm structure and free it, returning the next.

[PATCH v2 06/20] mm: RCU free VMAs

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra 

Manage the VMAs with SRCU such that we can do a lockless VMA lookup.

We put the fput(vma->vm_file) in the SRCU callback, this keeps files
valid during speculative faults, this is possible due to the delayed
fput work by Al Viro -- do we need srcu_barrier() in unmount
someplace?

We guard the mm_rb tree with a seqlock (this could be a seqcount but
we'd have to disable preemption around the write side in order to make
the retry loop in __read_seqcount_begin() work) such that we can know
if the rb tree walk was correct. We cannot trust the result of a
lockless tree walk in the face of concurrent tree rotations; although
we can trust on the termination of such walks -- tree rotations
guarantee the end result is a tree again after all.

Furthermore, we rely on the WMB implied by the
write_seqlock/count_begin() to separate the VMA initialization and the
publishing stores, analogous to the RELEASE in rcu_assign_pointer().
We also rely on the RMB from read_seqretry() to separate the vma load
from further loads like the smp_read_barrier_depends() in regular
RCU.

We must not touch the vmacache while doing SRCU lookups as that is not
properly serialized against changes. We update gap information after
publishing the VMA, but A) we don't use that and B) the seqlock
read side would fix that anyhow.

We clear vma->vm_rb for nodes removed from the vma tree such that we
can easily detect such 'dead' nodes, we rely on the WMB from
write_sequnlock() to separate the tree removal and clearing the node.

Provide find_vma_srcu() which wraps the required magic.

Signed-off-by: Peter Zijlstra (Intel) 

[Remove the warnings in description about the SRCU global lock which
 has been removed now]
[Rename vma_is_dead() to vma_has_changed() and move its adding to the next
 patch]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm_types.h |   2 +
 kernel/fork.c|   1 +
 mm/init-mm.c |   1 +
 mm/internal.h|   5 +++
 mm/mmap.c| 100 +++
 5 files changed, 83 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 642aad26b32f..f3851b250fde 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -343,6 +343,7 @@ struct vm_area_struct {
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
seqcount_t vm_sequence;
+   struct rcu_head vm_rcu_head;
 } __randomize_layout;
 
 struct core_thread {
@@ -360,6 +361,7 @@ struct kioctx_table;
 struct mm_struct {
struct vm_area_struct *mmap;/* list of VMAs */
struct rb_root mm_rb;
+   seqlock_t mm_seq;
u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index e075b7780421..f28aa54c668c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -791,6 +791,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+   seqlock_init(&mm->mm_seq);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f00f34..2b1fa061684f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -16,6 +16,7 @@
 
 struct mm_struct init_mm = {
.mm_rb  = RB_ROOT,
+   .mm_seq = __SEQLOCK_UNLOCKED(init_mm.mm_seq),
.pgd= swapper_pg_dir,
.mm_users   = ATOMIC_INIT(2),
.mm_count   = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..736540f15936 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,11 @@ void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 
+extern struct srcu_struct vma_srcu;
+
+extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,
+   unsigned long addr);
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index b480043e38fb..34a7f1bdffe4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -159,6 +159,23 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
 }
 
+DEFINE_SRCU(vma_srcu);
+
+static void __free_vma(struct rcu_head *head)
+{
+   struct vm_area_struct *vma =
+   container_of(head, struct vm_area_struct, vm_rcu_head);
+
+   if (vma->vm_file)
+   fput(vma->vm_file);
+   kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void free_vma(struct vm_area_struct *vma)
+{
+   call_srcu(&vma_srcu, &vma->vm_rcu_head, __free_vma);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -169,10 +186,8 @@ static struct vm_area_struct *remove_vma(struct 
vm_a

[PATCH v2 08/20] mm: Protect SPF handler against anon_vma changes

2017-08-17 Thread Laurent Dufour

The speculative page fault handler must be protected against anon_vma
changes. This is because page_add_new_anon_rmap() is called during the
speculative path.

In addition, don't try a speculative page fault if the VMA doesn't have an
anon_vma structure allocated, because its allocation should be
protected by the mmap_sem.

In __vma_adjust() when importer->anon_vma is set, there is no need to
protect against speculative page faults since speculative page fault
is aborted if the vma->anon_vma is not set.

When calling page_add_new_anon_rmap() vma->anon_vma is necessarily
valid since we checked for it when locking the pte and the anon_vma is
removed once the pte is unlocked. So even if the speculative page
fault handler is running concurrently with do_unmap(), as the pte is
locked in unmap_region() - through unmap_vmas() - and the anon_vma
unlinked later, because we check for the vma sequence counter which is
updated in unmap_page_range() before locking the pte, and then in
free_pgtables() so when locking the pte the change will be detected.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 mm/memory.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index da3bd07bb052..68e4fdcce692 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -615,7 +615,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * Hide vma from rmap and truncate_pagecache before freeing
 * pgtables
 */
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
 
if (is_vm_hugetlb_page(vma)) {
@@ -629,7 +631,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
   && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
-- 
2.7.4

[PATCH v2 09/20] mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()

2017-08-17 Thread Laurent Dufour

migrate_misplaced_page() is only called during the page fault handling so
it's better to pass the pointer to the struct vm_fault instead of the vma.

This way during the speculative page fault path the saved vma->vm_flags
could be used.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/migrate.h | 4 ++--
 mm/memory.c | 2 +-
 mm/migrate.c| 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3e0d405dc842..65357105cbab 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -108,14 +108,14 @@ static inline void __ClearPageMovable(struct page *page)
 #ifdef CONFIG_NUMA_BALANCING
 extern bool pmd_trans_migrating(pmd_t pmd);
 extern int migrate_misplaced_page(struct page *page,
- struct vm_area_struct *vma, int node);
+ struct vm_fault *vmf, int node);
 #else
 static inline bool pmd_trans_migrating(pmd_t pmd)
 {
return false;
 }
 static inline int migrate_misplaced_page(struct page *page,
-struct vm_area_struct *vma, int node)
+struct vm_fault *vmf, int node)
 {
return -EAGAIN; /* can't migrate now */
 }
diff --git a/mm/memory.c b/mm/memory.c
index 68e4fdcce692..535282b3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3616,7 +3616,7 @@ static int do_numa_page(struct vm_fault *vmf)
}
 
/* Migrate to the requested node */
-   migrated = migrate_misplaced_page(page, vma, target_nid);
+   migrated = migrate_misplaced_page(page, vmf, target_nid);
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
diff --git a/mm/migrate.c b/mm/migrate.c
index d68a41da6abb..354f74f7dad3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1847,7 +1847,7 @@ bool pmd_trans_migrating(pmd_t pmd)
  * node. Caller is expected to have an elevated reference count on
  * the page that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
   int node)
 {
pg_data_t *pgdat = NODE_DATA(node);
@@ -1860,7 +1860,7 @@ int migrate_misplaced_page(struct page *page, struct 
vm_area_struct *vma,
 * with execute permissions as they are probably shared libraries.
 */
if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
-   (vma->vm_flags & VM_EXEC))
+   (vmf->vma_flags & VM_EXEC))
goto out;
 
/*
-- 
2.7.4

[PATCH v2 08/20] mm: Protect SPF handler against anon_vma changes

2017-08-17 Thread Laurent Dufour

The speculative page fault handler must be protected against anon_vma
changes. This is because page_add_new_anon_rmap() is called during the
speculative path.

In addition, don't try a speculative page fault if the VMA doesn't have an
anon_vma structure allocated, because its allocation should be
protected by the mmap_sem.

In __vma_adjust() when importer->anon_vma is set, there is no need to
protect against speculative page faults since speculative page fault
is aborted if the vma->anon_vma is not set.

When calling page_add_new_anon_rmap() vma->anon_vma is necessarily
valid since we checked for it when locking the pte and the anon_vma is
removed once the pte is unlocked. So even if the speculative page
fault handler is running concurrently with do_unmap(), as the pte is
locked in unmap_region() - through unmap_vmas() - and the anon_vma
unlinked later, because we check for the vma sequence counter which is
updated in unmap_page_range() before locking the pte, and then in
free_pgtables() so when locking the pte the change will be detected.

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index da3bd07bb052..68e4fdcce692 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -615,7 +615,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * Hide vma from rmap and truncate_pagecache before freeing
 * pgtables
 */
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
 
if (is_vm_hugetlb_page(vma)) {
@@ -629,7 +631,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
   && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
-- 
2.7.4

[PATCH v2 09/20] mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()

2017-08-17 Thread Laurent Dufour

migrate_misplaced_page() is only called during the page fault handling so
it's better to pass the pointer to the struct vm_fault instead of the vma.

This way during the speculative page fault path the saved vma->vm_flags
could be used.

Signed-off-by: Laurent Dufour 
---
 include/linux/migrate.h | 4 ++--
 mm/memory.c | 2 +-
 mm/migrate.c| 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3e0d405dc842..65357105cbab 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -108,14 +108,14 @@ static inline void __ClearPageMovable(struct page *page)
 #ifdef CONFIG_NUMA_BALANCING
 extern bool pmd_trans_migrating(pmd_t pmd);
 extern int migrate_misplaced_page(struct page *page,
- struct vm_area_struct *vma, int node);
+ struct vm_fault *vmf, int node);
 #else
 static inline bool pmd_trans_migrating(pmd_t pmd)
 {
return false;
 }
 static inline int migrate_misplaced_page(struct page *page,
-struct vm_area_struct *vma, int node)
+struct vm_fault *vmf, int node)
 {
return -EAGAIN; /* can't migrate now */
 }
diff --git a/mm/memory.c b/mm/memory.c
index 68e4fdcce692..535282b3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3616,7 +3616,7 @@ static int do_numa_page(struct vm_fault *vmf)
}
 
/* Migrate to the requested node */
-   migrated = migrate_misplaced_page(page, vma, target_nid);
+   migrated = migrate_misplaced_page(page, vmf, target_nid);
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
diff --git a/mm/migrate.c b/mm/migrate.c
index d68a41da6abb..354f74f7dad3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1847,7 +1847,7 @@ bool pmd_trans_migrating(pmd_t pmd)
  * node. Caller is expected to have an elevated reference count on
  * the page that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
   int node)
 {
pg_data_t *pgdat = NODE_DATA(node);
@@ -1860,7 +1860,7 @@ int migrate_misplaced_page(struct page *page, struct 
vm_area_struct *vma,
 * with execute permissions as they are probably shared libraries.
 */
if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
-   (vma->vm_flags & VM_EXEC))
+   (vmf->vma_flags & VM_EXEC))
goto out;
 
/*
-- 
2.7.4

[PATCH v2 12/20] mm: Introduce __vm_normal_page()

2017-08-17 Thread Laurent Dufour

When dealing with the speculative fault path we should use the VMA's field
cached value stored in the vm_fault structure.

Currently vm_normal_page() uses the pointer to the VMA to fetch the
vm_flags value. This patch provides a new __vm_normal_page() which
receives the vm_flags value as a parameter.

Note: The speculative path is turned on for architectures providing support
for the special PTE flag. So only the first block of vm_normal_page() is
used during the speculative path.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 mm/memory.c | 25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index ad7b6372d302..9f9e5bb7a556 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -820,8 +820,9 @@ static void print_bad_pte(struct vm_area_struct *vma, 
unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-   pte_t pte)
+static struct page *__vm_normal_page(struct vm_area_struct *vma,
+unsigned long addr,
+pte_t pte, unsigned long vma_flags)
 {
unsigned long pfn = pte_pfn(pte);
 
@@ -830,7 +831,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
-   if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+   if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
@@ -839,8 +840,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
 
/* !HAVE_PTE_SPECIAL case follows: */
 
-   if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-   if (vma->vm_flags & VM_MIXEDMAP) {
+   if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+   if (vma_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
@@ -849,7 +850,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
-   if (!is_cow_mapping(vma->vm_flags))
+   if (!is_cow_mapping(vma_flags))
return NULL;
}
}
@@ -870,6 +871,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
return pfn_to_page(pfn);
 }
 
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+   pte_t pte)
+{
+   return __vm_normal_page(vma, addr, pte, vma->vm_flags);
+}
+
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
@@ -2548,7 +2556,8 @@ static int do_wp_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
 
-   vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+   vmf->page = __vm_normal_page(vma, vmf->address, vmf->orig_pte,
+vmf->vma_flags);
if (!vmf->page) {
/*
 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -3575,7 +3584,7 @@ static int do_numa_page(struct vm_fault *vmf)
ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
 
-   page = vm_normal_page(vma, vmf->address, pte);
+   page = __vm_normal_page(vma, vmf->address, pte, vmf->vma_flags);
if (!page) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
-- 
2.7.4

[PATCH v2 12/20] mm: Introduce __vm_normal_page()

2017-08-17 Thread Laurent Dufour

When dealing with the speculative fault path we should use the VMA's field
cached value stored in the vm_fault structure.

Currently vm_normal_page() uses the pointer to the VMA to fetch the
vm_flags value. This patch provides a new __vm_normal_page() which
receives the vm_flags value as a parameter.

Note: The speculative path is turned on for architectures providing support
for the special PTE flag. So only the first block of vm_normal_page() is
used during the speculative path.

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index ad7b6372d302..9f9e5bb7a556 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -820,8 +820,9 @@ static void print_bad_pte(struct vm_area_struct *vma, 
unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-   pte_t pte)
+static struct page *__vm_normal_page(struct vm_area_struct *vma,
+unsigned long addr,
+pte_t pte, unsigned long vma_flags)
 {
unsigned long pfn = pte_pfn(pte);
 
@@ -830,7 +831,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
-   if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+   if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
@@ -839,8 +840,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
 
/* !HAVE_PTE_SPECIAL case follows: */
 
-   if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-   if (vma->vm_flags & VM_MIXEDMAP) {
+   if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+   if (vma_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
@@ -849,7 +850,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
-   if (!is_cow_mapping(vma->vm_flags))
+   if (!is_cow_mapping(vma_flags))
return NULL;
}
}
@@ -870,6 +871,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
return pfn_to_page(pfn);
 }
 
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+   pte_t pte)
+{
+   return __vm_normal_page(vma, addr, pte, vma->vm_flags);
+}
+
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
@@ -2548,7 +2556,8 @@ static int do_wp_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
 
-   vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+   vmf->page = __vm_normal_page(vma, vmf->address, vmf->orig_pte,
+vmf->vma_flags);
if (!vmf->page) {
/*
 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -3575,7 +3584,7 @@ static int do_numa_page(struct vm_fault *vmf)
ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
 
-   page = vm_normal_page(vma, vmf->address, pte);
+   page = __vm_normal_page(vma, vmf->address, pte, vmf->vma_flags);
if (!page) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
-- 
2.7.4

[PATCH v2 10/20] mm: Introduce __lru_cache_add_active_or_unevictable

2017-08-17 Thread Laurent Dufour

The speculative page fault handler which is run without holding the
mmap_sem is calling lru_cache_add_active_or_unevictable() but the vm_flags
is not guaranteed to remain constant.
Introduce __lru_cache_add_active_or_unevictable(), which takes the vma flags
value as a parameter instead of the vma pointer.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/swap.h | 11 +--
 mm/memory.c  |  8 
 mm/swap.c| 12 ++--
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d83d28e53e62..fdea932fe10f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -285,8 +285,15 @@ extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-   struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+   unsigned long vma_flags);
+
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+   struct vm_area_struct *vma)
+{
+   return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
+
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/mm/memory.c b/mm/memory.c
index 535282b3..c6b18cc87e90 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2370,7 +2370,7 @@ static int wp_page_copy(struct vm_fault *vmf)
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(new_page, vma);
+   __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
 * We call the notify macro here because, when using secondary
 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2840,7 +2840,7 @@ int do_swap_page(struct vm_fault *vmf)
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
}
 
swap_free(entry);
@@ -2978,7 +2978,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3230,7 +3230,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/swap.c b/mm/swap.c
index 60b1d2a75852..ece0826a205b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -470,21 +470,21 @@ void add_page_to_unevictable_list(struct page *page)
 }
 
 /**
- * lru_cache_add_active_or_unevictable
- * @page:  the page to be added to LRU
- * @vma:   vma in which page is mapped for determining reclaimability
+ * __lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma_flags:  flags of the vma in which page is mapped, for determining reclaimability
  *
  * Place @page on the active or unevictable LRU list, depending on its
  * evictability.  Note that if the page is not evictable, it goes
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+  unsigned long vma_flags)
 {
VM_BUG_ON_PAGE(PageLRU(page), page);
 
-   if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+   if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
SetPageActive(page);
lru_cache_add(page);
return;
-- 
2.7.4

[PATCH v2 10/20] mm: Introduce __lru_cache_add_active_or_unevictable

2017-08-17 Thread Laurent Dufour

The speculative page fault handler which is run without holding the
mmap_sem is calling lru_cache_add_active_or_unevictable() but the vm_flags
is not guaranteed to remain constant.
Introduce __lru_cache_add_active_or_unevictable(), which takes the vma flags
value as a parameter instead of the vma pointer.

Signed-off-by: Laurent Dufour 
---
 include/linux/swap.h | 11 +--
 mm/memory.c  |  8 
 mm/swap.c| 12 ++--
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d83d28e53e62..fdea932fe10f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -285,8 +285,15 @@ extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-   struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+   unsigned long vma_flags);
+
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+   struct vm_area_struct *vma)
+{
+   return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
+
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/mm/memory.c b/mm/memory.c
index 535282b3..c6b18cc87e90 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2370,7 +2370,7 @@ static int wp_page_copy(struct vm_fault *vmf)
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(new_page, vma);
+   __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
 * We call the notify macro here because, when using secondary
 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2840,7 +2840,7 @@ int do_swap_page(struct vm_fault *vmf)
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
}
 
swap_free(entry);
@@ -2978,7 +2978,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3230,7 +3230,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/swap.c b/mm/swap.c
index 60b1d2a75852..ece0826a205b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -470,21 +470,21 @@ void add_page_to_unevictable_list(struct page *page)
 }
 
 /**
- * lru_cache_add_active_or_unevictable
- * @page:  the page to be added to LRU
- * @vma:   vma in which page is mapped for determining reclaimability
+ * __lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma_flags:  flags of the vma in which page is mapped, for determining reclaimability
  *
  * Place @page on the active or unevictable LRU list, depending on its
  * evictability.  Note that if the page is not evictable, it goes
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+  unsigned long vma_flags)
 {
VM_BUG_ON_PAGE(PageLRU(page), page);
 
-   if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+   if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
SetPageActive(page);
lru_cache_add(page);
return;
-- 
2.7.4

[PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate if the state we started the fault with is still valid, if
not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
PTE and we're done.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Manage the newly introduced pte_spinlock() for speculative page
 fault to fail if the VMA is touched behind our back]
[Rename vma_is_dead() to vma_has_changed() and declare it here]
[Call p4d_alloc() as it is safe since pgd is valid]
[Call pud_alloc() as it is safe since p4d is valid]
[Set fe.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
[Fix check against huge page, calling pmd_trans_huge()]
[Introduce __HAVE_ARCH_CALL_SPF to declare the SPF handler only when
 architecture is supporting it]
[Use READ_ONCE() when reading VMA's fields in the speculative path]
[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support the
 processing done in vm_normal_page()]
[Check that vma->anon_vma is already set when starting the speculative
 path]
[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
 the processing done in mpol_misplaced()]
[Don't support VMA growing up or down]
[Move check on vm_sequence just before calling handle_pte_fault()]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h |   5 +
 include/linux/pagemap.h|   4 +-
 mm/internal.h  |  14 +++
 mm/memory.c| 237 -
 5 files changed, 254 insertions(+), 8 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index a4e7ca0f3585..6cfdfca4cc2a 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-   return !!(vma->vm_flags & VM_HUGETLB);
+   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0f4ddd72b172..0fe0811d304f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -315,6 +315,7 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations 
*/
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+   unsigned int sequence;
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1297,6 +1298,10 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+#ifdef __HAVE_ARCH_CALL_SPF
+extern int handle_speculative_fault(struct mm_struct *mm,
+   unsigned long address, unsigned int flags);
+#endif /* __HAVE_ARCH_CALL_SPF */
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 79b36f57c3ba..3a9735dfa6b6 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -443,8 +443,8 @@ static inline pgoff_t linear_page_index(struct 
vm_area_struct *vma,
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
-   pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
-   pgoff += vma->vm_pgoff;
+   pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+   pgoff += READ_ONCE(vma->vm_pgoff);
return pgoff;
 }
 
diff --git a/mm/internal.h b/mm/inte

[PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra 

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate if the state we started the fault with is still valid, if
not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
PTE and we're done.

Signed-off-by: Peter Zijlstra (Intel) 

[Manage the newly introduced pte_spinlock() for speculative page
 fault to fail if the VMA is touched behind our back]
[Rename vma_is_dead() to vma_has_changed() and declare it here]
[Call p4d_alloc() as it is safe since pgd is valid]
[Call pud_alloc() as it is safe since p4d is valid]
[Set fe.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
[Fix check against huge page, calling pmd_trans_huge()]
[Introduce __HAVE_ARCH_CALL_SPF to declare the SPF handler only when
 architecture is supporting it]
[Use READ_ONCE() when reading VMA's fields in the speculative path]
[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support the
 processing done in vm_normal_page()]
[Check that vma->anon_vma is already set when starting the speculative
 path]
[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
 the processing done in mpol_misplaced()]
[Don't support VMA growing up or down]
[Move check on vm_sequence just before calling handle_pte_fault()]
Signed-off-by: Laurent Dufour 
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h |   5 +
 include/linux/pagemap.h|   4 +-
 mm/internal.h  |  14 +++
 mm/memory.c| 237 -
 5 files changed, 254 insertions(+), 8 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index a4e7ca0f3585..6cfdfca4cc2a 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-   return !!(vma->vm_flags & VM_HUGETLB);
+   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0f4ddd72b172..0fe0811d304f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -315,6 +315,7 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations 
*/
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+   unsigned int sequence;
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1297,6 +1298,10 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+#ifdef __HAVE_ARCH_CALL_SPF
+extern int handle_speculative_fault(struct mm_struct *mm,
+   unsigned long address, unsigned int flags);
+#endif /* __HAVE_ARCH_CALL_SPF */
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 79b36f57c3ba..3a9735dfa6b6 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -443,8 +443,8 @@ static inline pgoff_t linear_page_index(struct 
vm_area_struct *vma,
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
-   pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
-   pgoff += vma->vm_pgoff;
+   pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+   pgoff += READ_ONCE(vma->vm_pgoff);
return pgoff;
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 736540f15936..9d6347e35747 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -45,6 +45

[PATCH v2 15/20] mm: Try spin lock in speculative path

2017-08-17 Thread Laurent Dufour

There is a deadlock when a CPU is doing a speculative page fault and
another one is calling do_munmap().

The deadlock occurs because the speculative path tries to spinlock the
pte while interrupts are disabled. Meanwhile the other CPU, in the
unmap path, has locked the pte and is waiting for all the CPUs to
invalidate the TLB. As the CPU doing the speculative fault has
interrupts disabled, it can't invalidate the TLB, and can't get the lock.

Since we are in a speculative path, we can race with other mm actions.
So let's assume that the lock may not be acquired, and fail the
speculative page fault in that case.

Here are the stacks captured during the deadlock:

CPU 0
native_flush_tlb_others+0x7c/0x260
flush_tlb_mm_range+0x6a/0x220
tlb_flush_mmu_tlbonly+0x63/0xc0
unmap_page_range+0x897/0x9d0
? unmap_single_vma+0x7d/0xe0
? release_pages+0x2b3/0x360
unmap_single_vma+0x7d/0xe0
unmap_vmas+0x51/0xa0
unmap_region+0xbd/0x130
do_munmap+0x279/0x460
SyS_munmap+0x53/0x70

CPU 1
do_raw_spin_lock+0x14e/0x160
_raw_spin_lock+0x5d/0x80
? pte_map_lock+0x169/0x1b0
pte_map_lock+0x169/0x1b0
handle_pte_fault+0xbf2/0xd80
? trace_hardirqs_on+0xd/0x10
handle_speculative_fault+0x272/0x280
handle_speculative_fault+0x5/0x280
__do_page_fault+0x187/0x580
trace_do_page_fault+0x52/0x260
do_async_page_fault+0x19/0x70
async_page_fault+0x28/0x30

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 mm/memory.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 0ba14a5797b2..8c701e4f59d3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2300,7 +2300,8 @@ static bool pte_spinlock(struct vm_fault *vmf)
goto out;
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (unlikely(!spin_trylock(vmf->ptl)))
+   goto out;
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
@@ -2336,8 +2337,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
if (vma_has_changed(vmf))
goto out;
 
-   pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, );
+   /*
+* Same as pte_offset_map_lock() except that we call
+* spin_trylock() in place of spin_lock() to avoid race with
+* unmap path which may have the lock and wait for this CPU
+* to invalidate TLB but this CPU has irq disabled.
+* Since we are in a speculative patch, accept it could fail
+*/
+   ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   pte = pte_offset_map(vmf->pmd, vmf->address);
+   if (unlikely(!spin_trylock(ptl))) {
+   pte_unmap(pte);
+   goto out;
+   }
+
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
goto out;
-- 
2.7.4

[PATCH v2 15/20] mm: Try spin lock in speculative path

2017-08-17 Thread Laurent Dufour

There is a deadlock when a CPU is doing a speculative page fault and
another one is calling do_munmap().

The deadlock occurs because the speculative path tries to spinlock the
pte while interrupts are disabled. Meanwhile the other CPU, in the
unmap path, has locked the pte and is waiting for all the CPUs to
invalidate the TLB. As the CPU doing the speculative fault has
interrupts disabled, it can't invalidate the TLB, and can't get the lock.

Since we are in a speculative path, we can race with other mm actions.
So let's assume that the lock may not be acquired, and fail the
speculative page fault in that case.

Here are the stacks captured during the deadlock:

CPU 0
native_flush_tlb_others+0x7c/0x260
flush_tlb_mm_range+0x6a/0x220
tlb_flush_mmu_tlbonly+0x63/0xc0
unmap_page_range+0x897/0x9d0
? unmap_single_vma+0x7d/0xe0
? release_pages+0x2b3/0x360
unmap_single_vma+0x7d/0xe0
unmap_vmas+0x51/0xa0
unmap_region+0xbd/0x130
do_munmap+0x279/0x460
SyS_munmap+0x53/0x70

CPU 1
do_raw_spin_lock+0x14e/0x160
_raw_spin_lock+0x5d/0x80
? pte_map_lock+0x169/0x1b0
pte_map_lock+0x169/0x1b0
handle_pte_fault+0xbf2/0xd80
? trace_hardirqs_on+0xd/0x10
handle_speculative_fault+0x272/0x280
handle_speculative_fault+0x5/0x280
__do_page_fault+0x187/0x580
trace_do_page_fault+0x52/0x260
do_async_page_fault+0x19/0x70
async_page_fault+0x28/0x30

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 0ba14a5797b2..8c701e4f59d3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2300,7 +2300,8 @@ static bool pte_spinlock(struct vm_fault *vmf)
goto out;
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (unlikely(!spin_trylock(vmf->ptl)))
+   goto out;
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
@@ -2336,8 +2337,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
if (vma_has_changed(vmf))
goto out;
 
-   pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, );
+   /*
+* Same as pte_offset_map_lock() except that we call
+* spin_trylock() in place of spin_lock() to avoid race with
+* unmap path which may have the lock and wait for this CPU
+* to invalidate TLB but this CPU has irq disabled.
+* Since we are in a speculative patch, accept it could fail
+*/
+   ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   pte = pte_offset_map(vmf->pmd, vmf->address);
+   if (unlikely(!spin_trylock(ptl))) {
+   pte_unmap(pte);
+   goto out;
+   }
+
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
goto out;
-- 
2.7.4

[PATCH v2 17/20] perf: Add a speculative page fault sw event

2017-08-17 Thread Laurent Dufour

Add a new software event to count successful speculative page faults.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..3043ec0988e9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -111,6 +111,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF_DONE  = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
-- 
2.7.4

[PATCH v2 17/20] perf: Add a speculative page fault sw event

2017-08-17 Thread Laurent Dufour

Add a new software event to count successful speculative page faults.

Signed-off-by: Laurent Dufour 
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..3043ec0988e9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -111,6 +111,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF_DONE  = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
-- 
2.7.4

[PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra <pet...@infradead.org>

Try a speculative fault before acquiring mmap_sem; if it returns with
VM_FAULT_RETRY, continue with the mmap_sem acquisition and do the
traditional fault.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>

[Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
 handle_speculative_fault()]
[Retry with the usual fault path when VM_FAULT_ERROR is returned by
 handle_speculative_fault(). This allows the signal to be delivered]
Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 arch/x86/include/asm/pgtable_types.h |  7 +++
 arch/x86/mm/fault.c  | 19 +++
 2 files changed, 26 insertions(+)

diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index bf9638e1ee42..4fd2693a037e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,6 +234,13 @@ enum page_cache_mode {
 #define PGD_IDENT_ATTR  0x001  /* PRESENT (no other attributes) */
 #endif
 
+/*
+ * Advertise that we call the Speculative Page Fault handler.
+ */
+#ifdef CONFIG_X86_64
+#define __HAVE_ARCH_CALL_SPF
+#endif
+
 #ifdef CONFIG_X86_32
 # include 
 #else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..4c070b9a4362 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef __HAVE_ARCH_CALL_SPF
+   if (error_code & PF_USER) {
+   fault = handle_speculative_fault(mm, address, flags);
+
+   /*
+* We also check against VM_FAULT_ERROR because we have to
+* raise a signal by calling later mm_fault_error() which
+* requires the vma pointer to be set. So in that case,
+* we fall through the normal path.
+*/
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
+ regs, address);
+   goto done;
+   }
+   }
+#endif /* __HAVE_ARCH_CALL_SPF */
+
/*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in
@@ -1474,6 +1492,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
return;
}
 
+done:
/*
 * Major/minor page fault accounting. If any of the events
 * returned VM_FAULT_MAJOR, we account it as a major fault.
-- 
2.7.4

[PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra 

Try a speculative fault before acquiring mmap_sem; if it returns with
VM_FAULT_RETRY, continue with the mmap_sem acquisition and do the
traditional fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
 handle_speculative_fault()]
[Retry with the usual fault path when VM_FAULT_ERROR is returned by
 handle_speculative_fault(). This allows the signal to be delivered]
Signed-off-by: Laurent Dufour 
---
 arch/x86/include/asm/pgtable_types.h |  7 +++
 arch/x86/mm/fault.c  | 19 +++
 2 files changed, 26 insertions(+)

diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index bf9638e1ee42..4fd2693a037e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,6 +234,13 @@ enum page_cache_mode {
 #define PGD_IDENT_ATTR  0x001  /* PRESENT (no other attributes) */
 #endif
 
+/*
+ * Advertise that we call the Speculative Page Fault handler.
+ */
+#ifdef CONFIG_X86_64
+#define __HAVE_ARCH_CALL_SPF
+#endif
+
 #ifdef CONFIG_X86_32
 # include 
 #else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..4c070b9a4362 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef __HAVE_ARCH_CALL_SPF
+   if (error_code & PF_USER) {
+   fault = handle_speculative_fault(mm, address, flags);
+
+   /*
+* We also check against VM_FAULT_ERROR because we have to
+* raise a signal by calling later mm_fault_error() which
+* requires the vma pointer to be set. So in that case,
+* we fall through the normal path.
+*/
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
+ regs, address);
+   goto done;
+   }
+   }
+#endif /* __HAVE_ARCH_CALL_SPF */
+
/*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in
@@ -1474,6 +1492,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
return;
}
 
+done:
/*
 * Major/minor page fault accounting. If any of the events
 * returned VM_FAULT_MAJOR, we account it as a major fault.
-- 
2.7.4

[PATCH v2 16/20] mm: Adding speculative page fault failure trace events

2017-08-17 Thread Laurent Dufour

This patch adds a set of new trace events to collect the speculative page
fault event failures.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/trace/events/pagefault.h | 87 
 mm/memory.c  | 68 ---
 2 files changed, 141 insertions(+), 14 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h
new file mode 100644
index ..d7d56f8102d1
--- /dev/null
+++ b/include/trace/events/pagefault.h
@@ -0,0 +1,87 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pagefault
+
+#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGEFAULT_H
+
+#include 
+#include 
+
+DECLARE_EVENT_CLASS(spf,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, caller)
+   __field(unsigned long, vm_start)
+   __field(unsigned long, vm_end)
+   __field(unsigned long, address)
+   ),
+
+   TP_fast_assign(
+   __entry->caller = caller;
+   __entry->vm_start   = vma->vm_start;
+   __entry->vm_end = vma->vm_end;
+   __entry->address= address;
+   ),
+
+   TP_printk("ip:%lx vma:%lu-%lx address:%lx",
+ __entry->caller, __entry->vm_start, __entry->vm_end,
+ __entry->address)
+);
+
+DEFINE_EVENT(spf, spf_pte_lock,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_changed,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_dead,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_noanon,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_notsup,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_access,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+#endif /* _TRACE_PAGEFAULT_H */
+
+/* This part must be outside protection */
+#include 
diff --git a/mm/memory.c b/mm/memory.c
index 8c701e4f59d3..549d23583f53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -79,6 +79,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include 
+
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for 
last_cpupid.
 #endif
@@ -2296,15 +2299,20 @@ static bool pte_spinlock(struct vm_fault *vmf)
}
 
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   if (unlikely(!spin_trylock(vmf->ptl)))
+   if (unlikely(!spin_trylock(vmf->ptl))) {
+   trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
@@ -2334,8 +2342,10 @@ static bool pte_map_lock(struct vm_fault *vmf)
 * block on the PTL and thus we're safe.
 */
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
/*
 * Same as pte_offset_map_lock() except that we call
@@ -2348,11 +2358,13 @@ static bool pte_map_lock(struct vm_fault *vmf)
pte = pte_offset_map(vmf->pmd, vmf->address);
if (unlikely(!spin_trylock(ptl))) {
pte_unmap(pte);
+   trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
@@ -3989,27 +4001,40 @@ int h

[PATCH v2 16/20] mm: Adding speculative page fault failure trace events

2017-08-17 Thread Laurent Dufour

This patch adds a set of new trace events to collect the speculative page
fault event failures.

Signed-off-by: Laurent Dufour 
---
 include/trace/events/pagefault.h | 87 
 mm/memory.c  | 68 ---
 2 files changed, 141 insertions(+), 14 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h
new file mode 100644
index ..d7d56f8102d1
--- /dev/null
+++ b/include/trace/events/pagefault.h
@@ -0,0 +1,87 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pagefault
+
+#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGEFAULT_H
+
+#include 
+#include 
+
+DECLARE_EVENT_CLASS(spf,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, caller)
+   __field(unsigned long, vm_start)
+   __field(unsigned long, vm_end)
+   __field(unsigned long, address)
+   ),
+
+   TP_fast_assign(
+   __entry->caller = caller;
+   __entry->vm_start   = vma->vm_start;
+   __entry->vm_end = vma->vm_end;
+   __entry->address= address;
+   ),
+
+   TP_printk("ip:%lx vma:%lu-%lx address:%lx",
+ __entry->caller, __entry->vm_start, __entry->vm_end,
+ __entry->address)
+);
+
+DEFINE_EVENT(spf, spf_pte_lock,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_changed,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_dead,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_noanon,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_notsup,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_access,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+#endif /* _TRACE_PAGEFAULT_H */
+
+/* This part must be outside protection */
+#include 
diff --git a/mm/memory.c b/mm/memory.c
index 8c701e4f59d3..549d23583f53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -79,6 +79,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include 
+
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for 
last_cpupid.
 #endif
@@ -2296,15 +2299,20 @@ static bool pte_spinlock(struct vm_fault *vmf)
}
 
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   if (unlikely(!spin_trylock(vmf->ptl)))
+   if (unlikely(!spin_trylock(vmf->ptl))) {
+   trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
@@ -2334,8 +2342,10 @@ static bool pte_map_lock(struct vm_fault *vmf)
 * block on the PTL and thus we're safe.
 */
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
/*
 * Same as pte_offset_map_lock() except that we call
@@ -2348,11 +2358,13 @@ static bool pte_map_lock(struct vm_fault *vmf)
pte = pte_offset_map(vmf->pmd, vmf->address);
if (unlikely(!spin_trylock(ptl))) {
pte_unmap(pte);
+   trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
@@ -3989,27 +4001,40 @@ int handle_specula

[PATCH v2 20/20] powerpc/mm: Add speculative page fault

2017-08-17 Thread Laurent Dufour

This patch enables the speculative page fault on the PowerPC
architecture.

This will try a speculative page fault without holding the mmap_sem;
if it returns with VM_FAULT_RETRY, the mmap_sem is acquired and the
traditional page fault processing is done.

Support is only provided for BOOK3S_64 currently because:
- it requires CONFIG_PPC_STD_MMU because of the checks done in
  set_access_flags_filter()
- it requires BOOK3S because we can't support book3e_hugetlb_preload()
  called by update_mmu_cache()

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
 arch/powerpc/mm/fault.c  | 30 +++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 818a58fc3f4f..897f8b9f67e6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -313,6 +313,11 @@ extern unsigned long pci_io_base;
 /* Advertise support for _PAGE_SPECIAL */
 #define __HAVE_ARCH_PTE_SPECIAL
 
+/* Advertise that we call the Speculative Page Fault handler */
+#if defined(CONFIG_PPC_BOOK3S_64)
+#define __HAVE_ARCH_CALL_SPF
+#endif
+
 #ifndef __ASSEMBLY__
 
 /*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4c422632047b..7b3cc4c30eab 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -291,9 +291,36 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
if (is_write && is_user)
store_update_sp = store_updates_sp(regs);
 
-   if (is_user)
+   if (is_user) {
flags |= FAULT_FLAG_USER;
 
+#if defined(__HAVE_ARCH_CALL_SPF)
+   /* let's try a speculative page fault without grabbing the
+* mmap_sem.
+*/
+
+   /*
+* flags is set later based on the VMA's flags, for the common
+* speculative service, we need some flags to be set.
+*/
+   if (is_write)
+   flags |= FAULT_FLAG_WRITE;
+
+   fault = handle_speculative_fault(mm, address, flags);
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
+ regs, address);
+   goto done;
+   }
+
+   /*
+* Resetting flags since the following code assumes
+* FAULT_FLAG_WRITE is not set.
+*/
+   flags &= ~FAULT_FLAG_WRITE;
+#endif /* defined(__HAVE_ARCH_CALL_SPF) */
+   }
+
/* When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -479,6 +506,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
rc = 0;
}
 
+done:
/*
 * Major/minor page fault accounting.
 */
-- 
2.7.4

[PATCH v2 18/20] perf tools: Add support for the SPF perf event

2017-08-17 Thread Laurent Dufour

Add support for the new speculative faults event.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 tools/include/uapi/linux/perf_event.h | 1 +
 tools/perf/util/evsel.c   | 1 +
 tools/perf/util/parse-events.c| 4 
 tools/perf/util/parse-events.l| 1 +
 tools/perf/util/python.c  | 1 +
 5 files changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index b1c0b187acfe..3043ec0988e9 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -111,6 +111,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF_DONE  = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 413f74df08de..660a7038198b 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -426,6 +426,7 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = {
"alignment-faults",
"emulation-faults",
"dummy",
+   "speculative-faults",
 };
 
 static const char *__perf_evsel__sw_name(u64 config)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 01e779b91c8e..ef8ef30d39c3 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -135,6 +135,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
.symbol = "bpf-output",
.alias  = "",
},
+   [PERF_COUNT_SW_SPF_DONE] = {
+   .symbol = "speculative-faults",
+   .alias  = "spf",
+   },
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 660fca05bc93..5cb78f004737 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -274,6 +274,7 @@ alignment-faults{ return 
sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_AL
 emulation-faults   { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
 dummy  { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 bpf-output { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
+speculative-faults|spf { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF_DONE); }
 
/*
 * We have to handle the kernel PMU event 
cycles-ct/cycles-t/mem-loads/mem-stores separately.
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index c129e99114ae..1ee06e47d9dc 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1141,6 +1141,7 @@ static struct {
PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS),
PERF_CONST(COUNT_SW_EMULATION_FAULTS),
PERF_CONST(COUNT_SW_DUMMY),
+   PERF_CONST(COUNT_SW_SPF_DONE),
 
PERF_CONST(SAMPLE_IP),
PERF_CONST(SAMPLE_TID),
-- 
2.7.4

[PATCH v2 20/20] powerpc/mm: Add speculative page fault

2017-08-17 Thread Laurent Dufour

This patch enables the speculative page fault on the PowerPC
architecture.

This will try a speculative page fault without holding the mmap_sem;
if it returns with VM_FAULT_RETRY, the mmap_sem is acquired and the
traditional page fault processing is done.

Support is only provided for BOOK3S_64 currently because:
- it requires CONFIG_PPC_STD_MMU because of the checks done in
  set_access_flags_filter()
- it requires BOOK3S because we can't support book3e_hugetlb_preload()
  called by update_mmu_cache()

Signed-off-by: Laurent Dufour 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
 arch/powerpc/mm/fault.c  | 30 +++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 818a58fc3f4f..897f8b9f67e6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -313,6 +313,11 @@ extern unsigned long pci_io_base;
 /* Advertise support for _PAGE_SPECIAL */
 #define __HAVE_ARCH_PTE_SPECIAL
 
+/* Advertise that we call the Speculative Page Fault handler */
+#if defined(CONFIG_PPC_BOOK3S_64)
+#define __HAVE_ARCH_CALL_SPF
+#endif
+
 #ifndef __ASSEMBLY__
 
 /*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4c422632047b..7b3cc4c30eab 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -291,9 +291,36 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
if (is_write && is_user)
store_update_sp = store_updates_sp(regs);
 
-   if (is_user)
+   if (is_user) {
flags |= FAULT_FLAG_USER;
 
+#if defined(__HAVE_ARCH_CALL_SPF)
+   /* let's try a speculative page fault without grabbing the
+* mmap_sem.
+*/
+
+   /*
+* flags is set later based on the VMA's flags, for the common
+* speculative service, we need some flags to be set.
+*/
+   if (is_write)
+   flags |= FAULT_FLAG_WRITE;
+
+   fault = handle_speculative_fault(mm, address, flags);
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
+ regs, address);
+   goto done;
+   }
+
+   /*
+* Resetting flags since the following code assumes
+* FAULT_FLAG_WRITE is not set.
+*/
+   flags &= ~FAULT_FLAG_WRITE;
+#endif /* defined(__HAVE_ARCH_CALL_SPF) */
+   }
+
/* When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -479,6 +506,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
rc = 0;
}
 
+done:
/*
 * Major/minor page fault accounting.
 */
-- 
2.7.4

[PATCH v2 18/20] perf tools: Add support for the SPF perf event

2017-08-17 Thread Laurent Dufour

Add support for the new speculative faults event.

Signed-off-by: Laurent Dufour 
---
 tools/include/uapi/linux/perf_event.h | 1 +
 tools/perf/util/evsel.c   | 1 +
 tools/perf/util/parse-events.c| 4 
 tools/perf/util/parse-events.l| 1 +
 tools/perf/util/python.c  | 1 +
 5 files changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index b1c0b187acfe..3043ec0988e9 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -111,6 +111,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF_DONE  = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 413f74df08de..660a7038198b 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -426,6 +426,7 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = {
"alignment-faults",
"emulation-faults",
"dummy",
+   "speculative-faults",
 };
 
 static const char *__perf_evsel__sw_name(u64 config)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 01e779b91c8e..ef8ef30d39c3 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -135,6 +135,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
.symbol = "bpf-output",
.alias  = "",
},
+   [PERF_COUNT_SW_SPF_DONE] = {
+   .symbol = "speculative-faults",
+   .alias  = "spf",
+   },
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 660fca05bc93..5cb78f004737 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -274,6 +274,7 @@ alignment-faults{ return 
sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_AL
 emulation-faults   { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
 dummy  { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 bpf-output { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
+speculative-faults|spf { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF_DONE); }
 
/*
 * We have to handle the kernel PMU event 
cycles-ct/cycles-t/mem-loads/mem-stores separately.
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index c129e99114ae..1ee06e47d9dc 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1141,6 +1141,7 @@ static struct {
PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS),
PERF_CONST(COUNT_SW_EMULATION_FAULTS),
PERF_CONST(COUNT_SW_DUMMY),
+   PERF_CONST(COUNT_SW_SPF_DONE),
 
PERF_CONST(SAMPLE_IP),
PERF_CONST(SAMPLE_TID),
-- 
2.7.4

[PATCH v2 07/20] mm: Cache some VMA fields in the vm_fault structure

2017-08-17 Thread Laurent Dufour

When handling a speculative page fault, the vma->vm_flags and
vma->vm_page_prot fields are read once the page table lock is released, so
there is no longer any guarantee that these fields will not change behind
our back. They will be saved in the vm_fault structure before the VMA is
checked for changes.

This patch also sets the fields in hugetlb_no_page() and
__collapse_huge_page_swapin() even if it is not needed by the callee.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm.h |  6 ++
 mm/hugetlb.c   |  2 ++
 mm/khugepaged.c|  2 ++
 mm/memory.c| 38 --
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8763ec96dc78..43d313ff3a5b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -345,6 +345,12 @@ struct vm_fault {
 * page table to avoid allocation from
 * atomic context.
 */
+   /*
+* These entries are required when handling speculative page fault.
+* This way the page handling is done using consistent field values.
+*/
+   unsigned long vma_flags;
+   pgprot_t vma_page_prot;
 };
 
 /* page entry size for vm->huge_fault() */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31e207cb399b..55201b98133e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3676,6 +3676,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
.vma = vma,
.address = address,
.flags = flags,
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
/*
 * Hard to debug if it ends up being
 * used by a callee that assumes
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 56dd994c05d0..0525a0e74535 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -881,6 +881,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct 
*mm,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
};
 
/* we only decide to swapin, if there is enough young ptes */
diff --git a/mm/memory.c b/mm/memory.c
index 4a2736fe2ef6..da3bd07bb052 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2417,7 +2417,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * Don't let another task, with possibly unlocked vma,
 * keep the mlocked page.
 */
-   if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+   if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
lock_page(old_page);/* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
@@ -2451,7 +2451,7 @@ static int wp_page_copy(struct vm_fault *vmf)
  */
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
-   WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+   WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
/*
@@ -2553,7 +2553,7 @@ static int do_wp_page(struct vm_fault *vmf)
 * We should not cow pages in a shared writeable mapping.
 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 */
-   if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
 (VM_WRITE|VM_SHARED))
return wp_pfn_shared(vmf);
 
@@ -2600,7 +2600,7 @@ static int do_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
unlock_page(vmf->page);
-   } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   } else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
@@ -2817,7 +2817,7 @@ int do_swap_page(struct vm_fault *vmf)
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
-   pte = mk_pte(page, vma->vm_page_prot);
+   pte = mk_pte(page, vmf->vma_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -2841,7 +284

[PATCH v2 07/20] mm: Cache some VMA fields in the vm_fault structure

2017-08-17 Thread Laurent Dufour

When handling a speculative page fault, the vma->vm_flags and
vma->vm_page_prot fields are read once the page table lock is released, so
there is no longer any guarantee that these fields will not change behind
our back. They will be saved in the vm_fault structure before the VMA is
checked for changes.

This patch also sets the fields in hugetlb_no_page() and
__collapse_huge_page_swapin() even if it is not needed by the callee.

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  6 ++
 mm/hugetlb.c   |  2 ++
 mm/khugepaged.c|  2 ++
 mm/memory.c| 38 --
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8763ec96dc78..43d313ff3a5b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -345,6 +345,12 @@ struct vm_fault {
 * page table to avoid allocation from
 * atomic context.
 */
+   /*
+* These entries are required when handling speculative page fault.
+* This way the page handling is done using consistent field values.
+*/
+   unsigned long vma_flags;
+   pgprot_t vma_page_prot;
 };
 
 /* page entry size for vm->huge_fault() */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31e207cb399b..55201b98133e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3676,6 +3676,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
.vma = vma,
.address = address,
.flags = flags,
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
/*
 * Hard to debug if it ends up being
 * used by a callee that assumes
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 56dd994c05d0..0525a0e74535 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -881,6 +881,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct 
*mm,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
};
 
/* we only decide to swapin, if there is enough young ptes */
diff --git a/mm/memory.c b/mm/memory.c
index 4a2736fe2ef6..da3bd07bb052 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2417,7 +2417,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * Don't let another task, with possibly unlocked vma,
 * keep the mlocked page.
 */
-   if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+   if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
lock_page(old_page);/* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
@@ -2451,7 +2451,7 @@ static int wp_page_copy(struct vm_fault *vmf)
  */
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
-   WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+   WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
/*
@@ -2553,7 +2553,7 @@ static int do_wp_page(struct vm_fault *vmf)
 * We should not cow pages in a shared writeable mapping.
 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 */
-   if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
 (VM_WRITE|VM_SHARED))
return wp_pfn_shared(vmf);
 
@@ -2600,7 +2600,7 @@ static int do_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
unlock_page(vmf->page);
-   } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   } else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
@@ -2817,7 +2817,7 @@ int do_swap_page(struct vm_fault *vmf)
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
-   pte = mk_pte(page, vma->vm_page_prot);
+   pte = mk_pte(page, vmf->vma_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -2841,7 +2841,7 @@ int do_swap_page(struct vm_

[PATCH v2 13/20] mm: Introduce __page_add_new_anon_rmap()

2017-08-17 Thread Laurent Dufour

When dealing with the speculative page fault handler, we may race with a
VMA being split or merged. In this case the vma->vm_start and vma->vm_end
fields may not match the address at which the page fault is occurring.

This can only happen when the VMA is split, but in that case the
anon_vma pointer of the new VMA will be the same as the original one,
because in __split_vma the new->anon_vma is set to src->anon_vma when
*new = *vma.

So even if the VMA boundaries are not correct, the anon_vma pointer is
still valid.

If the VMA has been merged, then the VMA into which it has been merged
must have the same anon_vma pointer, otherwise the merge can't be done.

So in all cases we know that the anon_vma is valid, since we have
checked before starting the speculative page fault that the anon_vma
pointer is valid for this VMA. Since there is an anon_vma, this
means that at some point a page has been backed, and that before the VMA
is cleaned up, the page table lock would have to be grabbed to clean the
PTE, and the anon_vma field is checked once the PTE is locked.

This patch introduces a new __page_add_new_anon_rmap() service which
doesn't check the VMA boundaries, and creates a new inline one
which does the check.

When called from a page fault handler, if this is not a speculative one,
there is a guarantee that vm_start and vm_end match the faulting address,
so this check is useless. In the context of the speculative page fault
handler, this check may be wrong but anon_vma is still valid as explained
above.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/rmap.h | 12 ++--
 mm/memory.c  |  8 
 mm/rmap.c|  5 ++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 43ef2c30cb0f..f5cd4dbc78b0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -170,8 +170,16 @@ void page_add_anon_rmap(struct page *, struct 
vm_area_struct *,
unsigned long, bool);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
   unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-   unsigned long, bool);
+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
+static inline void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address, bool compound)
+{
+   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+   __page_add_new_anon_rmap(page, vma, address, compound);
+}
+
 void page_add_file_rmap(struct page *, bool);
 void page_remove_rmap(struct page *, bool);
 
diff --git a/mm/memory.c b/mm/memory.c
index 9f9e5bb7a556..51bc8315281e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2376,7 +2376,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * thread doing COW.
 */
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
-   page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
__lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
@@ -2847,7 +2847,7 @@ int do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
}
@@ -2985,7 +2985,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
}
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
@@ -3237,7 +3237,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
/* copy-on-write page */
if (write && !(vmf->vma_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma

[PATCH v2 13/20] mm: Introduce __page_add_new_anon_rmap()

2017-08-17 Thread Laurent Dufour

When dealing with the speculative page fault handler, we may race with a
VMA being split or merged. In this case the vma->vm_start and vma->vm_end
fields may not match the address at which the page fault is occurring.

This can only happen when the VMA is split, but in that case the
anon_vma pointer of the new VMA will be the same as the original one,
because in __split_vma the new->anon_vma is set to src->anon_vma when
*new = *vma.

So even if the VMA boundaries are not correct, the anon_vma pointer is
still valid.

If the VMA has been merged, then the VMA into which it has been merged
must have the same anon_vma pointer, otherwise the merge can't be done.

So in all cases we know that the anon_vma is valid, since we have
checked before starting the speculative page fault that the anon_vma
pointer is valid for this VMA. Since there is an anon_vma, this
means that at some point a page has been backed, and that before the VMA
is cleaned up, the page table lock would have to be grabbed to clean the
PTE, and the anon_vma field is checked once the PTE is locked.

This patch introduces a new __page_add_new_anon_rmap() service which
doesn't check the VMA boundaries, and creates a new inline one
which does the check.

When called from a page fault handler, if this is not a speculative one,
there is a guarantee that vm_start and vm_end match the faulting address,
so this check is useless. In the context of the speculative page fault
handler, this check may be wrong but anon_vma is still valid as explained
above.

Signed-off-by: Laurent Dufour 
---
 include/linux/rmap.h | 12 ++--
 mm/memory.c  |  8 
 mm/rmap.c|  5 ++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 43ef2c30cb0f..f5cd4dbc78b0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -170,8 +170,16 @@ void page_add_anon_rmap(struct page *, struct 
vm_area_struct *,
unsigned long, bool);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
   unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-   unsigned long, bool);
+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
+static inline void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address, bool compound)
+{
+   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+   __page_add_new_anon_rmap(page, vma, address, compound);
+}
+
 void page_add_file_rmap(struct page *, bool);
 void page_remove_rmap(struct page *, bool);
 
diff --git a/mm/memory.c b/mm/memory.c
index 9f9e5bb7a556..51bc8315281e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2376,7 +2376,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * thread doing COW.
 */
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
-   page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
__lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
@@ -2847,7 +2847,7 @@ int do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
}
@@ -2985,7 +2985,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
}
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
@@ -3237,7 +3237,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
/* copy-on-write page */
if (write && !(vmf->vma_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
diff --gi

[PATCH v2 11/20] mm: Introduce __maybe_mkwrite()

2017-08-17 Thread Laurent Dufour

The current maybe_mkwrite() is getting passed the pointer to the vma
structure to fetch the vm_flags field.

When dealing with the speculative page fault handler, it will be better to
rely on the cached vm_flags value stored in the vm_fault structure.

This patch introduces a __maybe_mkwrite() service which can be called by
passing the value of the vm_flags field.

No functional change is expected for the other callers of
maybe_mkwrite().

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 include/linux/mm.h | 9 +++--
 mm/memory.c| 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43d313ff3a5b..0f4ddd72b172 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -668,13 +668,18 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t __maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-   if (likely(vma->vm_flags & VM_WRITE))
+   if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
 
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   return __maybe_mkwrite(pte, vma->vm_flags);
+}
+
 int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page);
 int finish_fault(struct vm_fault *vmf);
diff --git a/mm/memory.c b/mm/memory.c
index c6b18cc87e90..ad7b6372d302 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2269,7 +2269,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2359,8 +2359,8 @@ static int wp_page_copy(struct vm_fault *vmf)
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-   entry = mk_pte(new_page, vma->vm_page_prot);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = mk_pte(new_page, vmf->vma_page_prot);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
/*
 * Clear the pte entry and flush it first, before updating the
 * pte with the new entry. This will avoid a race condition
-- 
2.7.4

[PATCH v2 11/20] mm: Introduce __maybe_mkwrite()

2017-08-17 Thread Laurent Dufour

The current maybe_mkwrite() is getting passed the pointer to the vma
structure to fetch the vm_flags field.

When dealing with the speculative page fault handler, it will be better to
rely on the cached vm_flags value stored in the vm_fault structure.

This patch introduces a __maybe_mkwrite() service which can be called by
passing the value of the vm_flags field.

No functional change is expected for the other callers of
maybe_mkwrite().

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h | 9 +++--
 mm/memory.c| 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43d313ff3a5b..0f4ddd72b172 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -668,13 +668,18 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t __maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-   if (likely(vma->vm_flags & VM_WRITE))
+   if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
 
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   return __maybe_mkwrite(pte, vma->vm_flags);
+}
+
 int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page);
 int finish_fault(struct vm_fault *vmf);
diff --git a/mm/memory.c b/mm/memory.c
index c6b18cc87e90..ad7b6372d302 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2269,7 +2269,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2359,8 +2359,8 @@ static int wp_page_copy(struct vm_fault *vmf)
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-   entry = mk_pte(new_page, vma->vm_page_prot);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = mk_pte(new_page, vmf->vma_page_prot);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
/*
 * Clear the pte entry and flush it first, before updating the
 * pte with the new entry. This will avoid a race condition
-- 
2.7.4

[PATCH v2 05/20] mm: Protect VMA modifications using VMA sequence count

2017-08-17 Thread Laurent Dufour

The VMA sequence count has been introduced to allow fast detection of
VMA modification when running a page fault handler without holding
the mmap_sem.

This patch provides protection against the VMA modifications done in:
- madvise()
- mremap()
- mpol_rebind_policy()
- vma_replace_policy()
- change_prot_numa()
- mlock(), munlock()
- mprotect()
- mmap_region()
- collapse_huge_page()
- userfaultfd registration services

In addition, VMA fields which will be read during the speculative fault
path need to be written using WRITE_ONCE() to prevent writes from being
split and intermediate values from being pushed to other CPUs.

Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
---
 fs/proc/task_mmu.c |  5 -
 fs/userfaultfd.c   | 17 +
 mm/khugepaged.c|  3 +++
 mm/madvise.c   |  6 +-
 mm/mempolicy.c | 51 ++-
 mm/mlock.c | 13 -
 mm/mmap.c  | 17 ++---
 mm/mprotect.c  |  4 +++-
 mm/mremap.c|  7 +++
 9 files changed, 87 insertions(+), 36 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..e682179edaae 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1067,8 +1067,11 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
-   vma->vm_flags &= ~VM_SOFTDIRTY;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & 
~VM_SOFTDIRTY);
vma_set_page_prot(vma);
+   write_seqcount_end(&vma->vm_sequence);
}
downgrade_write(&mm->mmap_sem);
break;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b0d5897bc4e6..77b1e025c88e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -612,8 +612,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct 
list_head *fcs)
 
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+   write_seqcount_begin(&vma->vm_sequence);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-   vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING));
+   write_seqcount_end(&vma->vm_sequence);
return 0;
}
 
@@ -838,8 +841,10 @@ static int userfaultfd_release(struct inode *inode, struct 
file *file)
vma = prev;
else
prev = vma;
-   vma->vm_flags = new_flags;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   write_seqcount_end(&vma->vm_sequence);
}
up_write(&mm->mmap_sem);
mmput(mm);
@@ -1357,8 +1362,10 @@ static int userfaultfd_register(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
+   write_seqcount_end(&vma->vm_sequence);
 
skip:
prev = vma;
@@ -1515,8 +1522,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   write_seqcount_end(&vma->vm_sequence);
 
skip:
prev = vma;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c01f177a1120..56dd994c05d0 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1005,6 +1005,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (mm_find_pmd(mm, address) != pmd)
goto out;
 
+   write_seqcount_begin(&vma->vm_sequence);
anon_vma_lock_write(vma->anon_vma);
 
pte = pte_offset_map(pmd, address);
@

[PATCH v2 05/20] mm: Protect VMA modifications using VMA sequence count

2017-08-17 Thread Laurent Dufour

The VMA sequence count has been introduced to allow fast detection of
VMA modification when running a page fault handler without holding
the mmap_sem.

This patch provides protection against the VMA modifications done in:
- madvise()
- mremap()
- mpol_rebind_policy()
- vma_replace_policy()
- change_prot_numa()
- mlock(), munlock()
- mprotect()
- mmap_region()
- collapse_huge_page()
- userfaultfd registration services

In addition, VMA fields which will be read during the speculative fault
path need to be written using WRITE_ONCE() to prevent writes from being
split and intermediate values from being pushed to other CPUs.

Signed-off-by: Laurent Dufour 
---
 fs/proc/task_mmu.c |  5 -
 fs/userfaultfd.c   | 17 +
 mm/khugepaged.c|  3 +++
 mm/madvise.c   |  6 +-
 mm/mempolicy.c | 51 ++-
 mm/mlock.c | 13 -
 mm/mmap.c  | 17 ++---
 mm/mprotect.c  |  4 +++-
 mm/mremap.c|  7 +++
 9 files changed, 87 insertions(+), 36 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..e682179edaae 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1067,8 +1067,11 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
-   vma->vm_flags &= ~VM_SOFTDIRTY;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & 
~VM_SOFTDIRTY);
vma_set_page_prot(vma);
+   write_seqcount_end(&vma->vm_sequence);
}
downgrade_write(&mm->mmap_sem);
break;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b0d5897bc4e6..77b1e025c88e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -612,8 +612,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct 
list_head *fcs)
 
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+   write_seqcount_begin(&vma->vm_sequence);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-   vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING));
+   write_seqcount_end(&vma->vm_sequence);
return 0;
}
 
@@ -838,8 +841,10 @@ static int userfaultfd_release(struct inode *inode, struct 
file *file)
vma = prev;
else
prev = vma;
-   vma->vm_flags = new_flags;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   write_seqcount_end(&vma->vm_sequence);
}
up_write(&mm->mmap_sem);
mmput(mm);
@@ -1357,8 +1362,10 @@ static int userfaultfd_register(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
+   write_seqcount_end(&vma->vm_sequence);
 
skip:
prev = vma;
@@ -1515,8 +1522,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   write_seqcount_begin(&vma->vm_sequence);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   write_seqcount_end(&vma->vm_sequence);
 
skip:
prev = vma;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c01f177a1120..56dd994c05d0 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1005,6 +1005,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (mm_find_pmd(mm, address) != pmd)
goto out;
 
+   write_seqcount_begin(&vma->vm_sequence);
anon_vma_lock_write(vma->anon_vma);
 
pte = pte_offset_map(pmd, address);
@@ -1040,6 +1041,7 @@ static void collapse_hug

[PATCH v2 01/20] mm: Dont assume page-table invariance during faults

2017-08-17 Thread Laurent Dufour

From: Peter Zijlstra 

One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the reliance on the pte pointer.

Signed-off-by: Peter Zijlstra (Intel) 
---
 mm/memory.c | 27 ---
 1 file changed, 27 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index e158f7ac6730..36609c082256 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2131,30 +2131,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry which was
- * read non-atomically.  Before making any commitment, on those architectures
- * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
- * parts, do_swap_page must check under lock before unmapping the pte and
- * proceeding (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-   pte_t *page_table, pte_t orig_pte)
-{
-   int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-   if (sizeof(pte_t) > sizeof(unsigned long)) {
-   spinlock_t *ptl = pte_lockptr(mm, pmd);
-   spin_lock(ptl);
-   same = pte_same(*page_table, orig_pte);
-   spin_unlock(ptl);
-   }
-#endif
-   pte_unmap(page_table);
-   return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)
 {
debug_dma_assert_idle(src);
@@ -2711,9 +2687,6 @@ int do_swap_page(struct vm_fault *vmf)
int exclusive = 0;
int ret = 0;
 
-   if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
-   goto out;
-
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
-- 
2.7.4

< 4 5 6 7 8 9 10 11 12 13 >

801 - 900 of 1353 matches

Mail list logo