Re: [PATCH v10 18/25] mm: provide speculative fault infrastructure

2018-05-15 Thread Laurent Dufour
On 15/05/2018 15:09, vinayak menon wrote:
> On Tue, Apr 17, 2018 at 8:03 PM, Laurent Dufour
>  wrote:
>>
>> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
>> +
>> +#ifndef __HAVE_ARCH_PTE_SPECIAL
>> +/* This is required by vm_normal_page() */
>> +#error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL"
>> +#endif
>> +
>> +/*
>> + * vm_normal_page() adds some processing which should be done while
>> + * holding the mmap_sem.
>> + */
>> +int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
>> +  unsigned int flags)
>> +{
>> +   struct vm_fault vmf = {
>> +   .address = address,
>> +   };
>> +   pgd_t *pgd, pgdval;
>> +   p4d_t *p4d, p4dval;
>> +   pud_t pudval;
>> +   int seq, ret = VM_FAULT_RETRY;
>> +   struct vm_area_struct *vma;
>> +
>> +   /* Clear flags that may lead to releasing the mmap_sem to retry */
>> +   flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
>> +   flags |= FAULT_FLAG_SPECULATIVE;
>> +
>> +   vma = get_vma(mm, address);
>> +   if (!vma)
>> +   return ret;
>> +
>> +   seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock, vma_rb_erase() */
>> +   if (seq & 1)
>> +   goto out_put;
>> +
>> +   /*
>> +* Can't call vm_ops services as we don't know what they would do
>> +* with the VMA.
>> +* This includes huge pages from hugetlbfs.
>> +*/
>> +   if (vma->vm_ops)
>> +   goto out_put;
>> +
>> +   /*
>> +* __anon_vma_prepare() requires the mmap_sem to be held
>> +* because vm_next and vm_prev must be safe. This can't be guaranteed
>> +* in the speculative path.
>> +*/
>> +   if (unlikely(!vma->anon_vma))
>> +   goto out_put;
>> +
>> +   vmf.vma_flags = READ_ONCE(vma->vm_flags);
>> +   vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
>> +
>> +   /* Can't call userland page fault handler in the speculative path */
>> +   if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
>> +   goto out_put;
>> +
>> +   if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
>> +   /*
>> +* This could be detected by checking the address against the
>> +* VMA's boundaries, but we want to trace it as not supported
>> +* instead of changed.
>> +*/
>> +   goto out_put;
>> +
>> +   if (address < READ_ONCE(vma->vm_start)
>> +   || READ_ONCE(vma->vm_end) <= address)
>> +   goto out_put;
>> +
>> +   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>> +  flags & FAULT_FLAG_INSTRUCTION,
>> +  flags & FAULT_FLAG_REMOTE)) {
>> +   ret = VM_FAULT_SIGSEGV;
>> +   goto out_put;
>> +   }
>> +
>> +   /* This one is required to check that the VMA has write access set */
>> +   if (flags & FAULT_FLAG_WRITE) {
>> +   if (unlikely(!(vmf.vma_flags & VM_WRITE))) {
>> +   ret = VM_FAULT_SIGSEGV;
>> +   goto out_put;
>> +   }
>> +   } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
>> +   ret = VM_FAULT_SIGSEGV;
>> +   goto out_put;
>> +   }
>> +
>> +   if (IS_ENABLED(CONFIG_NUMA)) {
>> +   struct mempolicy *pol;
>> +
>> +   /*
>> +* MPOL_INTERLEAVE implies additional checks in
>> +* mpol_misplaced() which are not compatible with the
>> +* speculative page fault processing.
>> +*/
>> +   pol = __get_vma_policy(vma, address);
> 
> 
> This gives a compile-time error when CONFIG_NUMA is disabled, as there
> is no definition for __get_vma_policy.

IS_ENABLED() is not working as I expected, my mistake: the branch is still
compiled and only eliminated afterwards, so __get_vma_policy() must at least
be declared, which it isn't when CONFIG_NUMA is disabled.
I'll roll back to the legacy #ifdef stuff (a minimal sketch follows at the
end of this message).

Thanks,
Laurent.


>> +   if (!pol)
>> +   pol = get_task_policy(current);
>> +   if (pol && pol->mode == MPOL_INTERLEAVE)
>> +   goto out_put;
>> +   }
>> +
>> +   /*
>> +* Do a speculative lookup of the PTE entry.
>> +*/
>> +   local_irq_disable();
>> +   pgd = pgd_offset(mm, address);
>> +   pgdval = READ_ONCE(*pgd);
>> +   if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
>> +   goto out_walk;
>> +
>> +   p4d = p4d_offset(pgd, address);
>> +   p4dval = READ_ONCE(*p4d);
>> +   if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
>> +   goto out_walk;
>> +
>> +   vmf.pud = pud_offset(p4d, address);
>> +   pudval = READ_ONCE(*vmf.pud);
>> +   if (pud_none(pudval) || unlikely(pud_bad(pudval)))
>> +   goto out_walk;
>> +
>> +   /* Huge pages at PUD level are not supported. */

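For reference, a minimal sketch of that #ifdef fallback — one plausible
shape only, since the respun patch is not shown here; it reuses the
__get_vma_policy() and get_task_policy() helpers from the quoted hunk and
lets the preprocessor drop the whole block when CONFIG_NUMA is off:

#ifdef CONFIG_NUMA
	{
		struct mempolicy *pol;

		/*
		 * MPOL_INTERLEAVE implies additional checks in
		 * mpol_misplaced() which are not compatible with the
		 * speculative page fault processing.
		 */
		pol = __get_vma_policy(vma, address);
		if (!pol)
			pol = get_task_policy(current);
		if (pol && pol->mode == MPOL_INTERLEAVE)
			goto out_put;
	}
#endif

Unlike IS_ENABLED(), the preprocessor removes the block before compilation,
so the undeclared __get_vma_policy() symbol is never seen on !CONFIG_NUMA
builds.
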
Re: [PATCH v10 18/25] mm: provide speculative fault infrastructure

2018-05-15 Thread vinayak menon
On Tue, Apr 17, 2018 at 8:03 PM, Laurent Dufour
 wrote:
>
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +
> +#ifndef __HAVE_ARCH_PTE_SPECIAL
> +/* This is required by vm_normal_page() */
> +#error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL"
> +#endif
> +
> +/*
> + * vm_normal_page() adds some processing which should be done while
> + * holding the mmap_sem.
> + */
> +int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
> +  unsigned int flags)
> +{
> +   struct vm_fault vmf = {
> +   .address = address,
> +   };
> +   pgd_t *pgd, pgdval;
> +   p4d_t *p4d, p4dval;
> +   pud_t pudval;
> +   int seq, ret = VM_FAULT_RETRY;
> +   struct vm_area_struct *vma;
> +
> +   /* Clear flags that may lead to releasing the mmap_sem to retry */
> +   flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
> +   flags |= FAULT_FLAG_SPECULATIVE;
> +
> +   vma = get_vma(mm, address);
> +   if (!vma)
> +   return ret;
> +
> +   seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock, vma_rb_erase() */
> +   if (seq & 1)
> +   goto out_put;
> +
> +   /*
> +* Can't call vm_ops services as we don't know what they would do
> +* with the VMA.
> +* This includes huge pages from hugetlbfs.
> +*/
> +   if (vma->vm_ops)
> +   goto out_put;
> +
> +   /*
> +* __anon_vma_prepare() requires the mmap_sem to be held
> +* because vm_next and vm_prev must be safe. This can't be guaranteed
> +* in the speculative path.
> +*/
> +   if (unlikely(!vma->anon_vma))
> +   goto out_put;
> +
> +   vmf.vma_flags = READ_ONCE(vma->vm_flags);
> +   vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
> +
> +   /* Can't call userland page fault handler in the speculative path */
> +   if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
> +   goto out_put;
> +
> +   if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
> +   /*
> +* This could be detected by checking the address against the
> +* VMA's boundaries, but we want to trace it as not supported
> +* instead of changed.
> +*/
> +   goto out_put;
> +
> +   if (address < READ_ONCE(vma->vm_start)
> +   || READ_ONCE(vma->vm_end) <= address)
> +   goto out_put;
> +
> +   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
> +  flags & FAULT_FLAG_INSTRUCTION,
> +  flags & FAULT_FLAG_REMOTE)) {
> +   ret = VM_FAULT_SIGSEGV;
> +   goto out_put;
> +   }
> +
> +   /* This one is required to check that the VMA has write access set */
> +   if (flags & FAULT_FLAG_WRITE) {
> +   if (unlikely(!(vmf.vma_flags & VM_WRITE))) {
> +   ret = VM_FAULT_SIGSEGV;
> +   goto out_put;
> +   }
> +   } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
> +   ret = VM_FAULT_SIGSEGV;
> +   goto out_put;
> +   }
> +
> +   if (IS_ENABLED(CONFIG_NUMA)) {
> +   struct mempolicy *pol;
> +
> +   /*
> +* MPOL_INTERLEAVE implies additional checks in
> +* mpol_misplaced() which are not compatible with the
> +* speculative page fault processing.
> +*/
> +   pol = __get_vma_policy(vma, address);


This gives a compile-time error when CONFIG_NUMA is disabled, as there
is no definition for __get_vma_policy.


> +   if (!pol)
> +   pol = get_task_policy(current);
> +   if (pol && pol->mode == MPOL_INTERLEAVE)
> +   goto out_put;
> +   }
> +
> +   /*
> +* Do a speculative lookup of the PTE entry.
> +*/
> +   local_irq_disable();
> +   pgd = pgd_offset(mm, address);
> +   pgdval = READ_ONCE(*pgd);
> +   if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
> +   goto out_walk;
> +
> +   p4d = p4d_offset(pgd, address);
> +   p4dval = READ_ONCE(*p4d);
> +   if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
> +   goto out_walk;
> +
> +   vmf.pud = pud_offset(p4d, address);
> +   pudval = READ_ONCE(*vmf.pud);
> +   if (pud_none(pudval) || unlikely(pud_bad(pudval)))
> +   goto out_walk;
> +
> +   /* Huge pages at PUD level are not supported. */
> +   if (unlikely(pud_trans_huge(pudval)))
> +   goto out_walk;
> +
> +   vmf.pmd = pmd_offset(vmf.pud, address);
> +   vmf.orig_pmd = READ_ONCE(*vmf.pmd);
> +   /*
> +* pmd_none could mean that a hugepage collapse is in progress
> +* in our 

[PATCH v10 18/25] mm: provide speculative fault infrastructure

2018-04-17 Thread Laurent Dufour
From: Peter Zijlstra 

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate that the state we started the fault with is still valid; if
not, we fail the fault with VM_FAULT_RETRY, otherwise we update the
PTE and we're done.
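
That revalidation is essentially a seqcount retry check. As a minimal
sketch of the pattern — assuming the per-VMA vm_sequence seqcount and the
vmf->sequence snapshot this series introduces; the helper is called
vma_has_changed() in the notes below, but this is an illustration, not the
verbatim hunk:

static inline bool vma_has_changed(struct vm_fault *vmf)
{
	/* vma_rb_erase() empties the rb node when the VMA is unlinked */
	int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
	unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);

	/*
	 * Pairs with the write barriers in write_seqcount_begin()/end()
	 * and in vma_rb_erase(), so the two reads above are not
	 * reordered against the writer.
	 */
	smp_rmb();

	return ret || seq != vmf->sequence;
}

When this returns true, the speculative path bails out with VM_FAULT_RETRY
and the fault is retried under the mmap_sem.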

Signed-off-by: Peter Zijlstra (Intel) 

[Manage the newly introduced pte_spinlock() for speculative page
 fault to fail if the VMA is touched behind our back]
[Rename vma_is_dead() to vma_has_changed() and declare it here]
[Fetch p4d and pud]
[Set vmf.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
[Fix check against huge page, calling pmd_trans_huge()]
[Use READ_ONCE() when reading VMA's fields in the speculative path]
[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support the
 processing done in vm_normal_page()]
[Check that vma->anon_vma is already set when starting the speculative
 path]
[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
 the processing done in mpol_misplaced()]
[Don't support VMA growing up or down]
[Move check on vm_sequence just before calling handle_pte_fault()]
[Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT]
[Add mem cgroup oom check]
[Use READ_ONCE to access p*d entries]
[Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
[Don't fetch pte again in handle_pte_fault() when running the speculative
 path]
[Check PMD against concurrent collapsing operation]
[Try to spin lock the pte during the speculative path to avoid deadlocking
 with other CPUs invalidating the TLB and requiring this CPU to catch the
 inter-processor interrupt]
[Move define of FAULT_FLAG_SPECULATIVE here]
[Introduce __handle_speculative_fault() and add a check against
 mm->mm_users in handle_speculative_fault() defined in mm.h]
Signed-off-by: Laurent Dufour 
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h             |  30
 include/linux/pagemap.h        |   4 +-
 mm/internal.h                  |  16 +-
 mm/memory.c                    | 340 -
 5 files changed, 385 insertions(+), 7 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d9..9e25283d6fc9 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -8,7 +8,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-   return !!(vma->vm_flags & VM_HUGETLB);
+   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e2c24ea58d94..08540c98d63b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -309,6 +309,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER         0x40   /* The fault originated in userspace */
 #define FAULT_FLAG_REMOTE       0x80   /* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction fetch */
+#define FAULT_FLAG_SPECULATIVE  0x200  /* Speculative fault, not holding mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
@@ -337,6 +338,10 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   unsigned int sequence;
+   pmd_t orig_pmd; /* value of PMD at the time of fault */
+#endif
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1373,6 +1378,31 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+
+#ifdef
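
For context, the mm->mm_users check mentioned in the changelog — added to
the handle_speculative_fault() wrapper defined in mm.h — could look roughly
like the sketch below (an illustration of the described check, not the
verbatim hunk):

static inline int handle_speculative_fault(struct mm_struct *mm,
					   unsigned long address,
					   unsigned int flags)
{
	/*
	 * A single-user mm has no other thread that could change the
	 * address space behind us, so skip the speculative machinery
	 * and let the caller fall back to the classic path.
	 */
	if (unlikely(atomic_read(&mm->mm_users) == 1))
		return VM_FAULT_RETRY;
	return __handle_speculative_fault(mm, address, flags);
}

Returning VM_FAULT_RETRY here tells the arch fault handler to retry under
the mmap_sem, mirroring the failure convention used throughout the
speculative path.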