Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE

2018-04-04 Thread Laurent Dufour


On 03/04/2018 23:57, David Rientjes wrote:
> On Wed, 28 Mar 2018, Laurent Dufour wrote:
> 
 diff --git a/include/linux/mm.h b/include/linux/mm.h
 index 4d02524a7998..2f3e98edc94a 100644
 --- a/include/linux/mm.h
 +++ b/include/linux/mm.h
 @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER         0x40    /* The fault originated in userspace */
 #define FAULT_FLAG_REMOTE       0x80    /* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100   /* The fault was during an instruction fetch */
 +#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding mmap_sem */
  
  #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
>>>
>>> I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that 
>>> actually uses it.
>>
>> I think you're right, I'll move this define down in the series.
>>
 diff --git a/mm/memory.c b/mm/memory.c
 index e0ae4999c824..8ac241b9f370 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
 @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, 
 unsigned long addr,
  }
  EXPORT_SYMBOL_GPL(apply_to_page_range);
  
 +static bool pte_map_lock(struct vm_fault *vmf)
>>>
>>> inline?
>>
>> Agreed.
>>
> 
> Ignore this, the final form of the function after the full patchset 
> shouldn't be inline.

Indeed, I only kept inlined the small pte_map_lock() and, later in the series,
pte_spinlock(), which are defined when CONFIG_SPECULATIVE_PAGE_FAULT is not set.
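
The !CONFIG_SPECULATIVE_PAGE_FAULT variants mentioned above would presumably be
small always-succeeding stubs along these lines (a sketch based on the
description here, not the exact hunk from the series):

#ifndef CONFIG_SPECULATIVE_PAGE_FAULT
/* Without speculative faults, taking the PTL cannot fail. */
static inline bool pte_spinlock(struct vm_fault *vmf)
{
	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	return true;
}

static inline bool pte_map_lock(struct vm_fault *vmf)
{
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	return true;
}
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */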

 +{
 +  vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
 +                                vmf->address, &vmf->ptl);
 +  return true;
 +}
 +
  /*
   * handle_pte_fault chooses page fault handler according to an entry 
 which was
   * read non-atomically.  Before making any commitment, on those 
 architectures
 @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
 +  int ret = VM_FAULT_OOM;
  
if (unlikely(anon_vma_prepare(vma)))
goto oom;
 @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
 -  vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
 +  if (!pte_map_lock(vmf)) {
 +  mem_cgroup_cancel_charge(new_page, memcg, false);
 +  ret = VM_FAULT_RETRY;
 +  goto oom_free_new;
 +  }
>>>
>>> Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes 
>>> sense for return values other than VM_FAULT_OOM?
>>
>> You're right, this label name is no longer correct. I'll rename it to
>> "out_free_new" and also rename the label "oom" to "out", since it is now
>> generic too.
>>
> 
> I think it would just be better to introduce an out_uncharge label that
> handles the mem_cgroup_cancel_charge() in the exit path.

Yes, adding an out_uncharge label sounds good too. I'll add it and also rename
the oom_* labels to out_*.

> 
> diff --git a/mm/memory.c b/mm/memory.c
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2645,9 +2645,8 @@ static int wp_page_copy(struct vm_fault *vmf)
>* Re-check the pte - we dropped the lock
>*/
>   if (!pte_map_lock(vmf)) {
> - mem_cgroup_cancel_charge(new_page, memcg, false);
>   ret = VM_FAULT_RETRY;
> - goto oom_free_new;
> + goto out_uncharge;
>   }
>   if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>   if (old_page) {
> @@ -2735,6 +2734,8 @@ static int wp_page_copy(struct vm_fault *vmf)
>   put_page(old_page);
>   }
>   return page_copied ? VM_FAULT_WRITE : 0;
> +out_uncharge:
> + mem_cgroup_cancel_charge(new_page, memcg, false);
>  oom_free_new:
>   put_page(new_page);
>  oom:
> 
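
Combining that out_uncharge label with the oom_* -> out_* renames mentioned
above, the tail of wp_page_copy() would presumably end up looking roughly like
this (a sketch of the expected result, not the final merged hunk):

	return page_copied ? VM_FAULT_WRITE : 0;
out_uncharge:
	mem_cgroup_cancel_charge(new_page, memcg, false);
out_free_new:
	put_page(new_page);
out:
	if (old_page)
		put_page(old_page);
	return ret;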



Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE

2018-04-03 Thread David Rientjes
On Wed, 28 Mar 2018, Laurent Dufour wrote:

> >> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >> index 4d02524a7998..2f3e98edc94a 100644
> >> --- a/include/linux/mm.h
> >> +++ b/include/linux/mm.h
> >> @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16];
> >>  #define FAULT_FLAG_USER         0x40    /* The fault originated in userspace */
> >>  #define FAULT_FLAG_REMOTE       0x80    /* faulting for non current tsk/mm */
> >>  #define FAULT_FLAG_INSTRUCTION  0x100   /* The fault was during an instruction fetch */
> >> +#define FAULT_FLAG_SPECULATIVE  0x200   /* Speculative fault, not holding mmap_sem */
> >>  
> >>  #define FAULT_FLAG_TRACE \
> >>{ FAULT_FLAG_WRITE, "WRITE" }, \
> > 
> > I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that 
> > actually uses it.
> 
> I think you're right, I'll move this define down in the series.
> 
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index e0ae4999c824..8ac241b9f370 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, 
> >> unsigned long addr,
> >>  }
> >>  EXPORT_SYMBOL_GPL(apply_to_page_range);
> >>  
> >> +static bool pte_map_lock(struct vm_fault *vmf)
> > 
> > inline?
> 
> Agreed.
> 

Ignore this, the final form of the function after the full patchset 
shouldn't be inline.

> >> +{
> >> +  vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> >> +                               vmf->address, &vmf->ptl);
> >> +  return true;
> >> +}
> >> +
> >>  /*
> >>   * handle_pte_fault chooses page fault handler according to an entry 
> >> which was
> >>   * read non-atomically.  Before making any commitment, on those 
> >> architectures
> >> @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf)
> >>const unsigned long mmun_start = vmf->address & PAGE_MASK;
> >>const unsigned long mmun_end = mmun_start + PAGE_SIZE;
> >>struct mem_cgroup *memcg;
> >> +  int ret = VM_FAULT_OOM;
> >>  
> >>if (unlikely(anon_vma_prepare(vma)))
> >>goto oom;
> >> @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf)
> >>/*
> >> * Re-check the pte - we dropped the lock
> >> */
> >> -  vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
> >> +  if (!pte_map_lock(vmf)) {
> >> +  mem_cgroup_cancel_charge(new_page, memcg, false);
> >> +  ret = VM_FAULT_RETRY;
> >> +  goto oom_free_new;
> >> +  }
> > 
> > Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes 
> > sense for return values other than VM_FAULT_OOM?
> 
> You're right, this label name is no longer correct. I'll rename it to
> "out_free_new" and also rename the label "oom" to "out", since it is now
> generic too.
> 

I think it would just be better to introduce an out_uncharge label that
handles the mem_cgroup_cancel_charge() in the exit path.

diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2645,9 +2645,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 * Re-check the pte - we dropped the lock
 */
if (!pte_map_lock(vmf)) {
-   mem_cgroup_cancel_charge(new_page, memcg, false);
ret = VM_FAULT_RETRY;
-   goto oom_free_new;
+   goto out_uncharge;
}
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
@@ -2735,6 +2734,8 @@ static int wp_page_copy(struct vm_fault *vmf)
put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
+out_uncharge:
+   mem_cgroup_cancel_charge(new_page, memcg, false);
 oom_free_new:
put_page(new_page);
 oom:
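
Note that out_uncharge deliberately falls through to oom_free_new and then to
oom, so the retry path still drops the new_page reference, and the old_page
reference when there is one, after cancelling the memcg charge.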


Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE

2018-03-28 Thread Laurent Dufour
On 25/03/2018 23:50, David Rientjes wrote:
> On Tue, 13 Mar 2018, Laurent Dufour wrote:
> 
>> From: Peter Zijlstra 
>>
>> When speculating faults (without holding mmap_sem) we need to validate
>> that the vma against which we loaded pages is still valid when we're
>> ready to install the new PTE.
>>
>> Therefore, replace the pte_offset_map_lock() calls that (re)take the
>> PTL with pte_map_lock() which can fail in case we find the VMA changed
>> since we started the fault.
>>
> 
> Based on how it's used, I would have suspected this to be named 
> pte_map_trylock().
> 
>> Signed-off-by: Peter Zijlstra (Intel) 
>>
>> [Port to 4.12 kernel]
>> [Remove the comment about the fault_env structure which has been
>>  implemented as the vm_fault structure in the kernel]
>> [move pte_map_lock()'s definition upper in the file]
>> Signed-off-by: Laurent Dufour 
>> ---
>>  include/linux/mm.h |  1 +
>>  mm/memory.c| 56 
>> ++
>>  2 files changed, 41 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 4d02524a7998..2f3e98edc94a 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16];
>>  #define FAULT_FLAG_USER         0x40    /* The fault originated in userspace */
>>  #define FAULT_FLAG_REMOTE       0x80    /* faulting for non current tsk/mm */
>>  #define FAULT_FLAG_INSTRUCTION  0x100   /* The fault was during an instruction fetch */
>> +#define FAULT_FLAG_SPECULATIVE  0x200   /* Speculative fault, not holding mmap_sem */
>>  
>>  #define FAULT_FLAG_TRACE \
>>  { FAULT_FLAG_WRITE, "WRITE" }, \
> 
> I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that 
> actually uses it.

I think you're right, I'll move this define down in the series.

>> diff --git a/mm/memory.c b/mm/memory.c
>> index e0ae4999c824..8ac241b9f370 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, 
>> unsigned long addr,
>>  }
>>  EXPORT_SYMBOL_GPL(apply_to_page_range);
>>  
>> +static bool pte_map_lock(struct vm_fault *vmf)
> 
> inline?

Agreed.

>> +{
>> +vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
>> +                               vmf->address, &vmf->ptl);
>> +return true;
>> +}
>> +
>>  /*
>>   * handle_pte_fault chooses page fault handler according to an entry which 
>> was
>>   * read non-atomically.  Before making any commitment, on those 
>> architectures
>> @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf)
>>  const unsigned long mmun_start = vmf->address & PAGE_MASK;
>>  const unsigned long mmun_end = mmun_start + PAGE_SIZE;
>>  struct mem_cgroup *memcg;
>> +int ret = VM_FAULT_OOM;
>>  
>>  if (unlikely(anon_vma_prepare(vma)))
>>  goto oom;
>> @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf)
>>  /*
>>   * Re-check the pte - we dropped the lock
>>   */
>> -vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
>> +if (!pte_map_lock(vmf)) {
>> +mem_cgroup_cancel_charge(new_page, memcg, false);
>> +ret = VM_FAULT_RETRY;
>> +goto oom_free_new;
>> +}
> 
> Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes 
> sense for return values other than VM_FAULT_OOM?

You're right, this label name is no longer correct. I'll rename it to
"out_free_new" and also rename the label "oom" to "out", since it is now
generic too.

>>  if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>>  if (old_page) {
>>  if (!PageAnon(old_page)) {
>> @@ -2596,7 +2608,7 @@ static int wp_page_copy(struct vm_fault *vmf)
>>  oom:
>>  if (old_page)
>>  put_page(old_page);
>> -return VM_FAULT_OOM;
>> +return ret;
>>  }
>>  
>>  /**
>> @@ -2617,8 +2629,8 @@ static int wp_page_copy(struct vm_fault *vmf)
>>  int finish_mkwrite_fault(struct vm_fault *vmf)
>>  {
>>  WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
>> -vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
>> -                                 &vmf->ptl);
>> +if (!pte_map_lock(vmf))
>> +return VM_FAULT_RETRY;
>>  /*
>>   * We might have raced with another page fault while we released the
>>   * pte_offset_map_lock.
>> @@ -2736,8 +2748,11 @@ static int do_wp_page(struct vm_fault *vmf)
>>  get_page(vmf->page);
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  lock_page(vmf->page);
>> -vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>> -                               vmf->address, &vmf->ptl);
>> +if (!pte_map_lock(vmf)) {
>> +unlock_page(vmf->page);
>> +put_page(vmf->page);
>> +

Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE

2018-03-25 Thread David Rientjes
On Tue, 13 Mar 2018, Laurent Dufour wrote:

> From: Peter Zijlstra 
> 
> When speculating faults (without holding mmap_sem) we need to validate
> that the vma against which we loaded pages is still valid when we're
> ready to install the new PTE.
> 
> Therefore, replace the pte_offset_map_lock() calls that (re)take the
> PTL with pte_map_lock() which can fail in case we find the VMA changed
> since we started the fault.
> 

Based on how it's used, I would have suspected this to be named 
pte_map_trylock().
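
Also worth noting: the stub added by this preparatory patch always succeeds;
the point of the helper is that a later patch in the series can make it fail on
the speculative path, roughly along these lines (a sketch of where the series
is heading -- vma_has_changed() stands in for the VMA revalidation check added
later):

static bool pte_map_lock(struct vm_fault *vmf)
{
	bool ret = false;
	pte_t *pte;
	spinlock_t *ptl;

	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
					       vmf->address, &vmf->ptl);
		return true;
	}

	/*
	 * Disable interrupts so the page tables cannot be freed under us,
	 * then only take the PTL if the VMA has not changed since the
	 * speculative fault started; otherwise fail so the caller returns
	 * VM_FAULT_RETRY and the fault is retried under mmap_sem.
	 */
	local_irq_disable();
	if (vma_has_changed(vmf))
		goto out;

	ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	pte = pte_offset_map(vmf->pmd, vmf->address);
	if (!spin_trylock(ptl)) {
		pte_unmap(pte);
		goto out;
	}

	/* Re-check now that the PTL is held. */
	if (vma_has_changed(vmf)) {
		pte_unmap_unlock(pte, ptl);
		goto out;
	}

	vmf->pte = pte;
	vmf->ptl = ptl;
	ret = true;
out:
	local_irq_enable();
	return ret;
}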

> Signed-off-by: Peter Zijlstra (Intel) 
> 
> [Port to 4.12 kernel]
> [Remove the comment about the fault_env structure which has been
>  implemented as the vm_fault structure in the kernel]
> [move pte_map_lock()'s definition upper in the file]
> Signed-off-by: Laurent Dufour 
> ---
>  include/linux/mm.h |  1 +
>  mm/memory.c| 56 
> ++
>  2 files changed, 41 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 4d02524a7998..2f3e98edc94a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16];
>  #define FAULT_FLAG_USER         0x40    /* The fault originated in userspace */
>  #define FAULT_FLAG_REMOTE       0x80    /* faulting for non current tsk/mm */
>  #define FAULT_FLAG_INSTRUCTION  0x100   /* The fault was during an instruction fetch */
> +#define FAULT_FLAG_SPECULATIVE  0x200   /* Speculative fault, not holding mmap_sem */
>  
>  #define FAULT_FLAG_TRACE \
>   { FAULT_FLAG_WRITE, "WRITE" }, \

I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that 
actually uses it.

> diff --git a/mm/memory.c b/mm/memory.c
> index e0ae4999c824..8ac241b9f370 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
> long addr,
>  }
>  EXPORT_SYMBOL_GPL(apply_to_page_range);
>  
> +static bool pte_map_lock(struct vm_fault *vmf)

inline?

> +{
> + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> +                              vmf->address, &vmf->ptl);
> + return true;
> +}
> +
>  /*
>   * handle_pte_fault chooses page fault handler according to an entry which 
> was
>   * read non-atomically.  Before making any commitment, on those architectures
> @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf)
>   const unsigned long mmun_start = vmf->address & PAGE_MASK;
>   const unsigned long mmun_end = mmun_start + PAGE_SIZE;
>   struct mem_cgroup *memcg;
> + int ret = VM_FAULT_OOM;
>  
>   if (unlikely(anon_vma_prepare(vma)))
>   goto oom;
> @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf)
>   /*
>* Re-check the pte - we dropped the lock
>*/
> - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
> + if (!pte_map_lock(vmf)) {
> + mem_cgroup_cancel_charge(new_page, memcg, false);
> + ret = VM_FAULT_RETRY;
> + goto oom_free_new;
> + }

Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes 
sense for return values other than VM_FAULT_OOM?

>   if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>   if (old_page) {
>   if (!PageAnon(old_page)) {
> @@ -2596,7 +2608,7 @@ static int wp_page_copy(struct vm_fault *vmf)
>  oom:
>   if (old_page)
>   put_page(old_page);
> - return VM_FAULT_OOM;
> + return ret;
>  }
>  
>  /**
> @@ -2617,8 +2629,8 @@ static int wp_page_copy(struct vm_fault *vmf)
>  int finish_mkwrite_fault(struct vm_fault *vmf)
>  {
>   WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
> - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
> -                                &vmf->ptl);
> + if (!pte_map_lock(vmf))
> + return VM_FAULT_RETRY;
>   /*
>* We might have raced with another page fault while we released the
>* pte_offset_map_lock.
> @@ -2736,8 +2748,11 @@ static int do_wp_page(struct vm_fault *vmf)
>   get_page(vmf->page);
>   pte_unmap_unlock(vmf->pte, vmf->ptl);
>   lock_page(vmf->page);
> - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> -                               vmf->address, &vmf->ptl);
> + if (!pte_map_lock(vmf)) {
> + unlock_page(vmf->page);
> + put_page(vmf->page);
> + return VM_FAULT_RETRY;
> + }
>   if (!pte_same(*vmf->pte, vmf->orig_pte)) {
>   unlock_page(vmf->page);
>   pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -2947,8 +2962,10 @@ int do_swap_page(struct vm_fault *vmf)
>* Back out if somebody