Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On 03/04/2018 23:57, David Rientjes wrote: > On Wed, 28 Mar 2018, Laurent Dufour wrote: > diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d02524a7998..2f3e98edc94a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_USER 0x40/* The fault originated in userspace */ #define FAULT_FLAG_REMOTE 0x80/* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SPECULATIVE0x200 /* Speculative fault, not holding mmap_sem */ #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ >>> >>> I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that >>> actually uses it. >> >> I think you're right, I'll move down this define in the series. >> diff --git a/mm/memory.c b/mm/memory.c index e0ae4999c824..8ac241b9f370 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_page_range); +static bool pte_map_lock(struct vm_fault *vmf) >>> >>> inline? >> >> Agreed. >> > > Ignore this, the final form of the function after the full patchset > shouldn't be inline. Indeed, I only kept as inlined the small pte_map_lock() and later pte_spinlock() defined when CONFIG_SPECULATIVE_PAGE_FAULT is not set. +{ + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, >ptl); + return true; +} + /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. 
Before making any commitment, on those architectures @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; + int ret = VM_FAULT_OOM; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) /* * Re-check the pte - we dropped the lock */ - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl); + if (!pte_map_lock(vmf)) { + mem_cgroup_cancel_charge(new_page, memcg, false); + ret = VM_FAULT_RETRY; + goto oom_free_new; + } >>> >>> Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes >>> sense for return values other than VM_FAULT_OOM? >> >> You're right, now this label name is not correct, I'll rename it to >> "out_free_new" and rename also the label "oom" to "out" since it is generic >> too >> now. >> > > I think it would just be better to introduce a out_uncharge that handles > the mem_cgroup_cancel_charge() in the exit path. Yes adding an out_uncharge label sounds good too. I'll add it and also rename oom_* ones to out_*. > > diff --git a/mm/memory.c b/mm/memory.c > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2645,9 +2645,8 @@ static int wp_page_copy(struct vm_fault *vmf) >* Re-check the pte - we dropped the lock >*/ > if (!pte_map_lock(vmf)) { > - mem_cgroup_cancel_charge(new_page, memcg, false); > ret = VM_FAULT_RETRY; > - goto oom_free_new; > + goto out_uncharge; > } > if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { > if (old_page) { > @@ -2735,6 +2734,8 @@ static int wp_page_copy(struct vm_fault *vmf) > put_page(old_page); > } > return page_copied ? VM_FAULT_WRITE : 0; > +out_uncharge: > + mem_cgroup_cancel_charge(new_page, memcg, false); > oom_free_new: > put_page(new_page); > oom: >
Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On 03/04/2018 23:57, David Rientjes wrote: > On Wed, 28 Mar 2018, Laurent Dufour wrote: > diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d02524a7998..2f3e98edc94a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_USER 0x40/* The fault originated in userspace */ #define FAULT_FLAG_REMOTE 0x80/* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SPECULATIVE0x200 /* Speculative fault, not holding mmap_sem */ #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ >>> >>> I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that >>> actually uses it. >> >> I think you're right, I'll move down this define in the series. >> diff --git a/mm/memory.c b/mm/memory.c index e0ae4999c824..8ac241b9f370 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_page_range); +static bool pte_map_lock(struct vm_fault *vmf) >>> >>> inline? >> >> Agreed. >> > > Ignore this, the final form of the function after the full patchset > shouldn't be inline. Indeed, I only kept as inlined the small pte_map_lock() and later pte_spinlock() defined when CONFIG_SPECULATIVE_PAGE_FAULT is not set. +{ + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, >ptl); + return true; +} + /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. 
Before making any commitment, on those architectures @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; + int ret = VM_FAULT_OOM; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) /* * Re-check the pte - we dropped the lock */ - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl); + if (!pte_map_lock(vmf)) { + mem_cgroup_cancel_charge(new_page, memcg, false); + ret = VM_FAULT_RETRY; + goto oom_free_new; + } >>> >>> Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes >>> sense for return values other than VM_FAULT_OOM? >> >> You're right, now this label name is not correct, I'll rename it to >> "out_free_new" and rename also the label "oom" to "out" since it is generic >> too >> now. >> > > I think it would just be better to introduce a out_uncharge that handles > the mem_cgroup_cancel_charge() in the exit path. Yes adding an out_uncharge label sounds good too. I'll add it and also rename oom_* ones to out_*. > > diff --git a/mm/memory.c b/mm/memory.c > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2645,9 +2645,8 @@ static int wp_page_copy(struct vm_fault *vmf) >* Re-check the pte - we dropped the lock >*/ > if (!pte_map_lock(vmf)) { > - mem_cgroup_cancel_charge(new_page, memcg, false); > ret = VM_FAULT_RETRY; > - goto oom_free_new; > + goto out_uncharge; > } > if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { > if (old_page) { > @@ -2735,6 +2734,8 @@ static int wp_page_copy(struct vm_fault *vmf) > put_page(old_page); > } > return page_copied ? VM_FAULT_WRITE : 0; > +out_uncharge: > + mem_cgroup_cancel_charge(new_page, memcg, false); > oom_free_new: > put_page(new_page); > oom: >
Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On Wed, 28 Mar 2018, Laurent Dufour wrote: > >> diff --git a/include/linux/mm.h b/include/linux/mm.h > >> index 4d02524a7998..2f3e98edc94a 100644 > >> --- a/include/linux/mm.h > >> +++ b/include/linux/mm.h > >> @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; > >> #define FAULT_FLAG_USER 0x40/* The fault originated in > >> userspace */ > >> #define FAULT_FLAG_REMOTE 0x80/* faulting for non current tsk/mm */ > >> #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an > >> instruction fetch */ > >> +#define FAULT_FLAG_SPECULATIVE0x200 /* Speculative fault, not > >> holding mmap_sem */ > >> > >> #define FAULT_FLAG_TRACE \ > >>{ FAULT_FLAG_WRITE, "WRITE" }, \ > > > > I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that > > actually uses it. > > I think you're right, I'll move down this define in the series. > > >> diff --git a/mm/memory.c b/mm/memory.c > >> index e0ae4999c824..8ac241b9f370 100644 > >> --- a/mm/memory.c > >> +++ b/mm/memory.c > >> @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, > >> unsigned long addr, > >> } > >> EXPORT_SYMBOL_GPL(apply_to_page_range); > >> > >> +static bool pte_map_lock(struct vm_fault *vmf) > > > > inline? > > Agreed. > Ignore this, the final form of the function after the full patchset shouldn't be inline. > >> +{ > >> + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, > >> + vmf->address, >ptl); > >> + return true; > >> +} > >> + > >> /* > >> * handle_pte_fault chooses page fault handler according to an entry > >> which was > >> * read non-atomically. 
Before making any commitment, on those > >> architectures > >> @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) > >>const unsigned long mmun_start = vmf->address & PAGE_MASK; > >>const unsigned long mmun_end = mmun_start + PAGE_SIZE; > >>struct mem_cgroup *memcg; > >> + int ret = VM_FAULT_OOM; > >> > >>if (unlikely(anon_vma_prepare(vma))) > >>goto oom; > >> @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) > >>/* > >> * Re-check the pte - we dropped the lock > >> */ > >> - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl); > >> + if (!pte_map_lock(vmf)) { > >> + mem_cgroup_cancel_charge(new_page, memcg, false); > >> + ret = VM_FAULT_RETRY; > >> + goto oom_free_new; > >> + } > > > > Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes > > sense for return values other than VM_FAULT_OOM? > > You're right, now this label name is not correct, I'll rename it to > "out_free_new" and rename also the label "oom" to "out" since it is generic > too > now. > I think it would just be better to introduce a out_uncharge that handles the mem_cgroup_cancel_charge() in the exit path. diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -2645,9 +2645,8 @@ static int wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ if (!pte_map_lock(vmf)) { - mem_cgroup_cancel_charge(new_page, memcg, false); ret = VM_FAULT_RETRY; - goto oom_free_new; + goto out_uncharge; } if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { @@ -2735,6 +2734,8 @@ static int wp_page_copy(struct vm_fault *vmf) put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; +out_uncharge: + mem_cgroup_cancel_charge(new_page, memcg, false); oom_free_new: put_page(new_page); oom:
Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On Wed, 28 Mar 2018, Laurent Dufour wrote: > >> diff --git a/include/linux/mm.h b/include/linux/mm.h > >> index 4d02524a7998..2f3e98edc94a 100644 > >> --- a/include/linux/mm.h > >> +++ b/include/linux/mm.h > >> @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; > >> #define FAULT_FLAG_USER 0x40/* The fault originated in > >> userspace */ > >> #define FAULT_FLAG_REMOTE 0x80/* faulting for non current tsk/mm */ > >> #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an > >> instruction fetch */ > >> +#define FAULT_FLAG_SPECULATIVE0x200 /* Speculative fault, not > >> holding mmap_sem */ > >> > >> #define FAULT_FLAG_TRACE \ > >>{ FAULT_FLAG_WRITE, "WRITE" }, \ > > > > I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that > > actually uses it. > > I think you're right, I'll move down this define in the series. > > >> diff --git a/mm/memory.c b/mm/memory.c > >> index e0ae4999c824..8ac241b9f370 100644 > >> --- a/mm/memory.c > >> +++ b/mm/memory.c > >> @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, > >> unsigned long addr, > >> } > >> EXPORT_SYMBOL_GPL(apply_to_page_range); > >> > >> +static bool pte_map_lock(struct vm_fault *vmf) > > > > inline? > > Agreed. > Ignore this, the final form of the function after the full patchset shouldn't be inline. > >> +{ > >> + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, > >> + vmf->address, >ptl); > >> + return true; > >> +} > >> + > >> /* > >> * handle_pte_fault chooses page fault handler according to an entry > >> which was > >> * read non-atomically. 
Before making any commitment, on those > >> architectures > >> @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) > >>const unsigned long mmun_start = vmf->address & PAGE_MASK; > >>const unsigned long mmun_end = mmun_start + PAGE_SIZE; > >>struct mem_cgroup *memcg; > >> + int ret = VM_FAULT_OOM; > >> > >>if (unlikely(anon_vma_prepare(vma))) > >>goto oom; > >> @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) > >>/* > >> * Re-check the pte - we dropped the lock > >> */ > >> - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl); > >> + if (!pte_map_lock(vmf)) { > >> + mem_cgroup_cancel_charge(new_page, memcg, false); > >> + ret = VM_FAULT_RETRY; > >> + goto oom_free_new; > >> + } > > > > Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes > > sense for return values other than VM_FAULT_OOM? > > You're right, now this label name is not correct, I'll rename it to > "out_free_new" and rename also the label "oom" to "out" since it is generic > too > now. > I think it would just be better to introduce a out_uncharge that handles the mem_cgroup_cancel_charge() in the exit path. diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -2645,9 +2645,8 @@ static int wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ if (!pte_map_lock(vmf)) { - mem_cgroup_cancel_charge(new_page, memcg, false); ret = VM_FAULT_RETRY; - goto oom_free_new; + goto out_uncharge; } if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { @@ -2735,6 +2734,8 @@ static int wp_page_copy(struct vm_fault *vmf) put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; +out_uncharge: + mem_cgroup_cancel_charge(new_page, memcg, false); oom_free_new: put_page(new_page); oom:
Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On 25/03/2018 23:50, David Rientjes wrote: > On Tue, 13 Mar 2018, Laurent Dufour wrote: > >> From: Peter Zijlstra>> >> When speculating faults (without holding mmap_sem) we need to validate >> that the vma against which we loaded pages is still valid when we're >> ready to install the new PTE. >> >> Therefore, replace the pte_offset_map_lock() calls that (re)take the >> PTL with pte_map_lock() which can fail in case we find the VMA changed >> since we started the fault. >> > > Based on how its used, I would have suspected this to be named > pte_map_trylock(). > >> Signed-off-by: Peter Zijlstra (Intel) >> >> [Port to 4.12 kernel] >> [Remove the comment about the fault_env structure which has been >> implemented as the vm_fault structure in the kernel] >> [move pte_map_lock()'s definition upper in the file] >> Signed-off-by: Laurent Dufour >> --- >> include/linux/mm.h | 1 + >> mm/memory.c| 56 >> ++ >> 2 files changed, 41 insertions(+), 16 deletions(-) >> >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index 4d02524a7998..2f3e98edc94a 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; >> #define FAULT_FLAG_USER 0x40/* The fault originated in >> userspace */ >> #define FAULT_FLAG_REMOTE 0x80/* faulting for non current tsk/mm */ >> #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an >> instruction fetch */ >> +#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not >> holding mmap_sem */ >> >> #define FAULT_FLAG_TRACE \ >> { FAULT_FLAG_WRITE, "WRITE" }, \ > > I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that > actually uses it. I think you're right, I'll move down this define in the series. 
>> diff --git a/mm/memory.c b/mm/memory.c >> index e0ae4999c824..8ac241b9f370 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, >> unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(apply_to_page_range); >> >> +static bool pte_map_lock(struct vm_fault *vmf) > > inline? Agreed. >> +{ >> +vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, >> + vmf->address, >ptl); >> +return true; >> +} >> + >> /* >> * handle_pte_fault chooses page fault handler according to an entry which >> was >> * read non-atomically. Before making any commitment, on those >> architectures >> @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) >> const unsigned long mmun_start = vmf->address & PAGE_MASK; >> const unsigned long mmun_end = mmun_start + PAGE_SIZE; >> struct mem_cgroup *memcg; >> +int ret = VM_FAULT_OOM; >> >> if (unlikely(anon_vma_prepare(vma))) >> goto oom; >> @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) >> /* >> * Re-check the pte - we dropped the lock >> */ >> -vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl); >> +if (!pte_map_lock(vmf)) { >> +mem_cgroup_cancel_charge(new_page, memcg, false); >> +ret = VM_FAULT_RETRY; >> +goto oom_free_new; >> +} > > Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes > sense for return values other than VM_FAULT_OOM? You're right, now this label name is not correct, I'll rename it to "out_free_new" and rename also the label "oom" to "out" since it is generic too now. 
>> if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { >> if (old_page) { >> if (!PageAnon(old_page)) { >> @@ -2596,7 +2608,7 @@ static int wp_page_copy(struct vm_fault *vmf) >> oom: >> if (old_page) >> put_page(old_page); >> -return VM_FAULT_OOM; >> +return ret; >> } >> >> /** >> @@ -2617,8 +2629,8 @@ static int wp_page_copy(struct vm_fault *vmf) >> int finish_mkwrite_fault(struct vm_fault *vmf) >> { >> WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); >> -vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, >> - >ptl); >> +if (!pte_map_lock(vmf)) >> +return VM_FAULT_RETRY; >> /* >> * We might have raced with another page fault while we released the >> * pte_offset_map_lock. >> @@ -2736,8 +2748,11 @@ static int do_wp_page(struct vm_fault *vmf) >> get_page(vmf->page); >> pte_unmap_unlock(vmf->pte, vmf->ptl); >> lock_page(vmf->page); >> -vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, >> -vmf->address, >ptl); >> +if (!pte_map_lock(vmf)) { >> +unlock_page(vmf->page); >> +
Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On 25/03/2018 23:50, David Rientjes wrote: > On Tue, 13 Mar 2018, Laurent Dufour wrote: > >> From: Peter Zijlstra >> >> When speculating faults (without holding mmap_sem) we need to validate >> that the vma against which we loaded pages is still valid when we're >> ready to install the new PTE. >> >> Therefore, replace the pte_offset_map_lock() calls that (re)take the >> PTL with pte_map_lock() which can fail in case we find the VMA changed >> since we started the fault. >> > > Based on how its used, I would have suspected this to be named > pte_map_trylock(). > >> Signed-off-by: Peter Zijlstra (Intel) >> >> [Port to 4.12 kernel] >> [Remove the comment about the fault_env structure which has been >> implemented as the vm_fault structure in the kernel] >> [move pte_map_lock()'s definition upper in the file] >> Signed-off-by: Laurent Dufour >> --- >> include/linux/mm.h | 1 + >> mm/memory.c| 56 >> ++ >> 2 files changed, 41 insertions(+), 16 deletions(-) >> >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index 4d02524a7998..2f3e98edc94a 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; >> #define FAULT_FLAG_USER 0x40/* The fault originated in >> userspace */ >> #define FAULT_FLAG_REMOTE 0x80/* faulting for non current tsk/mm */ >> #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an >> instruction fetch */ >> +#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not >> holding mmap_sem */ >> >> #define FAULT_FLAG_TRACE \ >> { FAULT_FLAG_WRITE, "WRITE" }, \ > > I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that > actually uses it. I think you're right, I'll move down this define in the series. 
>> diff --git a/mm/memory.c b/mm/memory.c >> index e0ae4999c824..8ac241b9f370 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, >> unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(apply_to_page_range); >> >> +static bool pte_map_lock(struct vm_fault *vmf) > > inline? Agreed. >> +{ >> +vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, >> + vmf->address, >ptl); >> +return true; >> +} >> + >> /* >> * handle_pte_fault chooses page fault handler according to an entry which >> was >> * read non-atomically. Before making any commitment, on those >> architectures >> @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) >> const unsigned long mmun_start = vmf->address & PAGE_MASK; >> const unsigned long mmun_end = mmun_start + PAGE_SIZE; >> struct mem_cgroup *memcg; >> +int ret = VM_FAULT_OOM; >> >> if (unlikely(anon_vma_prepare(vma))) >> goto oom; >> @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) >> /* >> * Re-check the pte - we dropped the lock >> */ >> -vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl); >> +if (!pte_map_lock(vmf)) { >> +mem_cgroup_cancel_charge(new_page, memcg, false); >> +ret = VM_FAULT_RETRY; >> +goto oom_free_new; >> +} > > Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes > sense for return values other than VM_FAULT_OOM? You're right, now this label name is not correct, I'll rename it to "out_free_new" and rename also the label "oom" to "out" since it is generic too now. 
>> if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { >> if (old_page) { >> if (!PageAnon(old_page)) { >> @@ -2596,7 +2608,7 @@ static int wp_page_copy(struct vm_fault *vmf) >> oom: >> if (old_page) >> put_page(old_page); >> -return VM_FAULT_OOM; >> +return ret; >> } >> >> /** >> @@ -2617,8 +2629,8 @@ static int wp_page_copy(struct vm_fault *vmf) >> int finish_mkwrite_fault(struct vm_fault *vmf) >> { >> WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); >> -vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, >> - >ptl); >> +if (!pte_map_lock(vmf)) >> +return VM_FAULT_RETRY; >> /* >> * We might have raced with another page fault while we released the >> * pte_offset_map_lock. >> @@ -2736,8 +2748,11 @@ static int do_wp_page(struct vm_fault *vmf) >> get_page(vmf->page); >> pte_unmap_unlock(vmf->pte, vmf->ptl); >> lock_page(vmf->page); >> -vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, >> -vmf->address, >ptl); >> +if (!pte_map_lock(vmf)) { >> +unlock_page(vmf->page); >> +put_page(vmf->page); >> +
Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On Tue, 13 Mar 2018, Laurent Dufour wrote: > From: Peter Zijlstra> > When speculating faults (without holding mmap_sem) we need to validate > that the vma against which we loaded pages is still valid when we're > ready to install the new PTE. > > Therefore, replace the pte_offset_map_lock() calls that (re)take the > PTL with pte_map_lock() which can fail in case we find the VMA changed > since we started the fault. > Based on how its used, I would have suspected this to be named pte_map_trylock(). > Signed-off-by: Peter Zijlstra (Intel) > > [Port to 4.12 kernel] > [Remove the comment about the fault_env structure which has been > implemented as the vm_fault structure in the kernel] > [move pte_map_lock()'s definition upper in the file] > Signed-off-by: Laurent Dufour > --- > include/linux/mm.h | 1 + > mm/memory.c| 56 > ++ > 2 files changed, 41 insertions(+), 16 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 4d02524a7998..2f3e98edc94a 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; > #define FAULT_FLAG_USER 0x40/* The fault originated in > userspace */ > #define FAULT_FLAG_REMOTE0x80/* faulting for non current tsk/mm */ > #define FAULT_FLAG_INSTRUCTION 0x100/* The fault was during an > instruction fetch */ > +#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not > holding mmap_sem */ > > #define FAULT_FLAG_TRACE \ > { FAULT_FLAG_WRITE, "WRITE" }, \ I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that actually uses it. > diff --git a/mm/memory.c b/mm/memory.c > index e0ae4999c824..8ac241b9f370 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned > long addr, > } > EXPORT_SYMBOL_GPL(apply_to_page_range); > > +static bool pte_map_lock(struct vm_fault *vmf) inline? 
> +{ > + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, > +vmf->address, &vmf->ptl); > + return true; > +} > + > /* > * handle_pte_fault chooses page fault handler according to an entry which > was > * read non-atomically. Before making any commitment, on those architectures > @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) > const unsigned long mmun_start = vmf->address & PAGE_MASK; > const unsigned long mmun_end = mmun_start + PAGE_SIZE; > struct mem_cgroup *memcg; > + int ret = VM_FAULT_OOM; > > if (unlikely(anon_vma_prepare(vma))) > goto oom; > @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) > /* >* Re-check the pte - we dropped the lock >*/ > - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); > + if (!pte_map_lock(vmf)) { > + mem_cgroup_cancel_charge(new_page, memcg, false); > + ret = VM_FAULT_RETRY; > + goto oom_free_new; > + } Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes sense for return values other than VM_FAULT_OOM? > if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { > if (old_page) { > if (!PageAnon(old_page)) { > @@ -2596,7 +2608,7 @@ static int wp_page_copy(struct vm_fault *vmf) > oom: > if (old_page) > put_page(old_page); > - return VM_FAULT_OOM; > + return ret; > } > > /** > @@ -2617,8 +2629,8 @@ static int wp_page_copy(struct vm_fault *vmf) > int finish_mkwrite_fault(struct vm_fault *vmf) > { > WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); > - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, > &vmf->ptl); > + if (!pte_map_lock(vmf)) > + return VM_FAULT_RETRY; > /* >* We might have raced with another page fault while we released the >* pte_offset_map_lock. 
> @@ -2736,8 +2748,11 @@ static int do_wp_page(struct vm_fault *vmf) > get_page(vmf->page); > pte_unmap_unlock(vmf->pte, vmf->ptl); > lock_page(vmf->page); > - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, > - vmf->address, &vmf->ptl); > + if (!pte_map_lock(vmf)) { > + unlock_page(vmf->page); > + put_page(vmf->page); > + return VM_FAULT_RETRY; > + } > if (!pte_same(*vmf->pte, vmf->orig_pte)) { > unlock_page(vmf->page); > pte_unmap_unlock(vmf->pte, vmf->ptl); > @@ -2947,8 +2962,10 @@ int
Re: [PATCH v9 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE
On Tue, 13 Mar 2018, Laurent Dufour wrote: > From: Peter Zijlstra > > When speculating faults (without holding mmap_sem) we need to validate > that the vma against which we loaded pages is still valid when we're > ready to install the new PTE. > > Therefore, replace the pte_offset_map_lock() calls that (re)take the > PTL with pte_map_lock() which can fail in case we find the VMA changed > since we started the fault. > Based on how its used, I would have suspected this to be named pte_map_trylock(). > Signed-off-by: Peter Zijlstra (Intel) > > [Port to 4.12 kernel] > [Remove the comment about the fault_env structure which has been > implemented as the vm_fault structure in the kernel] > [move pte_map_lock()'s definition upper in the file] > Signed-off-by: Laurent Dufour > --- > include/linux/mm.h | 1 + > mm/memory.c| 56 > ++ > 2 files changed, 41 insertions(+), 16 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 4d02524a7998..2f3e98edc94a 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -300,6 +300,7 @@ extern pgprot_t protection_map[16]; > #define FAULT_FLAG_USER 0x40/* The fault originated in > userspace */ > #define FAULT_FLAG_REMOTE0x80/* faulting for non current tsk/mm */ > #define FAULT_FLAG_INSTRUCTION 0x100/* The fault was during an > instruction fetch */ > +#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not > holding mmap_sem */ > > #define FAULT_FLAG_TRACE \ > { FAULT_FLAG_WRITE, "WRITE" }, \ I think FAULT_FLAG_SPECULATIVE should be introduced in the patch that actually uses it. > diff --git a/mm/memory.c b/mm/memory.c > index e0ae4999c824..8ac241b9f370 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned > long addr, > } > EXPORT_SYMBOL_GPL(apply_to_page_range); > > +static bool pte_map_lock(struct vm_fault *vmf) inline? 
> +{ > + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, > +vmf->address, &vmf->ptl); > + return true; > +} > + > /* > * handle_pte_fault chooses page fault handler according to an entry which > was > * read non-atomically. Before making any commitment, on those architectures > @@ -2477,6 +2484,7 @@ static int wp_page_copy(struct vm_fault *vmf) > const unsigned long mmun_start = vmf->address & PAGE_MASK; > const unsigned long mmun_end = mmun_start + PAGE_SIZE; > struct mem_cgroup *memcg; > + int ret = VM_FAULT_OOM; > > if (unlikely(anon_vma_prepare(vma))) > goto oom; > @@ -2504,7 +2512,11 @@ static int wp_page_copy(struct vm_fault *vmf) > /* >* Re-check the pte - we dropped the lock >*/ > - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); > + if (!pte_map_lock(vmf)) { > + mem_cgroup_cancel_charge(new_page, memcg, false); > + ret = VM_FAULT_RETRY; > + goto oom_free_new; > + } Ugh, but we aren't oom here, so maybe rename oom_free_new so that it makes sense for return values other than VM_FAULT_OOM? > if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { > if (old_page) { > if (!PageAnon(old_page)) { > @@ -2596,7 +2608,7 @@ static int wp_page_copy(struct vm_fault *vmf) > oom: > if (old_page) > put_page(old_page); > - return VM_FAULT_OOM; > + return ret; > } > > /** > @@ -2617,8 +2629,8 @@ static int wp_page_copy(struct vm_fault *vmf) > int finish_mkwrite_fault(struct vm_fault *vmf) > { > WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); > - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, > &vmf->ptl); > + if (!pte_map_lock(vmf)) > + return VM_FAULT_RETRY; > /* >* We might have raced with another page fault while we released the >* pte_offset_map_lock. 
> @@ -2736,8 +2748,11 @@ static int do_wp_page(struct vm_fault *vmf) > get_page(vmf->page); > pte_unmap_unlock(vmf->pte, vmf->ptl); > lock_page(vmf->page); > - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, > - vmf->address, &vmf->ptl); > + if (!pte_map_lock(vmf)) { > + unlock_page(vmf->page); > + put_page(vmf->page); > + return VM_FAULT_RETRY; > + } > if (!pte_same(*vmf->pte, vmf->orig_pte)) { > unlock_page(vmf->page); > pte_unmap_unlock(vmf->pte, vmf->ptl); > @@ -2947,8 +2962,10 @@ int do_swap_page(struct vm_fault *vmf) >* Back out if somebody