Implementing pte_update with pte_xchg (which uses cmpxchg) is
inefficient: a single larx/stcx. loop works fine, so there is no need
for the more expensive cmpxchg sequence.
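
For comparison, pte_xchg() is built on the generic powerpc cmpxchg
sequence, so each update today costs roughly the following (a
simplified sketch; register names and the exact barrier instructions
emitted by PPC_ATOMIC_ENTRY/EXIT_BARRIER are illustrative):

	ld	r10,0(ptep)	# READ_ONCE(*ptep)
	...			# compute new = (old | set) & ~clr
	sync			# entry barrier in cmpxchg
1:	ldarx	r9,0,ptep	# load-reserve, again
	cmpd	r9,r10		# still the value we computed from?
	bne-	2f		# raced: retry the whole C loop
	stdcx.	r11,0,ptep	# store-conditional the new value
	bne-	1b
	sync			# exit barrier
2:

That is an extra load and compare, a branch back out to the C loop on
contention, and two heavyweight barriers, where the replacement below
does the andc/or directly inside a single larx/stcx. loop with no
barriers.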
The memory barriers can then be removed from the operation as well.
TLB flushing is still required to load mm_cpumask after the store that
reduces PTE permissions; that ordering requirement is moved into the
TLB flush code.

Signed-off-by: Nicholas Piggin <npig...@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 25 +++++++++++-----------
 arch/powerpc/mm/mmu_context.c              |  6 ++++--
 arch/powerpc/mm/tlb-radix.c                | 11 +++++++++-
 3 files changed, 27 insertions(+), 15 deletions(-)
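As an illustration of the ordering being moved (a rough sketch, not
part of the patch; it pairs with the existing full barrier in
switch_mm_irqs_off()):

	CPU0: unmap + flush			CPU1: switch to mm
	-------------------------------		-------------------------------
	store PTE (reduce permissions)		store to mm_cpumask (set bit)
	smp_mb()   /* added here */		smp_mb()   /* existing */
	load mm_cpumask				load PTE (MMU walk fills TLB)

If CPU0 does not see CPU1's bit in mm_cpumask (and therefore does not
invalidate CPU1's TLB), the barriers guarantee that CPU1's page table
walk sees the downgraded PTE, so no stale translation survives.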
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 9c567d243f61..ef9f96742ce1 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -131,20 +131,21 @@ extern void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
 					       unsigned long set)
 {
-	pte_t pte;
-	unsigned long old_pte, new_pte;
-
-	do {
-		pte = READ_ONCE(*ptep);
-		old_pte = pte_val(pte);
-		new_pte = (old_pte | set) & ~clr;
-
-	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
-	return old_pte;
+	__be64 old_be, tmp_be;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3		# pte_update\n"
+	"	andc	%1,%0,%5	\n"
+	"	or	%1,%1,%4	\n"
+	"	stdcx.	%1,0,%3		\n"
+	"	bne-	1b"
+	: "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep)
+	: "r" (ptep), "r" (cpu_to_be64(set)), "r" (cpu_to_be64(clr))
+	: "cc" );
+
+	return be64_to_cpu(old_be);
 }
-
 static inline unsigned long radix__pte_update(struct mm_struct *mm,
 					unsigned long addr,
 					pte_t *ptep, unsigned long clr,
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 0ab297c4cfad..f84e14f23e50 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -57,8 +57,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * in switch_slb(), and/or the store of paca->mm_ctx_id in
	 * copy_mm_to_paca().
	 *
-	 * On the read side the barrier is in pte_xchg(), which orders
-	 * the store to the PTE vs the load of mm_cpumask.
+	 * On the other side, the barrier is in mm/tlb-radix.c for
+	 * radix which orders earlier stores to clear the PTEs vs
+	 * the load of mm_cpumask. And pte_xchg which does the same
+	 * thing for hash.
	 *
	 * This full barrier is needed by membarrier when switching
	 * between processes after store to rq->curr, before user-space
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5ac3206c51cc..cdc50398fd60 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -524,6 +524,11 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 		return;
 
 	preempt_disable();
+	/*
+	 * Order loads of mm_cpumask vs previous stores to clear ptes before
+	 * the invalidate. See barrier in switch_mm_irqs_off
+	 */
+	smp_mb();
 	if (!mm_is_thread_local(mm)) {
 		if (mm_needs_flush_escalation(mm))
 			_tlbie_pid(pid, RIC_FLUSH_ALL);
@@ -544,6 +549,7 @@ void radix__flush_all_mm(struct mm_struct *mm)
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
 	else
@@ -568,6 +574,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	else
@@ -630,6 +637,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		local = true;
 		full = (end == TLB_FLUSH_ALL ||
@@ -791,6 +799,7 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		local = true;
 		full = (end == TLB_FLUSH_ALL ||
@@ -849,7 +858,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 
 	/* Otherwise first do the PWC, then iterate the pages. */
 	preempt_disable();
-
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else {
-- 
2.17.0