This uses more efficient ll/sc (larx/stcx.) style access patterns rather
than cmpxchg, and also sets the EH=1 lock hint on those operations which
acquire ownership of the lock.
---
 arch/powerpc/include/asm/qspinlock.h       | 25 +++++--
 arch/powerpc/include/asm/qspinlock_types.h |  6 +-
 arch/powerpc/lib/qspinlock.c               | 81 +++++++++++++++-------
 3 files changed, 79 insertions(+), 33 deletions(-)
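A note for review: all three asm blocks below follow the same larx/stcx.
shape, with the EH operand on lwarx hinting to the core that the
reservation is being taken to acquire a lock. The standalone sketch here
shows the trylock variant in isolation; it is illustrative only, not the
kernel code: the ll_sc_trylock name is invented, EH is hardcoded to 1
rather than gated on CONFIG_PPC64, and a plain isync stands in for
PPC_ACQUIRE_BARRIER. Assembling the 4-operand lwarx needs an ISA 2.05
(POWER6) or later target.

typedef unsigned int u32;

/* Returns 1 if the lock word was 0 and locked_val was installed, else 0. */
static inline int ll_sc_trylock(u32 *lock_word, u32 locked_val)
{
	u32 prev;

	__asm__ __volatile__(
"1:	lwarx	%0,0,%1,1	\n"	/* load-reserve, EH=1 acquisition hint */
"	cmpwi	0,%0,0		\n"	/* already held? */
"	bne-	2f		\n"	/* yes: fail without storing */
"	stwcx.	%2,0,%1		\n"	/* try to install the locked value */
"	bne-	1b		\n"	/* reservation lost: retry */
"	isync			\n"	/* acquire barrier */
"2:				\n"
	: "=&r" (prev)
	: "r" (lock_word), "r" (locked_val)
	: "cr0", "memory");

	return prev == 0;
}

The bne- 2f path fails without ever executing a store, which is what makes
this a trylock rather than a spin; a failed stwcx. means the reservation
was lost to another CPU, so only that case loops back.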
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index 79a1936fb68d..3ab354159e5e 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -2,28 +2,43 @@
 #ifndef _ASM_POWERPC_QSPINLOCK_H
 #define _ASM_POWERPC_QSPINLOCK_H
 
-#include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <asm/qspinlock_types.h>
 
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
-	return atomic_read(&lock->val);
+	return READ_ONCE(lock->val);
 }
 
 static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
 {
-	return !atomic_read(&lock.val);
+	return !lock.val;
 }
 
 static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
 {
-	return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK);
+	return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
 }
 
 static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 {
-	if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0)
+	u32 new = _Q_LOCKED_VAL;
+	u32 prev;
+
+	asm volatile(
+"1:	lwarx	%0,0,%1,%3	# queued_spin_trylock			\n"
+"	cmpwi	0,%0,0							\n"
+"	bne-	2f							\n"
+"	stwcx.	%2,0,%1							\n"
+"	bne-	1b							\n"
+"\t"	PPC_ACQUIRE_BARRIER "						\n"
+"2:									\n"
+	: "=&r" (prev)
+	: "r" (&lock->val), "r" (new),
+	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
+	: "cr0", "memory");
+
+	if (likely(prev == 0))
 		return 1;
 	return 0;
 }
diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
index 3425dab42576..210adf05b235 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -7,7 +7,7 @@
 
 typedef struct qspinlock {
 	union {
-		atomic_t val;
+		u32 val;
 
 #ifdef __LITTLE_ENDIAN
 		struct {
@@ -23,10 +23,10 @@ typedef struct qspinlock {
 	};
 } arch_spinlock_t;
 
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ { .val = ATOMIC_INIT(0) } }
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { .val = 0 } }
 
 /*
- * Bitfields in the atomic value:
+ * Bitfields in the lock word:
  *
  *     0: locked bit
  * 16-31: tail cpu (+1)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 5ebb88d95636..7c71e5e287df 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/export.h>
@@ -22,32 +21,59 @@ struct qnodes {
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
-static inline int encode_tail_cpu(void)
+static inline u32 encode_tail_cpu(void)
 {
 	return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET;
 }
 
-static inline int get_tail_cpu(int val)
+static inline int get_tail_cpu(u32 val)
 {
 	return (val >> _Q_TAIL_CPU_OFFSET) - 1;
 }
 
 /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */
+/* Take the lock by setting the lock bit, no other CPUs will touch it. */
 static __always_inline void lock_set_locked(struct qspinlock *lock)
 {
-	atomic_or(_Q_LOCKED_VAL, &lock->val);
-	__atomic_acquire_fence();
+	u32 new = _Q_LOCKED_VAL;
+	u32 prev;
+
+	asm volatile(
+"1:	lwarx	%0,0,%1,%3	# lock_set_locked			\n"
+"	or	%0,%0,%2						\n"
+"	stwcx.	%0,0,%1							\n"
+"	bne-	1b							\n"
+"\t"	PPC_ACQUIRE_BARRIER "						\n"
+	: "=&r" (prev)
+	: "r" (&lock->val), "r" (new),
+	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
+	: "cr0", "memory");
 }
 
-/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */
-static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val)
+/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
+static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old)
 {
-	int newval = _Q_LOCKED_VAL;
-
-	if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val)
+	u32 new = _Q_LOCKED_VAL;
+	u32 prev;
+
+	BUG_ON(old & _Q_LOCKED_VAL);
+
+	asm volatile(
+"1:	lwarx	%0,0,%1,%4	# trylock_clear_tail_cpu		\n"
+"	cmpw	0,%0,%2							\n"
+"	bne-	2f							\n"
+"	stwcx.	%3,0,%1							\n"
+"	bne-	1b							\n"
+"\t"	PPC_ACQUIRE_BARRIER "						\n"
+"2:									\n"
+	: "=&r" (prev)
+	: "r" (&lock->val), "r"(old), "r" (new),
+	  "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
+	: "cr0", "memory");
+
+	if (likely(prev == old))
 		return 1;
-	else
-		return 0;
+	return 0;
 }
 
 /*
@@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va
  * This provides a release barrier for publishing node, and an acquire barrier
  * for getting the old node.
  */
-static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail)
+static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
 {
-	for (;;) {
-		int val = atomic_read(&lock->val);
-		int newval = (val & ~_Q_TAIL_CPU_MASK) | tail;
-		int old;
-
-		old = atomic_cmpxchg(&lock->val, val, newval);
-		if (old == val)
-			return old;
-	}
+	u32 prev, tmp;
+
+	asm volatile(
+"\t"	PPC_RELEASE_BARRIER "						\n"
+"1:	lwarx	%0,0,%2		# publish_tail_cpu			\n"
+"	andc	%1,%0,%4						\n"
+"	or	%1,%1,%3						\n"
+"	stwcx.	%1,0,%2							\n"
+"	bne-	1b							\n"
+	: "=&r" (prev), "=&r"(tmp)
+	: "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK)
+	: "cr0", "memory");
+
+	return prev;
 }
 
-static struct qnode *get_tail_qnode(struct qspinlock *lock, int val)
+static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
 {
 	int cpu = get_tail_cpu(val);
 	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
@@ -88,7 +119,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
 {
 	struct qnodes *qnodesp;
 	struct qnode *next, *node;
-	int val, old, tail;
+	u32 val, old, tail;
 	int idx;
 
 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
@@ -134,7 +165,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
 	}
 
 	/* We're at the head of the waitqueue, wait for the lock. */
-	while ((val = atomic_read(&lock->val)) & _Q_LOCKED_VAL)
+	while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
 		cpu_relax();
 
 	/* If we're the last queued, must clean up the tail. */
-- 
2.35.1