qspinlock: Optimize for x86

Peter Zijlstra Tue, 02 Oct 2018 07:14:57 -0700

On Tue, Oct 02, 2018 at 02:19:53PM +0100, Will Deacon wrote:
> On Mon, Oct 01, 2018 at 10:00:28PM +0200, Peter Zijlstra wrote:


> > Let me draw a picture of that..
> > 
> > 
> >   CPU0              CPU1            CPU2            CPU3
> > 
> > 0)                                          lock
> >                                               trylock -> (0,0,1)
> > 1)lock
> >     trylock /* fail */
> > 
> > 2)          lock
> >               trylock /* fail */
> >               tas-pending -> (0,1,1)
> >               wait-locked
> > 
> > 3)                          lock
> >                               trylock /* fail */
> >                               tas-pending /* fail */
> > 
> > 4)                                          unlock -> (0,1,0)
> >               clr_pnd_set_lck -> (0,0,1)
> >               unlock -> (0,0,0)
> > 
> > 5)  tas-pending -> (0,1,0)
> >     read-val -> (0,1,0)
> > 6)  clr_pnd_set_lck -> (0,0,1)
> > 7)                            xchg_tail -> (n,0,1)
> >                               load_acquire <- (n,0,0) (from-4)
> > 8)                            cmpxchg /* fail */
> >                               set_locked()
> > 
> > > Is there something I'm missing that means this can't happen? I suppose
> > > cacheline granularity ends up giving serialisation between (4) and (7),
> > > but I'd *much* prefer not to rely on that because it feels horribly
> > > fragile.
> > 
> > Well, on x86 atomics are fully ordered, so the xchg_tail() does in
> > fact imply an smp_mb(), and that should order it sufficiently for that
> > not to happen, I think.
> 
> Hmm, does that actually help, though? I still think you're relying on the
> cache-coherence protocol to serialise the xchg() on pending before the
> xchg_tail(), which I think is fragile because they don't actually overlap.

Maybe, I suspect TSO makes it work, but see the below alternative.

---
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -6,9 +6,29 @@
 #include <asm/cpufeature.h>
 #include <asm-generic/qspinlock_types.h>
 #include <asm/paravirt.h>
+#include <asm/rmwcc.h>
 
 #define _Q_PENDING_LOOPS       (1 << 9)
 
+static __always_inline bool __test_and_set_pending(struct qspinlock *lock)
+{
+       GEN_BINARY_RMWcc(LOCK_PREFIX "btsl",
+                        lock->val.counter, "Ir", _Q_PENDING_OFFSET, "%0", c);
+}
+
+#define queued_set_pending_fetch_acquire queued_set_pending_fetch_acquire
+static inline u32 queued_set_pending_fetch_acquire(struct qspinlock *lock)
+{
+       u32 val = 0;
+
+       if (__test_and_set_pending(lock))
+               val |= _Q_PENDING_VAL;
+
+       val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
+
+       return val;
+}
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -232,6 +232,20 @@ static __always_inline u32 xchg_tail(str
 #endif /* _Q_PENDING_BITS == 8 */
 
 /**
+ * queued_set_pending_fetch_acquire - fetch the whole lock value and set pending
+ * @lock: Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_set_pending_fetch_acquire
+static __always_inline u32 queued_set_pending_fetch_acquire(struct qspinlock *lock)
+{
+       return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
  * set_locked - Set the lock bit and own the lock
  * @lock: Pointer to queued spinlock structure
  *
@@ -328,7 +342,7 @@ void queued_spin_lock_slowpath(struct qs
         *
         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
         */
-       val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+       val = queued_set_pending_fetch_acquire(lock);
 
        /*
         * If we observe contention, there is a concurrent locker.

Re: [RFC][PATCH 3/3] locking/qspinlock: Optimize for x86

Reply via email to