qspinlock: Optimize for smaller NR_CPUS

tip-bot for Peter Zijlstra (Intel) Fri, 08 May 2015 06:32:11 -0700

Commit-ID:  69f9cae90907e09af95fb991ed384670cef8dd32
Gitweb:     http://git.kernel.org/tip/69f9cae90907e09af95fb991ed384670cef8dd32
Author:     Peter Zijlstra (Intel) <[email protected]>
AuthorDate: Fri, 24 Apr 2015 14:56:34 -0400
Committer:  Ingo Molnar <[email protected]>
CommitDate: Fri, 8 May 2015 12:36:48 +0200


locking/qspinlock: Optimize for smaller NR_CPUS

When we allow for a max NR_CPUS < 2^14 we can optimize the pending
wait-acquire and the xchg_tail() operations.

By growing the pending bit to a byte, we reduce the tail to 16bit.
This means we can use xchg16 for the tail part and do away with all
the repeated compxchg() operations.

This in turn allows us to unconditionally acquire; the locked state
as observed by the wait loops cannot change. And because both locked
and pending are now a full byte we can use simple stores for the
state transition, obviating one atomic operation entirely.

This optimization is needed to make the qspinlock achieve performance
parity with ticket spinlock at light load.

All this is horribly broken on Alpha pre EV56 (and any other arch that
cannot do single-copy atomic byte stores).

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Waiman Long <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Daniel J Blueman <[email protected]>
Cc: David Vrabel <[email protected]>
Cc: Douglas Hatch <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Konrad Rzeszutek Wilk <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Raghavendra K T <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Link: 
http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
 include/asm-generic/qspinlock_types.h | 13 +++++++
 kernel/locking/qspinlock.c            | 69 ++++++++++++++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/qspinlock_types.h 
b/include/asm-generic/qspinlock_types.h
index 3a7f671..85f888e 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -35,6 +35,14 @@ typedef struct qspinlock {
 /*
  * Bitfields in the atomic value:
  *
+ * When NR_CPUS < 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ *
+ * When NR_CPUS >= 16K
  *  0- 7: locked byte
  *     8: pending
  *  9-10: tail index
@@ -47,7 +55,11 @@ typedef struct qspinlock {
 #define _Q_LOCKED_MASK         _Q_SET_MASK(LOCKED)
 
 #define _Q_PENDING_OFFSET      (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS                8
+#else
 #define _Q_PENDING_BITS                1
+#endif
 #define _Q_PENDING_MASK                _Q_SET_MASK(PENDING)
 
 #define _Q_TAIL_IDX_OFFSET     (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
@@ -58,6 +70,7 @@ typedef struct qspinlock {
 #define _Q_TAIL_CPU_BITS       (32 - _Q_TAIL_CPU_OFFSET)
 #define _Q_TAIL_CPU_MASK       _Q_SET_MASK(TAIL_CPU)
 
+#define _Q_TAIL_OFFSET         _Q_TAIL_IDX_OFFSET
 #define _Q_TAIL_MASK           (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
 
 #define _Q_LOCKED_VAL          (1U << _Q_LOCKED_OFFSET)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 82bb4a9..e17efe7 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/mutex.h>
+#include <asm/byteorder.h>
 #include <asm/qspinlock.h>
 
 /*
@@ -56,6 +57,10 @@
  * node; whereby avoiding the need to carry a node from lock to unlock, and
  * preserving existing lock API. This also makes the unlock code simpler and
  * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *      atomic operations on smaller 8-bit and 16-bit data types.
+ *
  */
 
 #include "mcs_spinlock.h"
@@ -96,6 +101,62 @@ static inline struct mcs_spinlock *decode_tail(u32 tail)
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
 
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ */
+#if _Q_PENDING_BITS == 8
+
+struct __qspinlock {
+       union {
+               atomic_t val;
+               struct {
+#ifdef __LITTLE_ENDIAN
+                       u16     locked_pending;
+                       u16     tail;
+#else
+                       u16     tail;
+                       u16     locked_pending;
+#endif
+               };
+       };
+};
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
 /**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queued spinlock structure
@@ -131,6 +192,7 @@ static __always_inline u32 xchg_tail(struct qspinlock 
*lock, u32 tail)
        }
        return old;
 }
+#endif /* _Q_PENDING_BITS == 8 */
 
 /**
  * queued_spin_lock_slowpath - acquire the queued spinlock
@@ -205,8 +267,13 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 
val)
         * we're pending, wait for the owner to go away.
         *
         * *,1,1 -> *,1,0
+        *
+        * this wait loop must be a load-acquire such that we match the
+        * store-release that clears the locked bit and create lock
+        * sequentiality; this is because not all clear_pending_set_locked()
+        * implementations imply full barriers.
         */
-       while ((val = atomic_read(&lock->val)) & _Q_LOCKED_MASK)
+       while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
                cpu_relax();
 
        /*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:locking/core] locking/qspinlock: Optimize for smaller NR_CPUS

Reply via email to