In an overcommitted guest, where some vCPUs have to be halted so that others can make forward progress, it is highly likely that a vCPU later in the spinlock queue will still be spinning while the ones earlier in the queue have already been halted. The spinning in the later vCPUs is then just a waste of precious CPU cycles: they are not going to get the lock any time soon, because the earlier ones have to be woken up and take their turn first.
This patch implements a wait-early mechanism: a vCPU calls pv_wait()
earlier if the previous vCPU in the queue is already in the halted
state, i.e. it spins for a shorter period before halting itself. If the
previous vCPU was running and then becomes halted, the current vCPU
calls pv_wait() immediately.

This patch also separates the spin thresholds for the queue head and
the queue nodes. It favors the queue head by allowing it to spin longer
before calling pv_wait().

(A minimal user-space sketch of the wait-early spin loop appears after
the patch.)

Signed-off-by: Waiman Long <waiman.l...@hp.com>
---
 kernel/locking/qspinlock.c          |    5 ++-
 kernel/locking/qspinlock_paravirt.h |   52 +++++++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index d2e0fc1..782bc18 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -238,7 +238,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+					   struct mcs_spinlock *prev) { }
 static __always_inline void __pv_scan_next(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_head(struct qspinlock *lock,
@@ -391,7 +392,7 @@ queue:
 		prev = decode_tail(old);
 		WRITE_ONCE(prev->next, node);
 
-		pv_wait_node(node);
+		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);
 	}
 
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4c1a299..b3fe5bb 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -22,6 +22,26 @@
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
 /*
+ * Queued Spinlock Spin Thresholds
+ * -------------------------------
+ * Because of the cacheline contention effect of the ticket spinlock, the
+ * same spin threshold for the queued spinlock will run a bit faster. So we
+ * set a slightly larger threshold for the queue head (1.25X) while the other
+ * queue nodes keep the same threshold.
+ *
+ * A queue node vCPU will spin less if the vCPU in the previous node is halted.
+ * The queue node vCPU will also monitor the state of the previous node
+ * periodically if it is not halted. When the previous node vCPU transitions
+ * from running to halted, the current one will go to the halted state too,
+ * because it takes quite a lot of cycles for a vCPU to perform a vmexit and
+ * vmenter. So it is better for the current vCPU to be halted too.
+ */
+#define QHEAD_SPIN_THRESHOLD		(SPIN_THRESHOLD + (SPIN_THRESHOLD/4))
+#define QNODE_SPIN_THRESHOLD		SPIN_THRESHOLD
+#define QNODE_SPIN_THRESHOLD_SHORT	(QNODE_SPIN_THRESHOLD >> 4)
+#define QNODE_SPIN_CHECK_MASK		0xff
+
+/*
  * Queue node uses: vcpu_running & vcpu_halted.
  * Queue head uses: vcpu_running & vcpu_hashed.
  */
@@ -187,15 +207,41 @@ static void pv_init_node(struct mcs_spinlock *node)
  * pv_scan_next() is used to set _Q_SLOW_VAL and fill in hash table on its
  * behalf.
  */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct pv_node *pp = (struct pv_node *)prev;
+	bool prev_halted;
 	int loop;
 
 	for (;;) {
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
+		/*
+		 * Spin less if the previous vCPU was in the halted state
+		 */
+		prev_halted = (READ_ONCE(pp->state) != vcpu_running);
+		loop = prev_halted ? QNODE_SPIN_THRESHOLD_SHORT
+				   : QNODE_SPIN_THRESHOLD;
+		while (loop--) {
 			if (READ_ONCE(node->locked))
 				return;
+			/*
+			 * Look for a state transition at the previous node.
+			 *
+			 * running => halted:
+			 *	call pv_wait() now to halt the current vCPU
+			 * halted => running:
+			 *	reset spin threshold to QNODE_SPIN_THRESHOLD
+			 */
+			if (!(loop & QNODE_SPIN_CHECK_MASK)) {
+				bool halted = (READ_ONCE(pp->state)
+						!= vcpu_running);
+				if (!prev_halted && halted) {
+					break;
+				} else if (prev_halted && !halted) {
+					loop = QNODE_SPIN_THRESHOLD;
+					prev_halted = halted;
+				}
+			}
 			cpu_relax();
 		}
 
@@ -282,7 +328,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 				   : NULL;
 	for (;;) {
 		WRITE_ONCE(pn->state, vcpu_running);
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
+		for (loop = QHEAD_SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
 				return;
 			cpu_relax();
-- 
1.7.1
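
For anyone who wants to experiment with the wait-early heuristic outside the
kernel tree, below is a minimal user-space sketch of the new queue-node spin
loop. The demo_node structure, the DEMO_* threshold values and the main()
driver are illustrative assumptions, not kernel definitions; only the control
flow mirrors the pv_wait_node() loop in the patch.

/*
 * wait_early_sketch.c: user-space model of the wait-early spin loop.
 * The node layout and thresholds here are simplified assumptions; they
 * are not the kernel's pv_node or SPIN_THRESHOLD definitions.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_SPIN_THRESHOLD		(1 << 15)
#define DEMO_SPIN_THRESHOLD_SHORT	(DEMO_SPIN_THRESHOLD >> 4)
#define DEMO_SPIN_CHECK_MASK		0xff

enum vcpu_state { vcpu_running, vcpu_halted };

struct demo_node {
	_Atomic int locked;	/* set when the lock is handed over */
	_Atomic int state;	/* vcpu_running or vcpu_halted */
};

/*
 * Spin on our own node, but sample the previous node's state every
 * DEMO_SPIN_CHECK_MASK + 1 iterations.  Returns true if the handoff
 * arrived while spinning; false means the caller should halt (the
 * real code would call pv_wait() at that point).
 */
static bool spin_wait_early(struct demo_node *node, struct demo_node *prev)
{
	bool prev_halted = (atomic_load(&prev->state) != vcpu_running);
	int loop = prev_halted ? DEMO_SPIN_THRESHOLD_SHORT
			       : DEMO_SPIN_THRESHOLD;

	while (loop--) {
		if (atomic_load(&node->locked))
			return true;
		if (!(loop & DEMO_SPIN_CHECK_MASK)) {
			bool halted =
				(atomic_load(&prev->state) != vcpu_running);
			if (!prev_halted && halted)
				break;			/* wait early */
			if (prev_halted && !halted) {
				loop = DEMO_SPIN_THRESHOLD;	/* full spin */
				prev_halted = halted;
			}
		}
	}
	return false;
}

int main(void)
{
	struct demo_node me   = { 0, vcpu_running };
	struct demo_node prev = { 0, vcpu_halted };

	/* Previous waiter is already halted, so only the short spin is used. */
	printf("got lock while spinning: %d\n", spin_wait_early(&me, &prev));
	return 0;
}

Returning false above corresponds to falling through to pv_wait() in the
patch; resetting the loop count on a halted => running transition avoids
halting behind a predecessor that has just been rescheduled.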