On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
[resend as utf-8, not utf-7]
> Finding the owner or a queued waiter on a lock with a preempted vcpu
> is indicative of an oversubscribed guest causing the lock to get into
> trouble. Provide some options to detect this situation and have new
> CPUs avoid queueing for a longer time (more steal iterations) to
> minimise the problems caused by vcpu preemption on the queue.
> ---
>  arch/powerpc/include/asm/qspinlock_types.h |   7 +-
>  arch/powerpc/lib/qspinlock.c               | 240 +++++++++++++++++++--
>  2 files changed, 232 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
> index 35f9525381e6..4fbcc8a4230b 100644
> --- a/arch/powerpc/include/asm/qspinlock_types.h
> +++ b/arch/powerpc/include/asm/qspinlock_types.h
> @@ -30,7 +30,7 @@ typedef struct qspinlock {
>   *
>   *     0: locked bit
>   *  1-14: lock holder cpu
> - *    15: unused bit
> + *    15: lock owner or queuer vcpus observed to be preempted bit
>   *    16: must queue bit
>   * 17-31: tail cpu (+1)
>   */
> @@ -49,6 +49,11 @@ typedef struct qspinlock {
>  #error "qspinlock does not support such large CONFIG_NR_CPUS"
>  #endif
>  
> +#define _Q_SLEEPY_OFFSET     15
> +#define _Q_SLEEPY_BITS               1
> +#define _Q_SLEEPY_MASK               _Q_SET_MASK(SLEEPY_OWNER)
> +#define _Q_SLEEPY_VAL                (1U << _Q_SLEEPY_OFFSET)
> +
>  #define _Q_MUST_Q_OFFSET     16
>  #define _Q_MUST_Q_BITS               1
>  #define _Q_MUST_Q_MASK               _Q_SET_MASK(MUST_Q)
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 5cfd69931e31..c18133c01450 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -5,6 +5,7 @@
>  #include <linux/percpu.h>
>  #include <linux/smp.h>
>  #include <linux/topology.h>
> +#include <linux/sched/clock.h>
>  #include <asm/qspinlock.h>
>  #include <asm/paravirt.h>
>  
> @@ -36,24 +37,54 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_spin_on_preempted_owner __read_mostly = false;
> +static bool pv_sleepy_lock __read_mostly = true;
> +static bool pv_sleepy_lock_sticky __read_mostly = false;

The sticky part could be split out into its own patch.

> +static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
> +static int pv_sleepy_lock_factor __read_mostly = 256;
>  static bool pv_yield_prev __read_mostly = true;
>  static bool pv_yield_propagate_owner __read_mostly = true;
>  static bool pv_prod_head __read_mostly = false;
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
> +static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
>  
> -static __always_inline int get_steal_spins(bool paravirt, bool remote)
> +static __always_inline bool recently_sleepy(void)
> +{

The other users of pv_sleepy_lock_interval_ns check pv_sleepy_lock first;
should recently_sleepy() do the same?
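
Something like this maybe (untested sketch):

        static __always_inline bool recently_sleepy(void)
        {
                /* pv_sleepy_lock_interval_ns means nothing without pv_sleepy_lock */
                if (pv_sleepy_lock && pv_sleepy_lock_interval_ns) {
                        u64 seen = this_cpu_read(sleepy_lock_seen_clock);

                        if (seen) {
                                u64 delta = sched_clock() - seen;

                                if (delta < pv_sleepy_lock_interval_ns)
                                        return true;
                                this_cpu_write(sleepy_lock_seen_clock, 0);
                        }
                }

                return false;
        }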

> +     if (pv_sleepy_lock_interval_ns) {
> +             u64 seen = this_cpu_read(sleepy_lock_seen_clock);
> +
> +             if (seen) {
> +                     u64 delta = sched_clock() - seen;
> +                     if (delta < pv_sleepy_lock_interval_ns)
> +                             return true;
> +                     this_cpu_write(sleepy_lock_seen_clock, 0);
> +             }
> +     }
> +
> +     return false;
> +}
> +
> +static __always_inline int get_steal_spins(bool paravirt, bool remote, bool sleepy)

It seems like paravirt is implied by sleepy.
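
If so, checking sleepy alone would do; e.g. get_head_spins() below could
become something like (sketch):

        static __always_inline int get_head_spins(bool paravirt, bool sleepy)
        {
                /* sleepy is only ever set on the paravirt path */
                if (sleepy)
                        return HEAD_SPINS * pv_sleepy_lock_factor;
                return HEAD_SPINS;
        }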

>  {
>       if (remote) {
> -             return REMOTE_STEAL_SPINS;
> +             if (paravirt && sleepy)
> +                     return REMOTE_STEAL_SPINS * pv_sleepy_lock_factor;
> +             else
> +                     return REMOTE_STEAL_SPINS;
>       } else {
> -             return STEAL_SPINS;
> +             if (paravirt && sleepy)
> +                     return STEAL_SPINS * pv_sleepy_lock_factor;
> +             else
> +                     return STEAL_SPINS;
>       }
>  }

I think separate functions would still be nicer, but this could at least get
rid of the nested conditionals, e.g.

        int spins;
        if (remote)
                spins = REMOTE_STEAL_SPINS;
        else
                spins = STEAL_SPINS;

        if (sleepy)
                return spins * pv_sleepy_lock_factor;
        return spins;

>  
> -static __always_inline int get_head_spins(bool paravirt)
> +static __always_inline int get_head_spins(bool paravirt, bool sleepy)
>  {
> -     return HEAD_SPINS;
> +     if (paravirt && sleepy)
> +             return HEAD_SPINS * pv_sleepy_lock_factor;
> +     else
> +             return HEAD_SPINS;
>  }
>  
>  static inline u32 encode_tail_cpu(void)
> @@ -206,6 +237,60 @@ static __always_inline u32 lock_clear_mustq(struct qspinlock *lock)
>       return prev;
>  }
>  
> +static __always_inline bool lock_try_set_sleepy(struct qspinlock *lock, u32 old)
> +{
> +     u32 prev;
> +     u32 new = old | _Q_SLEEPY_VAL;
> +
> +     BUG_ON(!(old & _Q_LOCKED_VAL));
> +     BUG_ON(old & _Q_SLEEPY_VAL);
> +
> +     asm volatile(
> +"1:  lwarx   %0,0,%1         # lock_try_set_sleepy                   \n"
> +"    cmpw    0,%0,%2                                                 \n"
> +"    bne-    2f                                                      \n"
> +"    stwcx.  %3,0,%1                                                 \n"
> +"    bne-    1b                                                      \n"
> +"2:                                                                  \n"
> +     : "=&r" (prev)
> +     : "r" (&lock->val), "r"(old), "r" (new)
> +     : "cr0", "memory");
> +
> +     if (prev == old)
> +             return true;
> +     return false;
> +}
> +
> +static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val)
> +{
> +     if (pv_sleepy_lock) {
> +             if (pv_sleepy_lock_interval_ns)
> +                     this_cpu_write(sleepy_lock_seen_clock, sched_clock());
> +             if (!(val & _Q_SLEEPY_VAL))
> +                     lock_try_set_sleepy(lock, val);
> +     }
> +}
> +
> +static __always_inline void seen_sleepy_lock(void)
> +{
> +     if (pv_sleepy_lock && pv_sleepy_lock_interval_ns)
> +             this_cpu_write(sleepy_lock_seen_clock, sched_clock());
> +}
> +
> +static __always_inline void seen_sleepy_node(struct qspinlock *lock)
> +{

If yield_to_prev() was made to take a raw val, that val could be passed to
seen_sleepy_node(), which then would not need to read lock->val itself.
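
Roughly (sketch, assuming the caller has a suitable val on hand):

        static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val)
        {
                if (pv_sleepy_lock) {
                        if (pv_sleepy_lock_interval_ns)
                                this_cpu_write(sleepy_lock_seen_clock, sched_clock());
                        /* can't set the sleepy bit while the lock is not held */
                        if (val & _Q_LOCKED_VAL) {
                                if (!(val & _Q_SLEEPY_VAL))
                                        lock_try_set_sleepy(lock, val);
                        }
                }
        }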

> +     if (pv_sleepy_lock) {
> +             u32 val = READ_ONCE(lock->val);
> +
> +             if (pv_sleepy_lock_interval_ns)
> +                     this_cpu_write(sleepy_lock_seen_clock, sched_clock());
> +             if (val & _Q_LOCKED_VAL) {
> +                     if (!(val & _Q_SLEEPY_VAL))
> +                             lock_try_set_sleepy(lock, val);
> +             }
> +     }
> +}
> +
>  static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
>  {
>       int cpu = get_tail_cpu(val);
> @@ -244,6 +329,7 @@ static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32
>  
>       spin_end();
>  
> +     seen_sleepy_owner(lock, val);
>       *preempted = true;
>  
>       /*
> @@ -307,11 +393,13 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int
>       }
>  }
>  
> -static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
> +static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt, bool *preempted)
>  {
>       u32 yield_count;
>       int yield_cpu;
>  
> +     *preempted = false;
> +
>       if (!paravirt)
>               goto relax;
>  
> @@ -332,6 +420,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
>  
>       spin_end();
>  
> +     *preempted = true;
> +     seen_sleepy_node(lock);
> +
>       smp_rmb();
>  
>       if (yield_cpu == node->yield_cpu) {
> @@ -353,6 +444,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
>  
>       spin_end();
>  
> +     *preempted = true;
> +     seen_sleepy_node(lock);
> +
>       smp_rmb(); /* See yield_to_locked_owner comment */
>  
>       if (!node->locked) {
> @@ -369,6 +463,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
>  
>  static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt)
>  {
> +     bool preempted;
> +     bool seen_preempted = false;
> +     bool sleepy = false;
>       int iters = 0;
>  
>       if (!STEAL_SPINS) {
> @@ -376,7 +473,6 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
>                       spin_begin();
>                       for (;;) {
>                               u32 val = READ_ONCE(lock->val);
> -                             bool preempted;
>  
>                               if (val & _Q_MUST_Q_VAL)
>                                       break;
> @@ -395,7 +491,6 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
>       spin_begin();
>       for (;;) {
>               u32 val = READ_ONCE(lock->val);
> -             bool preempted;
>  
>               if (val & _Q_MUST_Q_VAL)
>                       break;
> @@ -408,9 +503,29 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
>                       continue;
>               }
>  
> +             if (paravirt && pv_sleepy_lock && !sleepy) {
> +                     if (!sleepy) {

The enclosing conditional already checks !sleepy, so this inner test is always
true. I think the outer conditional should be

        if (paravirt && pv_sleepy_lock)

otherwise the pv_sleepy_lock_sticky part wouldn't work properly once sleepy is
set.
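
i.e. roughly, matching the equivalent block in queued_spin_lock_mcs_queue():

        if (paravirt && pv_sleepy_lock) {
                if (!sleepy) {
                        if (val & _Q_SLEEPY_VAL) {
                                seen_sleepy_lock();
                                sleepy = true;
                        } else if (recently_sleepy()) {
                                sleepy = true;
                        }
                }
                if (pv_sleepy_lock_sticky && seen_preempted &&
                                !(val & _Q_SLEEPY_VAL)) {
                        if (lock_try_set_sleepy(lock, val))
                                val |= _Q_SLEEPY_VAL;
                }
        }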


> +                             if (val & _Q_SLEEPY_VAL) {
> +                                     seen_sleepy_lock();
> +                                     sleepy = true;
> +                             } else if (recently_sleepy()) {
> +                                     sleepy = true;
> +                             }
> +                     }
> +
> +                     if (pv_sleepy_lock_sticky && seen_preempted &&
> +                                     !(val & _Q_SLEEPY_VAL)) {
> +                             if (lock_try_set_sleepy(lock, val))
> +                                     val |= _Q_SLEEPY_VAL;
> +                     }
> +             }
> +
>               yield_to_locked_owner(lock, val, paravirt, &preempted);
> +             if (preempted)
> +                     seen_preempted = true;

This could move into the next if statement; preempted can never be true when
!paravirt, can it?
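
i.e. something like:

        yield_to_locked_owner(lock, val, paravirt, &preempted);

        if (paravirt && preempted) {
                seen_preempted = true;
                sleepy = true;

                if (!pv_spin_on_preempted_owner)
                        iters++;
        } else {
                iters++;
        }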

>  
>               if (paravirt && preempted) {
> +                     sleepy = true;
> +
>                       if (!pv_spin_on_preempted_owner)
>                               iters++;
>                       /*
> @@ -425,14 +540,15 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
>                       iters++;
>               }
>  
> -             if (iters >= get_steal_spins(paravirt, false))
> +             if (iters >= get_steal_spins(paravirt, false, sleepy))
>                       break;
> -             if (iters >= get_steal_spins(paravirt, true)) {
> +             if (iters >= get_steal_spins(paravirt, true, sleepy)) {
>                       int cpu = get_owner_cpu(val);
>                       if (numa_node_id() != cpu_to_node(cpu))
>                               break;
>               }
>       }
> +
>       spin_end();
>  
>       return false;
> @@ -443,6 +559,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>       struct qnodes *qnodesp;
>       struct qnode *next, *node;
>       u32 val, old, tail;
> +     bool seen_preempted = false;
>       int idx;
>  
>       BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
> @@ -485,8 +602,13 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  
>               /* Wait for mcs node lock to be released */
>               spin_begin();
> -             while (!node->locked)
> -                     yield_to_prev(lock, node, prev_cpu, paravirt);
> +             while (!node->locked) {
> +                     bool preempted;
> +
> +                     yield_to_prev(lock, node, prev_cpu, paravirt, &preempted);
> +                     if (preempted)
> +                             seen_preempted = true;
> +             }
>               spin_end();
>  
>               /* Clear out stale propagated yield_cpu */
> @@ -506,6 +628,8 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  
>                       propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>                       yield_head_to_locked_owner(lock, val, paravirt, false, &preempted);
> +                     if (preempted)
> +                             seen_preempted = true;
>               }
>               spin_end();
>  
> @@ -521,27 +645,47 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>       } else {
>               int set_yield_cpu = -1;
>               int iters = 0;
> +             bool sleepy = false;
>               bool set_mustq = false;
> +             bool preempted;
>  
>  again:
>               /* We're at the head of the waitqueue, wait for the lock. */
>               spin_begin();
>               while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> -                     bool preempted;
> +                     if (paravirt && pv_sleepy_lock) {
> +                             if (!sleepy) {
> +                                     if (val & _Q_SLEEPY_VAL) {
> +                                             seen_sleepy_lock();
> +                                             sleepy = true;
> +                                     } else if (recently_sleepy()) {
> +                                             sleepy = true;
> +                                     }
> +                             }
> +                             if (pv_sleepy_lock_sticky && seen_preempted &&
> +                                             !(val & _Q_SLEEPY_VAL)) {
> +                                     if (lock_try_set_sleepy(lock, val))
> +                                             val |= _Q_SLEEPY_VAL;
> +                             }
> +                     }
>  
>                       propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>                       yield_head_to_locked_owner(lock, val, paravirt,
>                                       pv_yield_allow_steal && set_mustq,
>                                       &preempted);
> +                     if (preempted)
> +                             seen_preempted = true;
>  
>                       if (paravirt && preempted) {
> +                             sleepy = true;
> +
>                               if (!pv_spin_on_preempted_owner)
>                                       iters++;
>                       } else {
>                               iters++;
>                       }
>  
> -                     if (!set_mustq && iters >= get_head_spins(paravirt)) {
> +                     if (!set_mustq && iters >= get_head_spins(paravirt, sleepy)) {
>                               set_mustq = true;
>                               lock_set_mustq(lock);
>                               val |= _Q_MUST_Q_VAL;
> @@ -729,6 +873,70 @@ static int pv_spin_on_preempted_owner_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n");
>  
> +static int pv_sleepy_lock_set(void *data, u64 val)
> +{
> +     pv_sleepy_lock = !!val;
> +
> +     return 0;
> +}
> +
> +static int pv_sleepy_lock_get(void *data, u64 *val)
> +{
> +     *val = pv_sleepy_lock;
> +
> +     return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n");
> +
> +static int pv_sleepy_lock_sticky_set(void *data, u64 val)
> +{
> +     pv_sleepy_lock_sticky = !!val;
> +
> +     return 0;
> +}
> +
> +static int pv_sleepy_lock_sticky_get(void *data, u64 *val)
> +{
> +     *val = pv_sleepy_lock_sticky;
> +
> +     return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n");
> +
> +static int pv_sleepy_lock_interval_ns_set(void *data, u64 val)
> +{
> +     pv_sleepy_lock_interval_ns = val;
> +
> +     return 0;
> +}
> +
> +static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val)
> +{
> +     *val = pv_sleepy_lock_interval_ns;
> +
> +     return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n");
> +
> +static int pv_sleepy_lock_factor_set(void *data, u64 val)
> +{
> +     pv_sleepy_lock_factor = val;
> +
> +     return 0;
> +}
> +
> +static int pv_sleepy_lock_factor_get(void *data, u64 *val)
> +{
> +     *val = pv_sleepy_lock_factor;
> +
> +     return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n");
> +
>  static int pv_yield_prev_set(void *data, u64 val)
>  {
>       pv_yield_prev = !!val;
> @@ -786,6 +994,10 @@ static __init int spinlock_debugfs_init(void)
>               debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
>               debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
>               debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner);
> +             debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock);
> +             debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky);
> +             debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns);
> +             debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor);
>               debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
>               debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
>               debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head);
