On 28/07/2022 08:31:14, Nicholas Piggin wrote:
> Having all CPUs poll the lock word for the owner CPU that should be
> yielded to defeats most of the purpose of using MCS queueing for
> scalability. Yet it may be desirable for queued waiters to yield
> to a preempted owner.
> 
> s390 addresses this problem by having queued waiters sample the lock
> word to find the owner much less frequently. In this approach, the
> waiters never sample it directly, but the queue head propagates the
> owner CPU back to the next waiter if it ever finds the owner has
> been preempted. Queued waiters then subsequently propagate the owner
> CPU back to the next waiter, and so on.
> 
> Disable this option by default for now, i.e., no logical change.
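
Just to check my understanding of the mechanism (my own simplified
summary, not the patch code; it skips the set_yield_cpu caching and
the owner running/preempted bookkeeping):

	/* Queue head: the only waiter that ever reads the lock word. */
	owner = get_owner_cpu(READ_ONCE(lock->val));
	if (node->next && vcpu_is_preempted(owner))
		node->next->yield_cpu = owner;	/* hand the owner down */

	/* Waiters behind the head spin on their own qnode only. */
	yield_cpu = READ_ONCE(node->yield_cpu);
	if (yield_cpu != -1) {
		if (node->next)
			node->next->yield_cpu = yield_cpu; /* pass it on */
		yield_to_preempted(yield_cpu, yield_count_of(yield_cpu));
	}
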
> ---
>  arch/powerpc/lib/qspinlock.c | 85 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 94f007f66942..28c85a2d5635 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -12,6 +12,7 @@
>  struct qnode {
>       struct qnode    *next;
>       struct qspinlock *lock;
> +     int             yield_cpu;
>       u8              locked; /* 1 if lock acquired */
>  };
>  
> @@ -28,6 +29,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
> +static bool pv_yield_propagate_owner __read_mostly = true;
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -257,13 +259,66 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
>       __yield_to_locked_owner(lock, val, paravirt, clear_mustq);
>  }
>  
> +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
> +{
> +     struct qnode *next;
> +     int owner;
> +
> +     if (!paravirt)
> +             return;
> +     if (!pv_yield_propagate_owner)
> +             return;
> +
> +     owner = get_owner_cpu(val);
> +     if (*set_yield_cpu == owner)
> +             return;
> +
> +     next = READ_ONCE(node->next);
> +     if (!next)
> +             return;
> +
> +     if (vcpu_is_preempted(owner)) {
> +             next->yield_cpu = owner;
> +             *set_yield_cpu = owner;
> +     } else if (*set_yield_cpu != -1) {
> +             next->yield_cpu = owner;
> +             *set_yield_cpu = owner;
> +     }

This is a bit confusing: the body of the else branch is the same as that
of the if branch. This might be written like this:

        if (vcpu_is_preempted(owner) || *set_yield_cpu != -1) {
                next->yield_cpu = owner;
                *set_yield_cpu = owner;
        }

> +}
> +
>  static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
>  {
>       u32 yield_count;
> +     int yield_cpu;
>  
>       if (!paravirt)
>               goto relax;
>  
> +     if (!pv_yield_propagate_owner)
> +             goto yield_prev;
> +
> +     yield_cpu = READ_ONCE(node->yield_cpu);
> +     if (yield_cpu == -1) {
> +             /* Propagate back the -1 CPU */
> +             if (node->next && node->next->yield_cpu != -1)
> +                     node->next->yield_cpu = yield_cpu;
> +             goto yield_prev;
> +     }
> +
> +     yield_count = yield_count_of(yield_cpu);
> +     if ((yield_count & 1) == 0)
> +             goto yield_prev; /* owner vcpu is running */
> +
> +     smp_rmb();
> +
> +     if (yield_cpu == node->yield_cpu) {
> +             if (node->next && node->next->yield_cpu != yield_cpu)
> +                     node->next->yield_cpu = yield_cpu;
> +             yield_to_preempted(yield_cpu, yield_count);
> +             return;
> +     }
> +

If that test is false, the lock owner has probably changed. Why do we then
fall back to yielding to the previous node instead of re-reading
node->yield_cpu, checking it against -1 again, and so on? Yielding to the
previous node is valid, but wouldn't it be better to yield to the owner?
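
Something along these lines maybe (untested sketch of what I mean; the
'again' label is mine and would sit just above the READ_ONCE of
node->yield_cpu):

	smp_rmb();

	if (yield_cpu != node->yield_cpu)
		goto again;	/* propagated owner changed, re-read it */

	if (node->next && node->next->yield_cpu != yield_cpu)
		node->next->yield_cpu = yield_cpu;
	yield_to_preempted(yield_cpu, yield_count);
	return;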

> +yield_prev:
>       if (!pv_yield_prev)
>               goto relax;
>  
> @@ -337,6 +392,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>       node = &qnodesp->nodes[idx];
>       node->next = NULL;
>       node->lock = lock;
> +     node->yield_cpu = -1;
>       node->locked = 0;
>  
>       tail = encode_tail_cpu();
> @@ -358,13 +414,21 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>               while (!node->locked)
>                       yield_to_prev(lock, node, prev_cpu, paravirt);
>  
> +             /* Clear out stale propagated yield_cpu */
> +             if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
> +                     node->yield_cpu = -1;

Why do all these tests instead of directly setting node->yield_cpu to -1?
Is the write more costly than the three tests?
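
i.e. just (rough sketch; I haven't measured whether skipping the store
actually matters):

	/* Clear out stale propagated yield_cpu */
	node->yield_cpu = -1;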

> +
>               smp_rmb(); /* acquire barrier for the mcs lock */
>       }
>  
>       if (!MAYBE_STEALERS) {
> +             int set_yield_cpu = -1;
> +
>               /* We're at the head of the waitqueue, wait for the lock. */
> -             while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> +             while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +                     propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>                       yield_head_to_locked_owner(lock, val, paravirt, false);
> +             }
>  
>               /* If we're the last queued, must clean up the tail. */
>               if ((val & _Q_TAIL_CPU_MASK) == tail) {
> @@ -376,12 +440,14 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>               /* We must be the owner, just set the lock bit and acquire */
>               lock_set_locked(lock);
>       } else {
> +             int set_yield_cpu = -1;
>               int iters = 0;
>               bool set_mustq = false;
>  
>  again:
>               /* We're at the head of the waitqueue, wait for the lock. */
>               while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> +                     propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
>                       yield_head_to_locked_owner(lock, val, paravirt,
>                                       pv_yield_allow_steal && set_mustq);
>  
> @@ -540,6 +606,22 @@ static int pv_yield_prev_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
>  
> +static int pv_yield_propagate_owner_set(void *data, u64 val)
> +{
> +     pv_yield_propagate_owner = !!val;
> +
> +     return 0;
> +}
> +
> +static int pv_yield_propagate_owner_get(void *data, u64 *val)
> +{
> +     *val = pv_yield_propagate_owner;
> +
> +     return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
> +
>  static __init int spinlock_debugfs_init(void)
>  {
>       debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
> &fops_steal_spins);
> @@ -548,6 +630,7 @@ static __init int spinlock_debugfs_init(void)
>               debugfs_create_file("qspl_pv_yield_owner", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_owner);
>               debugfs_create_file("qspl_pv_yield_allow_steal", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
>               debugfs_create_file("qspl_pv_yield_prev", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_prev);
> +             debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
>       }
>  
>       return 0;
