Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-04-01 Thread Peter Zijlstra
On Fri, Apr 01, 2016 at 05:47:43PM +0100, Will Deacon wrote:
> > +#define smp_cond_load_acquire(ptr, cond_expr)  ({  \
> > +   typeof(ptr) __PTR = (ptr);  \
> > +   typeof(*ptr) VAL;   \
> 
> It's a bit grim having a magic variable name, but I have no better
> suggestion.

Right; we had this discussion and this is the best we could come up
with.

lkml.kernel.org/r/ca+55afzza9eb3hfptspdmemoifem5bwqgow+ib7slvymtet...@mail.gmail.com

> > +   for (;;) {  \
> > +   VAL = READ_ONCE(*__PTR);\
> > +   if (cond_expr)  \
> > +   break;  \
> > +   cpu_relax();\
> > +   }   \
> > +   smp_rmb(); /* ctrl + rmb := acquire */  \
> > +   VAL;\
> > +})
> 
> Can you stick some #ifndef guards around this, please?

Oh sure; I'll even compile and boot it when I find a moment ;-)
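
A sketch of how the generic definition could look once it carries the #ifndef
guard requested above, so an architecture can provide its own
smp_cond_load_acquire (say, an ldxr/wfe-based spin) before falling back to
this one. This only illustrates the guard; it is not the final patch:

/*
 * Sketch only: the generic fallback from the patch above, wrapped in an
 * #ifndef so an architecture-specific definition seen earlier wins.
 */
#ifndef smp_cond_load_acquire
#define smp_cond_load_acquire(ptr, cond_expr) ({		\
	typeof(ptr) __PTR = (ptr);				\
	typeof(*ptr) VAL;					\
	for (;;) {						\
		VAL = READ_ONCE(*__PTR);			\
		if (cond_expr)					\
			break;					\
		cpu_relax();					\
	}							\
	smp_rmb(); /* ctrl + rmb := acquire */			\
	VAL;							\
})
#endif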



Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-04-01 Thread Will Deacon
On Fri, Apr 01, 2016 at 01:43:03PM +0200, Peter Zijlstra wrote:
> > Ah, yes, I forgot about that. Lemme go find those discussions and see
> > what I can do there.
> 
> Completely untested..
> 
> ---
> include/linux/compiler.h   | 20 ++--
> kernel/locking/qspinlock.c | 12 ++--
> kernel/sched/core.c        |  9 +
> kernel/sched/sched.h       |  2 +-
> kernel/smp.c               |  2 +-
> 5 files changed, 27 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/compiler.h b/include/linux/compiler.h
> index b5ff9881bef8..c64f5897664f 100644
> --- a/include/linux/compiler.h
> +++ b/include/linux/compiler.h
> @@ -305,7 +305,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
> })
> 
> /**
> - * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
> + * smp_cond_load_acquire() - Spin wait for cond with ACQUIRE ordering
> + * @ptr:  pointer to the variable wait on
>  * @cond: boolean expression to wait for
>  *
>  * Equivalent to using smp_load_acquire() on the condition variable but employs
> @@ -315,11 +316,18 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
>  * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
>  * aka. ACQUIRE.
>  */
> -#define smp_cond_acquire(cond)   do {\
> - while (!(cond)) \
> - cpu_relax();\
> - smp_rmb(); /* ctrl + rmb := acquire */  \
> -} while (0)
> +#define smp_cond_load_acquire(ptr, cond_expr)({  \
> + typeof(ptr) __PTR = (ptr);  \
> + typeof(*ptr) VAL;   \

It's a bit grim having a magic variable name, but I have no better
suggestion.

> + for (;;) {  \
> + VAL = READ_ONCE(*__PTR);\
> + if (cond_expr)  \
> + break;  \
> + cpu_relax();\
> + }   \
> + smp_rmb(); /* ctrl + rmb := acquire */  \
> + VAL;\
> +})

Can you stick some #ifndef guards around this, please? That way I can do
my ldxr/wfe-based version for ARM that makes the spinning tolerable. Also,
wouldn't this be better suited to barrier.h?

Otherwise, I really like this idea. Thanks!

Will
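
For illustration, a minimal usage sketch of the proposed macro, assuming the
kernel's READ_ONCE()/cpu_relax() environment; the msg structure and
msg_consume() below are made up and not part of the thread:

/* Hypothetical consumer: spin until the producer publishes ->ready, then
 * read the payload it wrote beforehand. */
struct msg {
	int payload;		/* written by the producer first */
	int ready;		/* then set to 1 with a releasing store */
};

static int msg_consume(struct msg *m)
{
	/* VAL is the name the macro binds to each freshly loaded value of m->ready. */
	smp_cond_load_acquire(&m->ready, VAL == 1);

	/* ctrl + rmb := acquire, so this load cannot move above the spin. */
	return m->payload;
}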


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-04-01 Thread Peter Zijlstra
> Ah, yes, I forgot about that. Lemme go find those discussions and see
> what I can do there.

Completely untested..

---
include/linux/compiler.h   | 20 ++--
kernel/locking/qspinlock.c | 12 ++--
kernel/sched/core.c        |  9 +
kernel/sched/sched.h       |  2 +-
kernel/smp.c               |  2 +-
5 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b5ff9881bef8..c64f5897664f 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,7 +305,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
})

/**
- * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * smp_cond_load_acquire() - Spin wait for cond with ACQUIRE ordering
+ * @ptr:  pointer to the variable wait on
 * @cond: boolean expression to wait for
 *
 * Equivalent to using smp_load_acquire() on the condition variable but employs
@@ -315,11 +316,18 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
 * aka. ACQUIRE.
 */
-#define smp_cond_acquire(cond) do {\
-   while (!(cond)) \
-   cpu_relax();\
-   smp_rmb(); /* ctrl + rmb := acquire */  \
-} while (0)
+#define smp_cond_load_acquire(ptr, cond_expr)  ({  \
+   typeof(ptr) __PTR = (ptr);  \
+   typeof(*ptr) VAL;   \
+   for (;;) {  \
+   VAL = READ_ONCE(*__PTR);\
+   if (cond_expr)  \
+   break;  \
+   cpu_relax();\
+   }   \
+   smp_rmb(); /* ctrl + rmb := acquire */  \
+   VAL;\
+})

#endif /* __KERNEL__ */

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..e98e5bf679e9 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -358,7 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
-   smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+   smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));

   /*
* take ownership and clear the pending bit.
@@ -434,7 +434,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
-* smp_cond_acquire() call. As the next PV queue head hasn't been
+* smp_cond_load_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +445,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
   if ((val = pv_wait_head_or_lock(lock, node)))
   goto locked;

-   smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+   val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));

locked:
   /*
@@ -465,9 +465,9 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
   break;
   }
   /*
-* The smp_cond_acquire() call above has provided the necessary
-* acquire semantics required for locking. At most two
-* iterations of this loop may be ran.
+* The smp_cond_load_acquire() call above has provided the
+* necessary acquire semantics required for locking. At most
+* two iterations of this loop may be ran.
*/
   old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
   if (old == val)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11594230ef4d..90426a5ff016 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1843,7 +1843,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 * chain to provide order. Instead we do:
 *
 *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_acquire(!X->on_cpu)
+ *   2) smp_cond_load_acquire(&X->on_cpu, !VAL)
 *
 * Example:
 *
@@ -1854,7 +1854,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 *   sched-out X
 *   smp_store_release(X->on_cpu, 0);
 *
- *smp_cond_acquire(!X->on_cpu);
+ *   smp_cond_load_acquire(&X->on_cpu, !VAL);

Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-04-01 Thread Peter Zijlstra
On Fri, Apr 01, 2016 at 11:41:19AM +0100, Will Deacon wrote:
> On Fri, Apr 01, 2016 at 12:31:43PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> > > >>However, if we allow a limited number of readers to spin on the
> > > >>lock simultaneously, we can eliminate some of the reader-to-reader
> > > >>latencies at the expense of a bit more cacheline contention and
> > > >>probably more power consumption.
> > > >So the embedded people might not like that much.
> > > 
> > > It could be. It is always a compromise.
> > 
> > So ARM is the only one that currently waits without spinning and could
> > care; so Will might have an opinion. One 'solution' would be to make
> > this an optional feature.
> 
> Well, perhaps we could build this using the cmp-and-wait structure we spoke
> about a couple of months back. What happened with that? Is there something I
> need to go implement for ARM/arm64?

Ah, yes, I forgot about that. Lemme go find those discussions and see
what I can do there.


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-04-01 Thread Will Deacon
On Fri, Apr 01, 2016 at 12:31:43PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> > >>However, if we allow a limited number of readers to spin on the
> > >>lock simultaneously, we can eliminate some of the reader-to-reader
> > >>latencies at the expense of a bit more cacheline contention and
> > >>probably more power consumption.
> > >So the embedded people might not like that much.
> > 
> > It could be. It is always a compromise.
> 
> So ARM is the only one that currently waits without spinning and could
> care; so Will might have an opinion. One 'solution' would be to make
> this an optional feature.

Well, perhaps we could build this using the cmp-and-wait structure we spoke
about a couple of months back. What happened with that? Is there something I
need to go implement for ARM/arm64?

Will


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-04-01 Thread Peter Zijlstra
On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> >>However, if we allow a limited number of readers to spin on the
> >>lock simultaneously, we can eliminate some of the reader-to-reader
> >>latencies at the expense of a bit more cacheline contention and
> >>probably more power consumption.
> >So the embedded people might not like that much.
> 
> It could be. It is always a compromise.

So ARM is the only one that currently waits without spinning and could
care; so Will might have an opinion. One 'solution' would be to make
this an optional feature.
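
Purely as a sketch of the "optional feature" idea, one could imagine gating
the extra spinners behind a config switch; the CONFIG_QRWLOCK_MULTI_READER_SPIN
symbol below is made up and does not exist:

/*
 * Hypothetical sketch only: make the wider spinning opt-in so embedded
 * configurations can keep a single spinning reader at the queue head.
 */
#ifdef CONFIG_QRWLOCK_MULTI_READER_SPIN
#define MAX_SPINNING_READERS	4	/* allow a small group of spinners */
#else
#define MAX_SPINNING_READERS	1	/* current behaviour: head only */
#endif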


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-04-01 Thread Peter Zijlstra
On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> On 03/29/2016 04:20 PM, Peter Zijlstra wrote:
> >>cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
> >>+   while ((cnts & _QW_WMASK) == _QW_LOCKED) {
> >>+   if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
> >>+   /*
> >>+* Unlock the wait queue so that more readers can
> >>+* come forward and waiting for the writer to exit
> >>+* as long as no more than MAX_SPINNING_READERS
> >>+* readers are present.
> >>+*/
> >>+   arch_spin_unlock(&lock->wait_lock);
> >>+   locked = false;
> >Only 1 more can come forward with this logic. How can you ever get to 4?
> 
> Yes, each reader in the unlock path will release one in the queue. If the
> next one is also a reader, it will release one more and so on until the
> reader count reaches 4 where the process will stop.
> 
> >
> >Also, what says the next in queue is a reader?
> 
> I did say in the changelog that the queue head could be a writer. In that
> case, the process will stop and the writer will wait until all the readers
> are gone.

Clearly I could not read nor think straight when I looked at this patch.
You're right.

I'll go have another look ..


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-03-31 Thread Waiman Long

On 03/29/2016 04:20 PM, Peter Zijlstra wrote:

On Sat, Mar 19, 2016 at 11:21:19PM -0400, Waiman Long wrote:

In qrwlock, the reader that is spinning on the lock will need to notify
the next reader in the queue when the lock is free. That introduces a
reader-to-reader latency that is not present in the original rwlock.

How did you find this 'problem'?


I am constantly on the lookout for twists that can make the code run 
faster. That change turned out to be good for reader performance and so I 
sent it out to solicit feedback.



That is the price for reducing lock cacheline contention. It also
reduces the performance benefit of qrwlock on reader heavy workloads.

However, if we allow a limited number of readers to spin on the
lock simultaneously, we can eliminate some of the reader-to-reader
latencies at the expense of a bit more cacheline contention and
probably more power consumption.

So the embedded people might not like that much.


It could be. It is always a compromise.


This patch changes the reader slowpath to allow multiple readers to
spin on the lock. The maximum number of concurrent readers allowed
is currently set to 4 to limit the amount of additional cacheline
contention while improving reader performance on most workloads. If
a writer comes to the queue head, however, it will stop additional
readers from coming out.

Using a multi-threaded locking microbenchmark on a 4-socket 40-core
Haswell-EX system, the locking throughput of 4.5-rc6 kernel with or
without the patch were as follows:

Do you have an actual real world benchmark where this makes a
difference?


Not yet. Will look out for some real world workload.


  /**
   * queued_read_lock_slowpath - acquire read lock of a queue rwlock
   * @lock: Pointer to queue rwlock structure
   * @cnts: Current qrwlock lock value
   */
  void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
  {
+   bool locked = true;
+
/*
 * Readers come here when they cannot get the lock without waiting
 */
@@ -78,7 +71,10 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 * semantics) until the lock is available without waiting in
 * the queue.
 */
+   while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+   cpu_relax_lowlatency();
+   cnts = atomic_read_acquire(&lock->cnts);
+   }
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
@@ -92,14 +88,31 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 * The ACQUIRE semantics of the following spinning code ensure
 * that accesses can't leak upwards out of our subsequent critical
 * section in the case that the lock is currently held for write.
+*
+* The reader increments the reader count & wait until the writer
+* releases the lock.
 */
cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+   while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+   if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
+   /*
+* Unlock the wait queue so that more readers can
+* come forward and waiting for the writer to exit
+* as long as no more than MAX_SPINNING_READERS
+* readers are present.
+*/
+   arch_spin_unlock(&lock->wait_lock);
+   locked = false;

Only 1 more can come forward with this logic. How can you ever get to 4?


Yes, each reader in the unlock path will release one in the queue. If 
the next one is also a reader, it will release one more and so on until 
the reader count reaches 4 where the process will stop.




Also, what says the next in queue is a reader?


I did say in the changelog that the queue head could be a writer. In 
that case, the process will stop and the writer will wait until all the 
readers are gone.


Cheers,
Longman
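
A small standalone sketch of the cascade described above, assuming the next
few queued waiters are all readers (plain user-space C, just to make the
counting explicit; it is not part of the patch):

#include <stdio.h>

#define MAX_SPINNING_READERS 4

int main(void)
{
	int spinning = 1;	/* the reader already spinning at the queue head */

	/*
	 * Each reader that starts spinning releases the wait queue once,
	 * letting exactly one more waiter move up; if that waiter is also
	 * a reader it repeats the step, until MAX_SPINNING_READERS readers
	 * spin or a writer reaches the head and the hand-off stops.
	 */
	while (spinning < MAX_SPINNING_READERS) {
		spinning++;
		printf("readers spinning on the lock: %d\n", spinning);
	}
	return 0;
}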


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-03-29 Thread Peter Zijlstra
On Sat, Mar 19, 2016 at 11:21:19PM -0400, Waiman Long wrote:
> In qrwlock, the reader that is spinning on the lock will need to notify
> the next reader in the queue when the lock is free. That introduces a
> reader-to-reader latency that is not present in the original rwlock.

How did you find this 'problem'?

> That is the price for reducing lock cacheline contention. It also
> reduces the performance benefit of qrwlock on reader heavy workloads.
> 
> However, if we allow a limited number of readers to spin on the
> lock simultaneously, we can eliminate some of the reader-to-reader
> latencies at the expense of a bit more cacheline contention and
> probably more power consumption.

So the embedded people might not like that much.

> This patch changes the reader slowpath to allow multiple readers to
> spin on the lock. The maximum number of concurrent readers allowed
> is currently set to 4 to limit the amount of additional cacheline
> contention while improving reader performance on most workloads. If
> a writer comes to the queue head, however, it will stop additional
> readers from coming out.
> 
> Using a multi-threaded locking microbenchmark on a 4-socket 40-core
> Haswell-EX system, the locking throughput of 4.5-rc6 kernel with or
> without the patch were as follows:

Do you have an actual real world benchmark where this makes a
difference?

>  /**
>   * queued_read_lock_slowpath - acquire read lock of a queue rwlock
>   * @lock: Pointer to queue rwlock structure
>   * @cnts: Current qrwlock lock value
>   */
>  void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
>  {
> + bool locked = true;
> +
>   /*
>* Readers come here when they cannot get the lock without waiting
>*/
> @@ -78,7 +71,10 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
>* semantics) until the lock is available without waiting in
>* the queue.
>*/
> + while ((cnts & _QW_WMASK) == _QW_LOCKED) {
> + cpu_relax_lowlatency();
> + cnts = atomic_read_acquire(&lock->cnts);
> + }
>   return;
>   }
>   atomic_sub(_QR_BIAS, &lock->cnts);
> @@ -92,14 +88,31 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
>* The ACQUIRE semantics of the following spinning code ensure
>* that accesses can't leak upwards out of our subsequent critical
>* section in the case that the lock is currently held for write.
> +  *
> +  * The reader increments the reader count & wait until the writer
> +  * releases the lock.
>*/
>   cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
> + while ((cnts & _QW_WMASK) == _QW_LOCKED) {
> + if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
> + /*
> +  * Unlock the wait queue so that more readers can
> +  * come forward and waiting for the writer to exit
> +  * as long as no more than MAX_SPINNING_READERS
> +  * readers are present.
> +  */
> + arch_spin_unlock(&lock->wait_lock);
> + locked = false;

Only 1 more can come forward with this logic. How can you ever get to 4?

Also, what says the next in queue is a reader?

> + }
> + cpu_relax_lowlatency();
> + cnts = atomic_read_acquire(&lock->cnts);
> + }
>  
>   /*
>* Signal the next one in queue to become queue head
>*/
> + if (locked)
> + arch_spin_unlock(&lock->wait_lock);
>  }
>  EXPORT_SYMBOL(queued_read_lock_slowpath);
>  
> -- 
> 1.7.1
> 


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-03-21 Thread Waiman Long

On 03/20/2016 06:43 AM, Peter Zijlstra wrote:

We still have that starvation case in mutex, I would think that is far
more important to fix.


Peter, I am sorry that I let the mutex patch languish for a while. I 
will work on that next.


Cheers,
Longman


Re: [PATCH] locking/qrwlock: Allow multiple spinning readers

2016-03-20 Thread Peter Zijlstra

We still have that starvation case in mutex, I would think that is far
more important to fix.


[PATCH] locking/qrwlock: Allow multiple spinning readers

2016-03-19 Thread Waiman Long
In qrwlock, the reader that is spinning on the lock will need to notify
the next reader in the queue when the lock is free. That introduces a
reader-to-reader latency that is not present in the original rwlock.
That is the price for reducing lock cacheline contention. It also
reduces the performance benefit of qrwlock on reader heavy workloads.

However, if we allow a limited number of readers to spin on the
lock simultaneously, we can eliminate some of the reader-to-reader
latencies at the expense of a bit more cacheline contention and
probably more power consumption.

This patch changes the reader slowpath to allow multiple readers to
spin on the lock. The maximum number of concurrent readers allowed
is currently set to 4 to limit the amount of additional cacheline
contention while improving reader performance on most workloads. If
a writer comes to the queue head, however, it will stop additional
readers from coming out.

Using a multi-threaded locking microbenchmark on a 4-socket 40-core
Haswell-EX system, the locking throughput of 4.5-rc6 kernel with or
without the patch were as follows:

1) All locking threads run in the same socket

   r:w     # of     locking rate   locking rate   % change
   ratio   threads   w/o patch      with patch
   -----   -------   ------------   ------------   --------
    1:1       2       6867 Mop/s     6125 Mop/s     -10.8%
              4       5762 Mop/s     5065 Mop/s     -12.1%
              6       6211 Mop/s     6876 Mop/s     +10.7%
              8       6102 Mop/s     6954 Mop/s     +14.0%
   10:1       2       8808 Mop/s     9446 Mop/s     + 7.2%
              4       6034 Mop/s     6199 Mop/s     + 2.7%
              6       6862 Mop/s     7702 Mop/s     +12.2%
              8       6693 Mop/s     7808 Mop/s     +16.7%
   50:1       2      10530 Mop/s    11028 Mop/s     + 4.7%
              4       9065 Mop/s     9015 Mop/s     - 0.6%
              6       9659 Mop/s    10669 Mop/s     +10.5%
              8       8927 Mop/s    10441 Mop/s     +17.0%

2) 2 locking threads per socket (8 threads total)

   r:w     locking rate   locking rate   % change
   ratio    w/o patch      with patch
   -----   ------------   ------------   --------
   10:1     7027 Mop/s     8012 Mop/s     +14.0%
   50:1     9986 Mop/s    11573 Mop/s     +15.9%

With a moderately contended qrwlock (2/4 threads), the performance
benefit was a bit mixed. The microbenchmark also had more run-to-run
variations at that contention level. At higher contention level,
however, there was more than 10% gain in performance.

Signed-off-by: Waiman Long 
---
 kernel/locking/qrwlock.c |   53 -
 1 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec0823..bec6d5f 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,6 +23,14 @@
 #include 
 
 /*
+ * More than one reader will be allowed to spin on the lock waiting for the
+ * writer to exit. When more readers are allowed, it reduces the reader lock
+ * acquisition latency, but increases the amount of cacheline contention and
+ * probably power consumption.
+ */
+#define MAX_SPINNING_READERS   4
+
+/*
  * This internal data structure is used for optimizing access to some of
  * the subfields within the atomic_t cnts.
  */
@@ -43,29 +51,14 @@ struct __qrwlock {
 };
 
 /**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock  : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
-   while ((cnts & _QW_WMASK) == _QW_LOCKED) {
-   cpu_relax_lowlatency();
-   cnts = atomic_read_acquire(&lock->cnts);
-   }
-}
-
-/**
  * queued_read_lock_slowpath - acquire read lock of a queue rwlock
  * @lock: Pointer to queue rwlock structure
  * @cnts: Current qrwlock lock value
  */
 void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 {
+   bool locked = true;
+
/*
 * Readers come here when they cannot get the lock without waiting
 */
@@ -78,7 +71,10 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 * semantics) until the lock is available without waiting in
 * the queue.
 */
-   rspin_until_writer_unlock(lock, cnts);
+   while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+   cpu_relax_lowlatency();
+   cnts = atomic_read_acquire(&lock->cnts);
+   }
return;
}
atomic_sub(_QR_BIAS, &lock->cn