Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-03 Thread Peter Zijlstra
On Thu, Apr 02, 2015 at 09:48:34PM +0200, Peter Zijlstra wrote:
> @@ -158,20 +257,20 @@ static void pv_wait_head(struct qspinloc
>  void __pv_queue_spin_unlock(struct qspinlock *lock)
>  {
>   struct __qspinlock *l = (void *)lock;
> + struct pv_hash_bucket *hb;
>  
>   if (xchg(&l->locked, 0) != _Q_SLOW_VAL)
>   return;
>  
>   /*
>* At this point the memory pointed at by lock can be freed/reused,
> +  * however we can still use the pointer value to search in our hash
> +  * table.
>*
> +  * Also, if we observe _Q_SLOW_VAL we _must_ now observe 'our' hash
> +  * bucket. See pv_wait_head().
>*/
> + hb = pv_hash_find(lock);
> + pv_kick(hb->cpu);
> + WRITE_ONCE(hb->lock, NULL); /* unhash */
>  }

So I _think_ I found a problem with this approach :/

If, as per the above, the lock does indeed get freed, it can get
re-allocated and stuck in the hash table (again) before we get the
lookup and unhash it.

I'll have to ponder that a bit more.
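
The race, sketched as a possible interleaving (a hypothetical sequence based
on the description above, not code from the patch; it assumes the freed
lock's memory is immediately reused for another qspinlock at the same
address):

	/*
	 * CPU0 (__pv_queue_spin_unlock)     CPU1 (user of the recycled memory)
	 * ------------------------------    ----------------------------------
	 * xchg(&l->locked, 0)
	 *   == _Q_SLOW_VAL                  free old lock; allocate a new lock
	 *                                   at the same address; contend on it
	 *                                   pv_hash_insert(lock)  /* same pointer
	 *                                      value gets hashed again */
	 * hb = pv_hash_find(lock);          /* may return CPU1's fresh bucket */
	 * pv_kick(hb->cpu);                 /* kicks the wrong vcpu */
	 * WRITE_ONCE(hb->lock, NULL);       /* and unhashes CPU1's live entry */
	 */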


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-02 Thread Peter Zijlstra
On Thu, Apr 02, 2015 at 07:20:57PM +0200, Peter Zijlstra wrote:
> pv_wait_head():
> 
>   pv_hash()
>   /* MB as per cmpxchg */
>   cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL);
> 
> VS
> 
> __pv_queue_spin_unlock():
> 
>   if (xchg(&l->locked, 0) != _Q_SLOW_VAL)
>   return;
> 
>   /* MB as per xchg */
>   pv_hash_find(lock);
> 
> 


Something like so.. compile tested only.

I took out the LFSR because that was likely over engineering from my
side :-)

--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -2,6 +2,8 @@
 #error "do not include this file"
 #endif
 
+#include <linux/hash.h>
+
 /*
  * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
  * of spinning them.
@@ -107,7 +109,84 @@ static void pv_kick_node(struct mcs_spin
pv_kick(pn->cpu);
 }
 
-static DEFINE_PER_CPU(struct qspinlock *, __pv_lock_wait);
+/*
+ * Hash table using open addressing with a linear probe sequence.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ * Instead of probing just the immediate bucket we probe all buckets in the
+ * same cacheline.
+ *
+ * http://en.wikipedia.org/wiki/Hash_table#Open_addressing
+ *
+ */
+
+struct pv_hash_bucket {
+   struct qspinlock *lock;
+   int cpu;
+};
+
+/*
+ * XXX dynamic allocate using nr_cpu_ids instead...
+ */
+#define PV_LOCK_HASH_BITS  (2 + NR_CPUS_BITS)
+
+#if PV_LOCK_HASH_BITS < 6
+#undef PV_LOCK_HASH_BITS
+#define PB_LOCK_HASH_BITS  6
+#endif
+
+#define PV_LOCK_HASH_SIZE  (1 << PV_LOCK_HASH_BITS)
+
+static struct pv_hash_bucket __pv_lock_hash[PV_LOCK_HASH_SIZE] cacheline_aligned;
+
+#define PV_HB_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_bucket))
+
+static inline u32 hash_align(u32 hash)
+{
+   return hash & ~(PV_HB_PER_LINE - 1);
+}
+
+#define for_each_hash_bucket(hb, off, hash)				\
+   for (hash = hash_align(hash), off = 0, hb = &__pv_lock_hash[hash + off];\
+	off < PV_LOCK_HASH_SIZE;					\
+	off++, hb = &__pv_lock_hash[(hash + off) % PV_LOCK_HASH_SIZE])
+
+static struct pv_hash_bucket *pv_hash_insert(struct qspinlock *lock)
+{
+   u32 offset, hash = hash_ptr(lock, PV_LOCK_HASH_BITS);
+   struct pv_hash_bucket *hb;
+
+   for_each_hash_bucket(hb, offset, hash) {
+   if (!cmpxchg(&hb->lock, NULL, lock)) {
+   WRITE_ONCE(hb->cpu, smp_processor_id());
+   return hb;
+   }
+   }
+
+   /*
+* Hard assumes there is an empty bucket somewhere.
+*/
+   BUG();
+}
+
+static struct pv_hash_bucket *pv_hash_find(struct qspinlock *lock)
+{
+   u32 offset, hash = hash_ptr(lock, PV_LOCK_HASH_BITS);
+   struct pv_hash_bucket *hb;
+
+   for_each_hash_bucket(hb, offset, hash) {
+   if (READ_ONCE(hb->lock) == lock)
+   return hb;
+   }
+
+   /*
+* Hard assumes we _WILL_ find a match.
+*/
+   BUG();
+}
 
 /*
  * Wait for l->locked to become clear; halt the vcpu after a short spin.
@@ -116,7 +195,9 @@ static DEFINE_PER_CPU(struct qspinlock *
 static void pv_wait_head(struct qspinlock *lock)
 {
struct __qspinlock *l = (void *)lock;
+   struct pv_hash_bucket *hb = NULL;
int loop;
+   u8 o;
 
for (;;) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
@@ -126,29 +207,47 @@ static void pv_wait_head(struct qspinloc
cpu_relax();
}
 
-   this_cpu_write(__pv_lock_wait, lock);
-   /*
-* __pv_lock_wait must be set before setting _Q_SLOW_VAL
-*
-* [S] __pv_lock_wait = lock[RmW] l = l->locked = 0
-* MB MB
-* [S] l->locked = _Q_SLOW_VAL  [L]   __pv_lock_wait
-*
-* Matches the xchg() in pv_queue_spin_unlock().
-*/
-   if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL))
-   goto done;
+   if (!hb) {
+   hb = pv_hash_insert(lock);
+   /*
+* hb  must be set before setting _Q_SLOW_VAL
+*
+* [S]   hb <- lock   [RmW] l = l->locked = 0
+*   MB MB
+* [RmW] l->locked ?= _Q_SLOW_VAL [L]   hb
+*
+* Matches the xchg() in pv_queue_spin_unlock().
+*/
+   o = cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL);
+   if (!o) {
+   /*
+  

Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-02 Thread Peter Zijlstra
On Thu, Apr 02, 2015 at 12:28:30PM -0400, Waiman Long wrote:
> On 04/01/2015 05:03 PM, Peter Zijlstra wrote:
> >On Wed, Apr 01, 2015 at 03:58:58PM -0400, Waiman Long wrote:
> >>On 04/01/2015 02:48 PM, Peter Zijlstra wrote:
> >>I am sorry that I don't quite get what you mean here. My point is that in
> >>the hashing step, a cpu will need to scan an empty bucket to put the lock
> >>in. In the interim, a previously used bucket before the empty one may get
> >>freed. In the lookup step for that lock, the scanning will stop because of
> >>an empty bucket in front of the target one.
> >Right, that's broken. So we need to do something else to limit the
> >lookup, because without that break, a lookup needs to iterate the
> >entire array in order to determine -ENOENT, which is expensive.
> >
> >So my alternative proposal is that IFF we can guarantee that every
> >lookup will succeed -- the entry we're looking for is always there, we
> >don't need the break on empty but can probe until we find the entry.
> >This will be bound in cost to the same number of probes we required for
> >insertion and avoids the full array scan.
> >
> >Now I think we can indeed do this, if as said earlier we do not clear
> >the bucket on insert if the cmpxchg succeeds, in that case the unlock
> >will observe _Q_SLOW_VAL and do the lookup, the lookup will then find
> >the entry. And we then need the unlock to clear the entry.
> >Does that explain this? Or should I try again with code?
> 
> OK, I got your proposal now. However, there is still the issue that setting
> the _Q_SLOW_VAL flag and the hash bucket are not atomic wrt each other.

So? They're strictly ordered, that's sufficient. We first hash the lock,
then we set _Q_SLOW_VAL. There's a full memory barrier between them.

> It
> is possible a CPU has set the _Q_SLOW_VAL flag but not yet filled in the
> hash bucket while another one is trying to look for it.

Nope. The unlock side does an xchg() on the locked value first, xchg
also implies a full barrier, so that guarantees that if we observe
_Q_SLOW_VAL we must also observe the hash bucket with the lock value.

> So we need to have
> some kind of synchronization mechanism to let the lookup CPU know when is a
> good time to look up.

No, it's all already ordered and working.

pv_wait_head():

pv_hash()
/* MB as per cmpxchg */
>   cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL);

VS

__pv_queue_spin_unlock():

>   if (xchg(&l->locked, 0) != _Q_SLOW_VAL)
return;

/* MB as per xchg */
pv_hash_find(lock);




Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-02 Thread Waiman Long

On 04/01/2015 05:03 PM, Peter Zijlstra wrote:

On Wed, Apr 01, 2015 at 03:58:58PM -0400, Waiman Long wrote:

On 04/01/2015 02:48 PM, Peter Zijlstra wrote:
I am sorry that I don't quite get what you mean here. My point is that in
the hashing step, a cpu will need to scan an empty bucket to put the lock
in. In the interim, a previously used bucket before the empty one may get
freed. In the lookup step for that lock, the scanning will stop because of
an empty bucket in front of the target one.

Right, that's broken. So we need to do something else to limit the
lookup, because without that break, a lookup needs to iterate the
entire array in order to determine -ENOENT, which is expensive.

So my alternative proposal is that IFF we can guarantee that every
lookup will succeed -- the entry we're looking for is always there, we
don't need the break on empty but can probe until we find the entry.
This will be bound in cost to the same number of probes we required for
insertion and avoids the full array scan.

Now I think we can indeed do this, if as said earlier we do not clear
the bucket on insert if the cmpxchg succeeds, in that case the unlock
will observe _Q_SLOW_VAL and do the lookup, the lookup will then find
the entry. And we then need the unlock to clear the entry.
Does that explain this? Or should I try again with code?


OK, I got your proposal now. However, there is still the issue that 
setting the _Q_SLOW_VAL flag and the hash bucket are not atomic wrt each 
other. It is possible a CPU has set the _Q_SLOW_VAL flag but not yet 
filled in the hash bucket while another one is trying to look for it. 
So we need to have some kind of synchronization mechanism to let the 
lookup CPU know when is a good time to look up.


One possibility is to delay setting _Q_SLOW_VAL until the hash bucket is 
set up. Maybe we can make that work.


Cheers,
Longman


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Peter Zijlstra
On Wed, Apr 01, 2015 at 03:58:58PM -0400, Waiman Long wrote:
> On 04/01/2015 02:48 PM, Peter Zijlstra wrote:

> I am sorry that I don't quite get what you mean here. My point is that in
> the hashing step, a cpu will need to scan an empty bucket to put the lock
> in. In the interim, a previously used bucket before the empty one may get
> freed. In the lookup step for that lock, the scanning will stop because of
> an empty bucket in front of the target one.

Right, that's broken. So we need to do something else to limit the
lookup, because without that break, a lookup needs to iterate the
entire array in order to determine -ENOENT, which is expensive.

So my alternative proposal is that IFF we can guarantee that every
lookup will succeed -- the entry we're looking for is always there, we
don't need the break on empty but can probe until we find the entry.
This will be bound in cost to the same number of probes we required for
insertion and avoids the full array scan.

Now I think we can indeed do this, if as said earlier we do not clear
the bucket on insert if the cmpxchg succeeds, in that case the unlock
will observe _Q_SLOW_VAL and do the lookup, the lookup will then find
the entry. And we then need the unlock to clear the entry.

Does that explain this? Or should I try again with code?
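
A minimal sketch of that scheme, using the hash helpers posted later in this
thread (pv_hash_insert()/pv_hash_find()) and the same 'l' alias for the lock
as in the patch; this is only an illustration of the idea, not the literal
posted code:

	/* Waiter side, queue head in pv_wait_head(), after the spin threshold: */
	hb = pv_hash_insert(lock);		/* publish <lock, cpu> first */
	if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
		/*
		 * l->locked was already 0: the lock got released while we were
		 * hashing.  The unlocker's xchg() saw _Q_LOCKED_VAL, not
		 * _Q_SLOW_VAL, so it will never do the lookup; we must unhash
		 * our own entry and retry the normal acquisition path.
		 */
		WRITE_ONCE(hb->lock, NULL);
	} else {
		/* keep the entry and halt the vcpu; the unlocker will kick us */
	}

	/* Unlock side, __pv_queue_spin_unlock(): */
	if (xchg(&l->locked, 0) != _Q_SLOW_VAL)
		return;				/* no hashed waiter to clean up */
	hb = pv_hash_find(lock);		/* guaranteed to find an entry */
	pv_kick(hb->cpu);
	WRITE_ONCE(hb->lock, NULL);		/* the unlock clears (unhashes) it */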


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Waiman Long

On 04/01/2015 01:12 PM, Peter Zijlstra wrote:

On Wed, Apr 01, 2015 at 12:20:30PM -0400, Waiman Long wrote:

After more careful reading, I think the assumption that the presence of an
unused bucket means there is no match is not true. Consider the scenario:

1. cpu 0 puts lock1 into hb[0]
2. cpu 1 puts lock2 into hb[1]
3. cpu 2 clears hb[0]
4. cpu 3 looks for lock2 and doesn't find it

Hmm, yes. The only way I can see that being true is if we assume entries
are never taken out again.

The wikipedia page could use some clarification here, this is not clear.


At this point, I am thinking using back your previous idea of passing the
queue head information down the queue.

Having to scan the entire array for a lookup sure sucks, but the wait
loops involved in the other idea can get us in the exact predicament we
were trying to get out, because their forward progress depends on other
CPUs.


For the waiting loop, the worst case is when a new CPU gets queued right 
before we write the head value to the previous tail node. In that case, 
the maximum number of retries is equal to the total number of CPUs - 2. 
But that should rarely happen.


I do find a way to guarantee forward progress in a few steps. I will try 
the normal way once. If that fails, I will insert the head node to the 
tail once again after saving the next pointer. After modifying the 
previous tail node, cmpxchg will be used to restore the previous tail. 
If that fails, we just have to wait until the next pointer is updated 
and write it out to the previous tail node. We can now restore the next 
pointer and move forward.


Let me know if that looks reasonable to you.

-Longman



Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Waiman Long

On 04/01/2015 02:48 PM, Peter Zijlstra wrote:

On Wed, Apr 01, 2015 at 02:54:45PM -0400, Waiman Long wrote:

On 04/01/2015 02:17 PM, Peter Zijlstra wrote:

On Wed, Apr 01, 2015 at 07:42:39PM +0200, Peter Zijlstra wrote:

Hohumm.. time to think more I think ;-)

So bear with me, I've not really pondered this well so it could be full
of holes (again).

After the cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL) succeeds the
spin_unlock() must do the hash lookup, right? We can make the lookup
unhash.

If the cmpxchg() fails the unlock will not do the lookup and we must
unhash.

The idea being that the result is that any lookup is guaranteed to find
an entry, which reduces our worst case lookup cost to whatever the worst
case insertion cost was.


I think it doesn't matter who did the unhashing. Multiple independent locks
can be hashed to the same value. Since they can be unhashed independently,
there is no way to know whether you have checked all the possible buckets.

oh but the crux is that you guarantee a lookup will find an entry. it will
never need to iterate the entire array.


I am sorry that I don't quite get what you mean here. My point is that 
in the hashing step, a cpu will need to scan an empty bucket to put the 
lock in. In the interim, a previously used bucket before the empty one 
may get freed. In the lookup step for that lock, the scanning will stop 
because of an empty bucket in front of the target one.


-Longman


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Peter Zijlstra
On Wed, Apr 01, 2015 at 02:54:45PM -0400, Waiman Long wrote:
> On 04/01/2015 02:17 PM, Peter Zijlstra wrote:
> >On Wed, Apr 01, 2015 at 07:42:39PM +0200, Peter Zijlstra wrote:
> >>>Hohumm.. time to think more I think ;-)
> >>So bear with me, I've not really pondered this well so it could be full
> >>of holes (again).
> >>
> >>After the cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL) succeeds the
> >>spin_unlock() must do the hash lookup, right? We can make the lookup
> >>unhash.
> >>
> >>If the cmpxchg() fails the unlock will not do the lookup and we must
> >>unhash.
> >The idea being that the result is that any lookup is guaranteed to find
> >an entry, which reduces our worst case lookup cost to whatever the worst
> >case insertion cost was.
> >
> 
> I think it doesn't matter who did the unhashing. Multiple independent locks
> can be hashed to the same value. Since they can be unhashed independently,
> there is no way to know whether you have checked all the possible buckets.

oh but the crux is that you guarantee a lookup will find an entry. it will
never need to iterate the entire array.


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Waiman Long

On 04/01/2015 02:17 PM, Peter Zijlstra wrote:

On Wed, Apr 01, 2015 at 07:42:39PM +0200, Peter Zijlstra wrote:

Hohumm.. time to think more I think ;-)

So bear with me, I've not really pondered this well so it could be full
of holes (again).

After the cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL) succeeds the
spin_unlock() must do the hash lookup, right? We can make the lookup
unhash.

If the cmpxchg() fails the unlock will not do the lookup and we must
unhash.

The idea being that the result is that any lookup is guaranteed to find
an entry, which reduces our worst case lookup cost to whatever the worst
case insertion cost was.



I think it doesn't matter who did the unhashing. Multiple independent 
locks can be hashed to the same value. Since they can be unhashed 
independently, there is no way to know whether you have checked all the 
possible buckets.


-Longman


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Peter Zijlstra
On Wed, Apr 01, 2015 at 07:42:39PM +0200, Peter Zijlstra wrote:
> > Hohumm.. time to think more I think ;-)
> 
> So bear with me, I've not really pondered this well so it could be full
> of holes (again).
> 
> After the cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL) succeeds the
> spin_unlock() must do the hash lookup, right? We can make the lookup
> unhash.
> 
> If the cmpxchg() fails the unlock will not do the lookup and we must
> unhash.

The idea being that the result is that any lookup is guaranteed to find
an entry, which reduces our worst case lookup cost to whatever the worst
case insertion cost was.



Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Peter Zijlstra
On Wed, Apr 01, 2015 at 07:12:23PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 01, 2015 at 12:20:30PM -0400, Waiman Long wrote:
> > After more careful reading, I think the assumption that the presence of an
> > unused bucket means there is no match is not true. Consider the scenario:
> > 
> > 1. cpu 0 puts lock1 into hb[0]
> > 2. cpu 1 puts lock2 into hb[1]
> > 3. cpu 2 clears hb[0]
> > 4. cpu 3 looks for lock2 and doesn't find it
> 
> Hmm, yes. The only way I can see that being true is if we assume entries
> are never taken out again.
> 
> The wikipedia page could use some clarification here, this is not clear.
> 
> > At this point, I am thinking using back your previous idea of passing the
> > queue head information down the queue.
> 
> Having to scan the entire array for a lookup sure sucks, but the wait
> loops involved in the other idea can get us in the exact predicament we
> were trying to get out, because their forward progress depends on other
> CPUs.
> 
> Hohumm.. time to think more I think ;-)

So bear with me, I've not really pondered this well so it could be full
of holes (again).

After the cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL) succeeds the
spin_unlock() must do the hash lookup, right? We can make the lookup
unhash.

If the cmpxchg() fails the unlock will not do the lookup and we must
unhash.



Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Peter Zijlstra
On Wed, Apr 01, 2015 at 12:20:30PM -0400, Waiman Long wrote:
> After more careful reading, I think the assumption that the presence of an
> unused bucket means there is no match is not true. Consider the scenario:
> 
> 1. cpu 0 puts lock1 into hb[0]
> 2. cpu 1 puts lock2 into hb[1]
> 3. cpu 2 clears hb[0]
> 4. cpu 3 looks for lock2 and doesn't find it

Hmm, yes. The only way I can see that being true is if we assume entries
are never taken out again.

The wikipedia page could use some clarification here, this is not clear.

> At this point, I am thinking using back your previous idea of passing the
> queue head information down the queue.

Having to scan the entire array for a lookup sure sucks, but the wait
loops involved in the other idea can get us in the exact predicament we
were trying to get out, because their forward progress depends on other
CPUs.

Hohumm.. time to think more I think ;-)


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-04-01 Thread Waiman Long

On 03/19/2015 08:25 AM, Peter Zijlstra wrote:

On Thu, Mar 19, 2015 at 11:12:42AM +0100, Peter Zijlstra wrote:

So I was now thinking of hashing the lock pointer; let me go and quickly
put something together.

A little something like so; ideally we'd allocate the hashtable since
NR_CPUS is kinda bloated, but it shows the idea I think.

And while this has loops in (the rehashing thing) their fwd progress
does not depend on other CPUs.

And I suspect that for the typical lock contention scenarios its
unlikely we ever really get into long rehashing chains.

---
  include/linux/lfsr.h|   49 
  kernel/locking/qspinlock_paravirt.h |  143 

  2 files changed, 178 insertions(+), 14 deletions(-)

--- /dev/null

+
+static int pv_hash_find(struct qspinlock *lock)
+{
+   u64 hash = hash_ptr(lock, PV_LOCK_HASH_BITS);
+   struct pv_hash_bucket *hb, *end;
+   int cpu = -1;
+
+   if (!hash)
+   hash = 1;
+
+   hb = &__pv_lock_hash[hash_align(hash)];
+   for (;;) {
+   for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
+   struct qspinlock *l = READ_ONCE(hb->lock);
+
+   /*
+* If we hit an unused bucket, there is no match.
+*/
+   if (!l)
+   goto done;


After more careful reading, I think the assumption that the presence of 
an unused bucket means there is no match is not true. Consider the scenario:


1. cpu 0 puts lock1 into hb[0]
2. cpu 1 puts lock2 into hb[1]
3. cpu 2 clears hb[0]
4. cpu 3 looks for lock2 and doesn't find it
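
A sketch of why step 4 misses with the break-on-empty lookup quoted above
(the bucket layout and the assumption that lock1 and lock2 probe the same
cacheline are hypothetical):

	/*
	 *           hb[0]    hb[1]
	 * step 2:   lock1    lock2     both hashed into the same line
	 * step 3:   NULL     lock2     cpu 2 unhashes lock1
	 *
	 * Step 4: cpu 3 probes for lock2 starting at hb[0], reads l == NULL
	 * there and takes the "if (!l) goto done;" exit, so it never reaches
	 * hb[1] and wrongly concludes that lock2 is not hashed.
	 */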

I was thinking about putting some USED flag in the buckets, but then we 
will eventually fill them all up as used. If we put the entries into a 
hashed linked list, we have to deal with the complicated synchronization 
issues of linked-list updates.


At this point, I am thinking using back your previous idea of passing 
the queue head information down the queue. I am now convinced that the 
unlock call site patching should work. So I will incorporate that in my 
next update.


Please let me know if you think my reasoning is not correct.

Thanks,
Longman



Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-03-19 Thread Waiman Long

On 03/19/2015 08:25 AM, Peter Zijlstra wrote:

On Thu, Mar 19, 2015 at 11:12:42AM +0100, Peter Zijlstra wrote:

So I was now thinking of hashing the lock pointer; let me go and quickly
put something together.

A little something like so; ideally we'd allocate the hashtable since
NR_CPUS is kinda bloated, but it shows the idea I think.

And while this has loops in it (the rehashing thing), their forward
progress does not depend on other CPUs.

And I suspect that for the typical lock contention scenarios it's
unlikely we ever really get into long rehashing chains.

---
  include/linux/lfsr.h|   49 
  kernel/locking/qspinlock_paravirt.h |  143 

  2 files changed, 178 insertions(+), 14 deletions(-)


This is a much better alternative.


--- /dev/null
+++ b/include/linux/lfsr.h
@@ -0,0 +1,49 @@
+#ifndef _LINUX_LFSR_H
+#define _LINUX_LFSR_H
+
+/*
+ * Simple Binary Galois Linear Feedback Shift Register
+ *
+ * http://en.wikipedia.org/wiki/Linear_feedback_shift_register
+ *
+ */
+
+extern void __lfsr_needs_more_taps(void);
+
+static __always_inline u32 lfsr_taps(int bits)
+{
+   if (bits ==  1) return 0x0001;
+   if (bits ==  2) return 0x0001;
+   if (bits ==  3) return 0x0003;
+   if (bits ==  4) return 0x0009;
+   if (bits ==  5) return 0x0012;
+   if (bits ==  6) return 0x0021;
+   if (bits ==  7) return 0x0041;
+   if (bits ==  8) return 0x008E;
+   if (bits ==  9) return 0x0108;
+   if (bits == 10) return 0x0204;
+   if (bits == 11) return 0x0402;
+   if (bits == 12) return 0x0829;
+   if (bits == 13) return 0x100D;
+   if (bits == 14) return 0x2015;
+
+   /*
+* For more taps see:
+*   http://users.ece.cmu.edu/~koopman/lfsr/index.html
+*/
+   __lfsr_needs_more_taps();
+
+   return 0;
+}
+
+static inline u32 lfsr(u32 val, int bits)
+{
+   u32 bit = val & 1;
+
+   val >>= 1;
+   if (bit)
+   val ^= lfsr_taps(bits);
+   return val;
+}
+
+#endif /* _LINUX_LFSR_H */
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -2,6 +2,9 @@
  #error "do not include this file"
  #endif

+#include <linux/hash.h>
+#include <linux/lfsr.h>
+
  /*
   * Implement paravirt qspinlocks; the general idea is to halt the vcpus 
instead
   * of spinning them.
@@ -107,7 +110,120 @@ static void pv_kick_node(struct mcs_spin
pv_kick(pn->cpu);
  }

-static DEFINE_PER_CPU(struct qspinlock *, __pv_lock_wait);
+/*
+ * Hash table using open addressing with an LFSR probe sequence.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ * Instead of probing just the immediate bucket we probe all buckets in the
+ * same cacheline.
+ *
+ * http://en.wikipedia.org/wiki/Hash_table#Open_addressing
+ *
+ */
+
+#define HB_RESERVED  ((struct qspinlock *)1)
+
+struct pv_hash_bucket {
+   struct qspinlock *lock;
+   int cpu;
+};
+
+/*
+ * XXX dynamic allocate using nr_cpu_ids instead...
+ */
+#define PV_LOCK_HASH_BITS  (2 + NR_CPUS_BITS)
+


As said here, we should make it dynamically allocated depending on 
num_possible_cpus().



+#if PV_LOCK_HASH_BITS < 6
+#undef PV_LOCK_HASH_BITS
+#define PB_LOCK_HASH_BITS  6
+#endif
+
+#define PV_LOCK_HASH_SIZE  (1 << PV_LOCK_HASH_BITS)
+
+static struct pv_hash_bucket __pv_lock_hash[PV_LOCK_HASH_SIZE] 
cacheline_aligned;
+
+#define PV_HB_PER_LINE (SMP_CACHE_BYTES / sizeof(struct 
pv_hash_bucket))
+
+static inline u32 hash_align(u32 hash)
+{
+   return hash & ~(PV_HB_PER_LINE - 1);
+}
+
+static struct qspinlock **pv_hash(struct qspinlock *lock)
+{
+   u32 hash = hash_ptr(lock, PV_LOCK_HASH_BITS);
+   struct pv_hash_bucket *hb, *end;
+
+   if (!hash)
+   hash = 1;
+
+   hb = &__pv_lock_hash[hash_align(hash)];
+   for (;;) {
+   for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
+   if (cmpxchg(&hb->lock, NULL, HB_RESERVED)) {
+   WRITE_ONCE(hb->cpu, smp_processor_id());
+   /*
+* Since we must read lock first and cpu
+* second, we must write cpu first and lock
+* second, therefore use HB_RESERVE to mark an
+* entry in use before writing the values.
+*
+* This can cause hb_hash_find() to not find a
+* cpu even though _Q_SLOW_VAL, this is not a
+* problem since we re-check l->locked before
+* going to sleep and the unlock will have
+* cleared l->locked already.
+*/
+   smp_wmb(); /* 

Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-03-19 Thread Peter Zijlstra
On Thu, Mar 19, 2015 at 01:25:36PM +0100, Peter Zijlstra wrote:
> +static struct qspinlock **pv_hash(struct qspinlock *lock)
> +{
> + u32 hash = hash_ptr(lock, PV_LOCK_HASH_BITS);
> + struct pv_hash_bucket *hb, *end;
> +
> + if (!hash)
> + hash = 1;
> +
> + hb = &__pv_lock_hash[hash_align(hash)];
> + for (;;) {
> + for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
> + if (cmpxchg(&hb->lock, NULL, HB_RESERVED)) {

That should be: !cmpxchg(); a bit disturbing that that booted.

> + WRITE_ONCE(hb->cpu, smp_processor_id());
> + /*
> +  * Since we must read lock first and cpu
> +  * second, we must write cpu first and lock
> +  * second, therefore use HB_RESERVE to mark an
> +  * entry in use before writing the values.
> +  *
> +  * This can cause hb_hash_find() to not find a
> +  * cpu even though _Q_SLOW_VAL, this is not a
> +  * problem since we re-check l->locked before
> +  * going to sleep and the unlock will have
> +  * cleared l->locked already.
> +  */
> + smp_wmb(); /* matches rmb from pv_hash_find */
> + WRITE_ONCE(hb->lock, lock);
> + goto done;
> + }
> + }
> +
> + hash = lfsr(hash, PV_LOCK_HASH_BITS);
> + hb = &__pv_lock_hash[hash_align(hash)];
> + }
> +
> +done:
> + return &hb->lock;
> +}
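
For reference, the matching read side -- pv_hash_find() is not quoted in
this thread, so the below is only an illustration of the ordering the
comment describes (read lock before cpu, paired with the wmb between the
cpu and lock stores); its exact shape is an assumption, and termination
of the probe is left out since the caller only looks up entries that are
guaranteed to be hashed:

static struct pv_hash_bucket *pv_hash_find(struct qspinlock *lock)
{
        u32 hash = hash_ptr(lock, PV_LOCK_HASH_BITS);
        struct pv_hash_bucket *hb, *end;

        if (!hash)
                hash = 1;

        hb = &__pv_lock_hash[hash_align(hash)];
        for (;;) {
                for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
                        if (READ_ONCE(hb->lock) == lock) {
                                /* order the lock read before the cpu read */
                                smp_rmb(); /* matches wmb in pv_hash() */
                                return hb; /* caller then reads hb->cpu */
                        }
                }
                hash = lfsr(hash, PV_LOCK_HASH_BITS);
                hb = &__pv_lock_hash[hash_align(hash)];
        }
}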
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-03-19 Thread Peter Zijlstra
On Thu, Mar 19, 2015 at 11:12:42AM +0100, Peter Zijlstra wrote:
> So I was now thinking of hashing the lock pointer; let me go and quickly
> put something together.

A little something like so; ideally we'd allocate the hashtable since
NR_CPUS is kinda bloated, but it shows the idea I think.

And while this has loops in it (the rehashing thing), their forward
progress does not depend on other CPUs.

And I suspect that for the typical lock contention scenarios it's
unlikely we ever really get into long rehashing chains.

---
 include/linux/lfsr.h|   49 
 kernel/locking/qspinlock_paravirt.h |  143 
 2 files changed, 178 insertions(+), 14 deletions(-)

--- /dev/null
+++ b/include/linux/lfsr.h
@@ -0,0 +1,49 @@
+#ifndef _LINUX_LFSR_H
+#define _LINUX_LFSR_H
+
+/*
+ * Simple Binary Galois Linear Feedback Shift Register
+ *
+ * http://en.wikipedia.org/wiki/Linear_feedback_shift_register
+ *
+ */
+
+extern void __lfsr_needs_more_taps(void);
+
+static __always_inline u32 lfsr_taps(int bits)
+{
+   if (bits ==  1) return 0x0001;
+   if (bits ==  2) return 0x0001;
+   if (bits ==  3) return 0x0003;
+   if (bits ==  4) return 0x0009;
+   if (bits ==  5) return 0x0012;
+   if (bits ==  6) return 0x0021;
+   if (bits ==  7) return 0x0041;
+   if (bits ==  8) return 0x008E;
+   if (bits ==  9) return 0x0108;
+   if (bits == 10) return 0x0204;
+   if (bits == 11) return 0x0402;
+   if (bits == 12) return 0x0829;
+   if (bits == 13) return 0x100D;
+   if (bits == 14) return 0x2015;
+
+   /*
+* For more taps see:
+*   http://users.ece.cmu.edu/~koopman/lfsr/index.html
+*/
+   __lfsr_needs_more_taps();
+
+   return 0;
+}
+
+static inline u32 lfsr(u32 val, int bits)
+{
+   u32 bit = val & 1;
+
+   val >>= 1;
+   if (bit)
+   val ^= lfsr_taps(bits);
+   return val;
+}
+
+#endif /* _LINUX_LFSR_H */
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -2,6 +2,9 @@
 #error "do not include this file"
 #endif
 
+#include <linux/hash.h>
+#include <linux/lfsr.h>
+
 /*
  * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
  * of spinning them.
@@ -107,7 +110,120 @@ static void pv_kick_node(struct mcs_spin
pv_kick(pn->cpu);
 }
 
-static DEFINE_PER_CPU(struct qspinlock *, __pv_lock_wait);
+/*
+ * Hash table using open addressing with an LFSR probe sequence.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ * Instead of probing just the immediate bucket we probe all buckets in the
+ * same cacheline.
+ *
+ * http://en.wikipedia.org/wiki/Hash_table#Open_addressing
+ *
+ */
+
+#define HB_RESERVED  ((struct qspinlock *)1)
+
+struct pv_hash_bucket {
+   struct qspinlock *lock;
+   int cpu;
+};
+
+/*
+ * XXX dynamic allocate using nr_cpu_ids instead...
+ */
+#define PV_LOCK_HASH_BITS  (2 + NR_CPUS_BITS)
+
+#if PV_LOCK_HASH_BITS < 6
+#undef PV_LOCK_HASH_BITS
+#define PB_LOCK_HASH_BITS  6
+#endif
+
+#define PV_LOCK_HASH_SIZE  (1 << PV_LOCK_HASH_BITS)
+
+static struct pv_hash_bucket __pv_lock_hash[PV_LOCK_HASH_SIZE] 
cacheline_aligned;
+
+#define PV_HB_PER_LINE (SMP_CACHE_BYTES / sizeof(struct 
pv_hash_bucket))
+
+static inline u32 hash_align(u32 hash)
+{
+   return hash & ~(PV_HB_PER_LINE - 1);
+}
+
+static struct qspinlock **pv_hash(struct qspinlock *lock)
+{
+   u32 hash = hash_ptr(lock, PV_LOCK_HASH_BITS);
+   struct pv_hash_bucket *hb, *end;
+
+   if (!hash)
+   hash = 1;
+
+   hb = &__pv_lock_hash[hash_align(hash)];
+   for (;;) {
+   for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
+   if (cmpxchg(&hb->lock, NULL, HB_RESERVED)) {
+   WRITE_ONCE(hb->cpu, smp_processor_id());
+   /*
+* Since we must read lock first and cpu
+* second, we must write cpu first and lock
+* second, therefore use HB_RESERVE to mark an
+* entry in use before writing the values.
+*
+* This can cause hb_hash_find() to not find a
+* cpu even though _Q_SLOW_VAL, this is not a
+* problem since we re-check l->locked before
+* going to sleep and the unlock will have
+* cleared l->locked already.
+*/
+   smp_wmb(); /* matches rmb from pv_hash_find */
+   WRITE_ONCE(hb->lock, lock);
+   goto done;
+   }
+   }
+
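
As an aside, not part of the patch: the taps above are taken from the
maximal-length LFSR tables, so the rehash sequence walks every non-zero
hash value exactly once before repeating, which is why the probe loop
makes forward progress without depending on other CPUs. A quick
userspace check of the 6-bit case, assuming the same construction:

#include <stdio.h>
#include <stdint.h>

static uint32_t lfsr6(uint32_t val)
{
        uint32_t bit = val & 1;

        val >>= 1;
        if (bit)
                val ^= 0x21;    /* the bits == 6 tap from the table */
        return val;
}

int main(void)
{
        uint32_t val = 1;
        int period = 0;

        do {
                val = lfsr6(val);
                period++;
        } while (val != 1);

        /* a maximal-length 6-bit tap gives period 2^6 - 1 = 63 */
        printf("period = %d\n", period);
        return 0;
}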

Re: [PATCH 8/9] qspinlock: Generic paravirt support

2015-03-19 Thread Peter Zijlstra
On Wed, Mar 18, 2015 at 04:50:37PM -0400, Waiman Long wrote:
> >+this_cpu_write(__pv_lock_wait, lock);
> 
> We may run into the same problem of needing to have 4 queue nodes per CPU.
> If an interrupt happens just after the write and before the actual wait and
> it goes through the same sequence, it will overwrite the __pv_lock_wait[]
> entry. So we may have lost wakeup. That is why the pvticket lock code did
> that just before the actual wait with interrupt disabled. We probably
> couldn't disable interrupt here. So we may need to move the actual write to
> the KVM and Xen code if we keep the current logic.

Hmm indeed. So I didn't actually mean to keep this code, but yes I
missed that.

> >+/*
> >+ * At this point the memory pointed at by lock can be freed/reused,
> >+ * however we can still use the pointer value to search in our cpu
> >+ * array.
> >+ *
> >+ * XXX: get rid of this loop
> >+ */
> >+for_each_possible_cpu(cpu) {
> >+if (per_cpu(__pv_lock_wait, cpu) == lock)
> >+pv_kick(cpu);
> >+}
> >+}
> 
> I do want to get rid of this loop too. On average, we have to scan about
> half the number of CPUs available. So it isn't that different
> performance-wise compared with my original idea of following the list from
> tail to head. And how about your idea of propagating the current head down
> the linked list?

Yeah, so I was half done with that patch when I fell asleep on the
flight home. Didn't get around to 'fixing' it. It applies and builds but
doesn't actually work.

_However_ I'm not sure I actually like it much.

The problem I have with it is these wait loops; they can generate the
same problem we're trying to avoid.

So I was now thinking of hashing the lock pointer; let me go and quickly
put something together.

---
 kernel/locking/qspinlock.c  |8 +-
 kernel/locking/qspinlock_paravirt.h |  124 
 2 files changed, 101 insertions(+), 31 deletions(-)

--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -246,10 +246,10 @@ static __always_inline void set_locked(s
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(u32 old, struct mcs_spinlock *node) 
{ }
 static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
 
-static __always_inline void __pv_wait_head(struct qspinlock *lock) { }
+static __always_inline void __pv_wait_head(struct qspinlock *lock, struct 
mcs_spinlock *node) { }
 
 #define pv_enabled()   false
 
@@ -399,7 +399,7 @@ void queue_spin_lock_slowpath(struct qsp
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
 
-   pv_wait_node(node);
+   pv_wait_node(old, node);
arch_mcs_spin_lock_contended(&node->locked);
}
 
@@ -414,7 +414,7 @@ void queue_spin_lock_slowpath(struct qsp
 * sequentiality; this is because the set_locked() function below
 * does not imply a full barrier.
 */
-   pv_wait_head(lock);
+   pv_wait_head(lock, node);
while ((val = smp_load_acquire(&lock->val.counter)) & 
_Q_LOCKED_PENDING_MASK)
cpu_relax();
 
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -29,8 +29,14 @@ struct pv_node {
 
int cpu;
u8  state;
+   struct pv_node  *head;
 };
 
+static inline struct pv_node *pv_decode_tail(u32 tail)
+{
+   return (struct pv_node *)decode_tail(tail);
+}
+
 /*
  * Initialize the PV part of the mcs_spinlock node.
  */
@@ -42,17 +48,49 @@ static void pv_init_node(struct mcs_spin
 
pn->cpu = smp_processor_id();
pn->state = vcpu_running;
+   pn->head = NULL;
+}
+
+static void pv_propagate_head(struct pv_node *pn, struct pv_node *ppn)
+{
+   /*
+* When we race against the first waiter or ourselves we have to wait
+* until the previous link updates its head pointer before we can
+* propagate it.
+*/
+   while (!READ_ONCE(ppn->head)) {
+   /*
+* queue_spin_lock_slowpath could have been waiting for the
+* node->next store above before setting node->locked.
+*/
+   if (ppn->mcs.locked)
+   return;
+
+   cpu_relax();
+   }
+   /*
+* If we interleave such that the store from pv_set_head() happens
+* between our load and store we could have over-written the new head
+* value.
+*
+* Therefore use cmpxchg to only propagate the old head value if our
+* head value is unmodified.
+*/
+   (void)cmpxchg(&pn->head, NULL, READ_ONCE(ppn->head));
 }
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
 


[PATCH 8/9] qspinlock: Generic paravirt support

2015-03-16 Thread Peter Zijlstra
Implement simple paravirt support for the qspinlock.

Provide a separate (second) version of the spin_lock_slowpath for
paravirt along with a special unlock path.

The second slowpath is generated by adding a few pv hooks to the
normal slowpath, but where those will compile away for the native
case, they expand into special wait/wake code for the pv version.

The actual MCS queue can use extra storage in the mcs_nodes[] array to
keep track of state and therefore uses directed wakeups.

The head contender has no such storage available and reverts to the
per-cpu lock entry similar to the current kvm code. We can do a single
entry because any nesting will wake the vcpu and cause the lower loop
to retry.

Signed-off-by: Peter Zijlstra (Intel) 
---
 include/asm-generic/qspinlock.h |3 
 kernel/locking/qspinlock.c  |   69 +-
 kernel/locking/qspinlock_paravirt.h |  177 
 3 files changed, 248 insertions(+), 1 deletion(-)

--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -118,6 +118,9 @@ static __always_inline bool virt_queue_s
 }
 #endif
 
+extern void __pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queue_spin_unlock(struct qspinlock *lock);
+
 /*
  * Initializier
  */
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -18,6 +18,9 @@
  * Authors: Waiman Long 
  *  Peter Zijlstra 
  */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
 #include <linux/smp.h>
 #include <linux/bug.h>
 #include <linux/cpumask.h>
@@ -65,13 +68,21 @@
 
 #include "mcs_spinlock.h"
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES  8
+#else
+#define MAX_NODES  4
+#endif
+
 /*
  * Per-CPU queue node structures; we can never have more than 4 nested
  * contexts: task, softirq, hardirq, nmi.
  *
  * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[4]);
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
 
 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -230,6 +241,32 @@ static __always_inline void set_locked(s
WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
 }
 
+
+/*
+ * Generate the native code for queue_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+
+static __always_inline void __pv_wait_head(struct qspinlock *lock) { }
+
+#define pv_enabled()   false
+
+#define pv_init_node   __pv_init_node
+#define pv_wait_node   __pv_wait_node
+#define pv_kick_node   __pv_kick_node
+
+#define pv_wait_head   __pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queue_spin_lock_slowpath   native_queue_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
 /**
  * queue_spin_lock_slowpath - acquire the queue spinlock
  * @lock: Pointer to queue spinlock structure
@@ -259,6 +296,9 @@ void queue_spin_lock_slowpath(struct qsp
 
BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
 
+   if (pv_enabled())
+   goto queue;
+
if (virt_queue_spin_lock(lock))
return;
 
@@ -335,6 +375,7 @@ void queue_spin_lock_slowpath(struct qsp
node += idx;
node->locked = 0;
node->next = NULL;
+   pv_init_node(node);
 
/*
 * We touched a (possibly) cold cacheline in the per-cpu queue node;
@@ -360,6 +401,7 @@ void queue_spin_lock_slowpath(struct qsp
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
 
+   pv_wait_node(node);
arch_mcs_spin_lock_contended(&node->locked);
}
 
@@ -374,6 +416,7 @@ void queue_spin_lock_slowpath(struct qsp
 * sequentiality; this is because the set_locked() function below
 * does not imply a full barrier.
 */
+   pv_wait_head(lock);
while ((val = smp_load_acquire(&lock->val.counter)) & 
_Q_LOCKED_PENDING_MASK)
cpu_relax();
 
@@ -406,6 +449,7 @@ void queue_spin_lock_slowpath(struct qsp
cpu_relax();
 
arch_mcs_spin_unlock_contended(&next->locked);
+   pv_kick_node(next);
 
 release:
/*
@@ -414,3 +458,26 @@ void queue_spin_lock_slowpath(struct qsp
this_cpu_dec(mcs_nodes[0].count);
 }
 EXPORT_SYMBOL(queue_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queue_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef pv_enabled
+#define pv_enabled()   true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+
+#undef pv_wait_head
+
+#undef queue_spin_lock_slowpath
+#define 
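
The hunk is cut off here in the archive; based on the guards and #undefs
above (and the __pv_queue_spin_lock_slowpath declaration earlier in the
patch), the remainder presumably re-includes the file so the same
slowpath body is compiled a second time with the PV hooks live --
roughly, not quoted:

#define queue_spin_lock_slowpath        __pv_queue_spin_lock_slowpath

/*
 * pv_enabled() is now true and the pv_*() hooks are no longer the empty
 * native stubs, so re-including the slowpath body emits the paravirt
 * variant of the function.
 */
#include "qspinlock_paravirt.h"
#include "qspinlock.c"

#endif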
