[PATCH] lib: remove lib/nodemask.c

2022-08-11 Thread Yury Norov
Commit 36d4b36b69590 (lib/nodemask: inline next_node_in() and
node_random()) was supposed to remove the lib/nodemask.c file, but the
removal didn't happen when the patch was applied.

Reported-by: "Aneesh Kumar K.V" 
Signed-off-by: Yury Norov 
---
 lib/nodemask.c | 23 ---
 1 file changed, 23 deletions(-)
 delete mode 100644 lib/nodemask.c

diff --git a/lib/nodemask.c b/lib/nodemask.c
deleted file mode 100644
index b8a433d16b51..
--- a/lib/nodemask.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include 
-#include 
-#include 
-
-EXPORT_SYMBOL(__next_node_in);
-
-#ifdef CONFIG_NUMA
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
-   int w, bit = NUMA_NO_NODE;
-
-   w = nodes_weight(*maskp);
-   if (w)
-   bit = bitmap_ord_to_pos(maskp->bits,
-   get_random_int() % w, MAX_NUMNODES);
-   return bit;
-}
-#endif
-- 
2.34.1



Re: [RESEND PATCH 2/2] lib/nodemask: inline next_node_in() and node_random()

2022-08-11 Thread Yury Norov
On Fri, Aug 12, 2022 at 10:46:57AM +0530, Aneesh Kumar K.V wrote:
> Yury Norov  writes:
> 
> > The functions are pretty thin wrappers around the find_bit engine, and
> > keeping them in a C file prevents the compiler from applying the
> > small_const_nbits() optimization, which must take place for all systems
> > with MAX_NUMNODES less than BITS_PER_LONG (the default is 16 for me).
> >
> > Moving them to the header file doesn't blow up the kernel size:
> > add/remove: 1/2 grow/shrink: 9/5 up/down: 968/-88 (880)
> >
> > CC: Andy Shevchenko 
> > CC: Rasmus Villemoes 
> > Signed-off-by: Yury Norov 
> > ---
> >  MAINTAINERS  |  1 -
> >  include/linux/nodemask.h | 27 ++-
> >  lib/Makefile |  2 +-
> >  lib/nodemask.c   | 30 --
> >  4 files changed, 23 insertions(+), 37 deletions(-)
> >  delete mode 100644 lib/nodemask.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 7c0b8f28aa25..19c8d0ef1177 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -3540,7 +3540,6 @@ F: lib/bitmap.c
> >  F: lib/cpumask.c
> >  F: lib/find_bit.c
> >  F: lib/find_bit_benchmark.c
> > -F: lib/nodemask.c
> >  F: lib/test_bitmap.c
> >  F: tools/include/linux/bitmap.h
> >  F: tools/include/linux/find.h
> > diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
> > index 0f233b76c9ce..48ebe4007955 100644
> > --- a/include/linux/nodemask.h
> > +++ b/include/linux/nodemask.h
> > @@ -94,6 +94,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
> >  extern nodemask_t _unused_nodemask_arg_;
> > @@ -276,7 +277,14 @@ static inline unsigned int __next_node(int n, const 
> > nodemask_t *srcp)
> >   * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
> >   */
> >  #define next_node_in(n, src) __next_node_in((n), &(src))
> > -unsigned int __next_node_in(int node, const nodemask_t *srcp);
> > +static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
> > +{
> > +   unsigned int ret = __next_node(node, srcp);
> > +
> > +   if (ret == MAX_NUMNODES)
> > +   ret = __first_node(srcp);
> > +   return ret;
> > +}
> >  
> >  static inline void init_nodemask_of_node(nodemask_t *mask, int node)
> >  {
> > @@ -493,14 +501,23 @@ static inline int num_node_state(enum node_states 
> > state)
> >  
> >  #endif
> >  
> > +/*
> > + * Return the bit number of a random bit set in the nodemask.
> > + * (returns NUMA_NO_NODE if nodemask is empty)
> > + */
> > +static inline int node_random(const nodemask_t *maskp)
> > +{
> >  #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
> > -extern int node_random(const nodemask_t *maskp);
> > +   int w, bit = NUMA_NO_NODE;
> > +
> > +   w = nodes_weight(*maskp);
> > +   if (w)
> > +   bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() 
> > % w);
> > +   return bit;
> >  #else
> > -static inline int node_random(const nodemask_t *mask)
> > -{
> > return 0;
> > -}
> >  #endif
> > +}
> >  
> >  #define node_online_map node_states[N_ONLINE]
> >  #define node_possible_map  node_states[N_POSSIBLE]
> > diff --git a/lib/Makefile b/lib/Makefile
> > index f99bf61f8bbc..731cea0342d1 100644
> > --- a/lib/Makefile
> > +++ b/lib/Makefile
> > @@ -33,7 +33,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
> >  flex_proportions.o ratelimit.o show_mem.o \
> >  is_single_threaded.o plist.o decompress.o kobject_uevent.o \
> >  earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
> > -nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
> > +nmi_backtrace.o win_minmax.o memcat_p.o \
> >  buildid.o
> >  
> >  lib-$(CONFIG_PRINTK) += dump_stack.o
> > diff --git a/lib/nodemask.c b/lib/nodemask.c
> > deleted file mode 100644
> > index 7dad4ce8ff59..
> > --- a/lib/nodemask.c
> > +++ /dev/null
> > @@ -1,30 +0,0 @@
> > -// SPDX-License-Identifier: GPL-2.0
> > -#include 
> > -#include 
> > -#include 
> > -
> > -unsigned int __next_node_in(int node, const nodemask_t *srcp)
> > -{
> > -   unsigned int ret = __next_node(node, srcp);
> > -
> > -   if (ret == MAX_NUMNODES)
> > -   ret = __first_node(srcp);
> > -   return ret;
> > -}
> > -EXPORT_SYMBOL(__next_node_in);
> > -
> > -#ifdef CONFIG_NUMA
> > -/*
> > - * Return the bit number of a random bit set in the nodemask.
> > - * (returns NUMA_NO_NODE if nodemask is empty)
> > - */
> > -int node_random(const nodemask_t *maskp)
> > -{
> > -   int w, bit = NUMA_NO_NODE;
> > -
> > -   w = nodes_weight(*maskp);
> > -   if (w)
> > -   bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() 
> > % w);
> > -   return bit;
> > -}
> > -#endif
> > -- 
> > 2.34.1
> 
> The patch that got merged (36d4b36b69590fed99356a4426c940a253a93800) still
> has lib/nodemask.c.

Thanks Aneesh. I'll send a fix shortly.


Re: [RESEND PATCH 2/2] lib/nodemask: inline next_node_in() and node_random()

2022-08-11 Thread Aneesh Kumar K.V
Yury Norov  writes:

> The functions are pretty thin wrappers around the find_bit engine, and
> keeping them in a C file prevents the compiler from applying the
> small_const_nbits() optimization, which must take place for all systems
> with MAX_NUMNODES less than BITS_PER_LONG (the default is 16 for me).
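(For reference, small_const_nbits() is roughly:

	#define small_const_nbits(nbits) \
		(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

so with these wrappers inlined in the header, a find_*_bit() call on a
compile-time-constant MAX_NUMNODES <= BITS_PER_LONG can collapse to a few
single-word instructions instead of an out-of-line call.)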
>
> Moving them to the header file doesn't blow up the kernel size:
> add/remove: 1/2 grow/shrink: 9/5 up/down: 968/-88 (880)
>
> CC: Andy Shevchenko 
> CC: Rasmus Villemoes 
> Signed-off-by: Yury Norov 
> ---
>  MAINTAINERS  |  1 -
>  include/linux/nodemask.h | 27 ++-
>  lib/Makefile |  2 +-
>  lib/nodemask.c   | 30 --
>  4 files changed, 23 insertions(+), 37 deletions(-)
>  delete mode 100644 lib/nodemask.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7c0b8f28aa25..19c8d0ef1177 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3540,7 +3540,6 @@ F:  lib/bitmap.c
>  F:   lib/cpumask.c
>  F:   lib/find_bit.c
>  F:   lib/find_bit_benchmark.c
> -F:   lib/nodemask.c
>  F:   lib/test_bitmap.c
>  F:   tools/include/linux/bitmap.h
>  F:   tools/include/linux/find.h
> diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
> index 0f233b76c9ce..48ebe4007955 100644
> --- a/include/linux/nodemask.h
> +++ b/include/linux/nodemask.h
> @@ -94,6 +94,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
>  extern nodemask_t _unused_nodemask_arg_;
> @@ -276,7 +277,14 @@ static inline unsigned int __next_node(int n, const 
> nodemask_t *srcp)
>   * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
>   */
>  #define next_node_in(n, src) __next_node_in((n), &(src))
> -unsigned int __next_node_in(int node, const nodemask_t *srcp);
> +static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
> +{
> + unsigned int ret = __next_node(node, srcp);
> +
> + if (ret == MAX_NUMNODES)
> + ret = __first_node(srcp);
> + return ret;
> +}
>  
>  static inline void init_nodemask_of_node(nodemask_t *mask, int node)
>  {
> @@ -493,14 +501,23 @@ static inline int num_node_state(enum node_states state)
>  
>  #endif
>  
> +/*
> + * Return the bit number of a random bit set in the nodemask.
> + * (returns NUMA_NO_NODE if nodemask is empty)
> + */
> +static inline int node_random(const nodemask_t *maskp)
> +{
>  #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
> -extern int node_random(const nodemask_t *maskp);
> + int w, bit = NUMA_NO_NODE;
> +
> + w = nodes_weight(*maskp);
> + if (w)
> + bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() 
> % w);
> + return bit;
>  #else
> -static inline int node_random(const nodemask_t *mask)
> -{
>   return 0;
> -}
>  #endif
> +}
>  
>  #define node_online_map  node_states[N_ONLINE]
>  #define node_possible_map node_states[N_POSSIBLE]
> diff --git a/lib/Makefile b/lib/Makefile
> index f99bf61f8bbc..731cea0342d1 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -33,7 +33,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
>flex_proportions.o ratelimit.o show_mem.o \
>is_single_threaded.o plist.o decompress.o kobject_uevent.o \
>earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
> -  nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
> +  nmi_backtrace.o win_minmax.o memcat_p.o \
>buildid.o
>  
>  lib-$(CONFIG_PRINTK) += dump_stack.o
> diff --git a/lib/nodemask.c b/lib/nodemask.c
> deleted file mode 100644
> index 7dad4ce8ff59..
> --- a/lib/nodemask.c
> +++ /dev/null
> @@ -1,30 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0
> -#include 
> -#include 
> -#include 
> -
> -unsigned int __next_node_in(int node, const nodemask_t *srcp)
> -{
> - unsigned int ret = __next_node(node, srcp);
> -
> - if (ret == MAX_NUMNODES)
> - ret = __first_node(srcp);
> - return ret;
> -}
> -EXPORT_SYMBOL(__next_node_in);
> -
> -#ifdef CONFIG_NUMA
> -/*
> - * Return the bit number of a random bit set in the nodemask.
> - * (returns NUMA_NO_NODE if nodemask is empty)
> - */
> -int node_random(const nodemask_t *maskp)
> -{
> - int w, bit = NUMA_NO_NODE;
> -
> - w = nodes_weight(*maskp);
> - if (w)
> - bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() 
> % w);
> - return bit;
> -}
> -#endif
> -- 
> 2.34.1

The patch that got merged (36d4b36b69590fed99356a4426c940a253a93800) still
has lib/nodemask.c.


Re: [PATCH 16/17] powerpc/qspinlock: allow indefinite spinning on a preempted owner

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Provide an option that holds off queueing indefinitely while the lock
> owner is preempted. This could reduce queueing latencies for very
> overcommitted vcpu situations.
> 
> This is disabled by default.
> ---
>  arch/powerpc/lib/qspinlock.c | 91 +++-
>  1 file changed, 79 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 24f68bd71e2b..5cfd69931e31 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -35,6 +35,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
> +static bool pv_spin_on_preempted_owner __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
>  static bool pv_yield_propagate_owner __read_mostly = true;
>  static bool pv_prod_head __read_mostly = false;
> @@ -220,13 +221,15 @@ static struct qnode *get_tail_qnode(struct qspinlock 
> *lock, u32 val)
>   BUG();
>  }
>  
> -static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt, bool clear_mustq)
> +static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt, bool clear_mustq, bool *preempted)
>  {
>   int owner;
>   u32 yield_count;
>  
>   BUG_ON(!(val & _Q_LOCKED_VAL));
>  
> + *preempted = false;
> +
>   if (!paravirt)
>   goto relax;
>  
> @@ -241,6 +244,8 @@ static __always_inline void 
> __yield_to_locked_owner(struct qspinlock *lock, u32
>  
>   spin_end();
>  
> + *preempted = true;
> +
>   /*
>* Read the lock word after sampling the yield count. On the other side
>* there may be a wmb because the yield count update is done by the
> @@ -265,14 +270,14 @@ static __always_inline void 
> __yield_to_locked_owner(struct qspinlock *lock, u32
>   spin_cpu_relax();
>  }
>  
> -static __always_inline void yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt)
> +static __always_inline void yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt, bool *preempted)

It seems like the preempted parameter could be the return value of
yield_to_locked_owner(). Then callers that don't use the value returned in
preempted wouldn't need to create an unnecessary variable to pass in.
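Something like this (untested sketch), with __yield_to_locked_owner()
returning the preemption status instead of writing through a pointer:

static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
{
	return __yield_to_locked_owner(lock, val, paravirt, false);
}

Callers that care would then write "preempted = yield_to_locked_owner(lock,
val, paravirt);" and the rest could simply ignore the return value.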

>  {
> - __yield_to_locked_owner(lock, val, paravirt, false);
> + __yield_to_locked_owner(lock, val, paravirt, false, preempted);
>  }
>  
> -static __always_inline void yield_head_to_locked_owner(struct qspinlock 
> *lock, u32 val, bool paravirt, bool clear_mustq)
> +static __always_inline void yield_head_to_locked_owner(struct qspinlock 
> *lock, u32 val, bool paravirt, bool clear_mustq, bool *preempted)
>  {
> - __yield_to_locked_owner(lock, val, paravirt, clear_mustq);
> + __yield_to_locked_owner(lock, val, paravirt, clear_mustq, preempted);
>  }
>  
>  static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, 
> int *set_yield_cpu, bool paravirt)
> @@ -364,12 +369,33 @@ static __always_inline void yield_to_prev(struct 
> qspinlock *lock, struct qnode *
>  
>  static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool 
> paravirt)
>  {
> - int iters;
> + int iters = 0;
> +
> + if (!STEAL_SPINS) {
> + if (paravirt && pv_spin_on_preempted_owner) {
> + spin_begin();
> + for (;;) {
> + u32 val = READ_ONCE(lock->val);
> + bool preempted;
> +
> + if (val & _Q_MUST_Q_VAL)
> + break;
> + if (!(val & _Q_LOCKED_VAL))
> + break;
> + if (!vcpu_is_preempted(get_owner_cpu(val)))
> + break;
> + yield_to_locked_owner(lock, val, paravirt, 
> &preempted);
> + }
> + spin_end();
> + }
> + return false;
> + }
>  
>   /* Attempt to steal the lock */
>   spin_begin();
>   for (;;) {
>   u32 val = READ_ONCE(lock->val);
> + bool preempted;
>  
>   if (val & _Q_MUST_Q_VAL)
>   break;
> @@ -382,9 +408,22 @@ static __always_inline bool try_to_steal_lock(struct 
> qspinlock *lock, bool parav
>   continue;
>   }
>  
> - yield_to_locked_owner(lock, val, paravirt);
> -
> - iters++;
> + yield_to_locked_owner(lock, val, paravirt, &preempted);
> +
> + if (paravirt && preempted) {
> + if (!pv_spin_on_preempted_owner)
> + iters++;
> + /*
> +   

Re: [PATCH 15/17] powerpc/qspinlock: reduce remote node steal spins

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Allow for a reduction in the number of times a CPU from a different
> node than the owner can attempt to steal the lock before queueing.
> This could bias the transfer behaviour of the lock across the
> machine and reduce NUMA crossings.
> ---
>  arch/powerpc/lib/qspinlock.c | 34 +++---
>  1 file changed, 31 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index d4594c701f7d..24f68bd71e2b 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -4,6 +4,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  
> @@ -24,6 +25,7 @@ struct qnodes {
>  
>  /* Tuning parameters */
>  static int STEAL_SPINS __read_mostly = (1<<5);
> +static int REMOTE_STEAL_SPINS __read_mostly = (1<<2);
>  #if _Q_SPIN_TRY_LOCK_STEAL == 1
>  static const bool MAYBE_STEALERS = true;
>  #else
> @@ -39,9 +41,13 @@ static bool pv_prod_head __read_mostly = false;
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> -static __always_inline int get_steal_spins(bool paravirt)
> +static __always_inline int get_steal_spins(bool paravirt, bool remote)
>  {
> - return STEAL_SPINS;
> + if (remote) {
> + return REMOTE_STEAL_SPINS;
> + } else {
> + return STEAL_SPINS;
> + }
>  }
>  
>  static __always_inline int get_head_spins(bool paravirt)
> @@ -380,8 +386,13 @@ static __always_inline bool try_to_steal_lock(struct 
> qspinlock *lock, bool parav
>  
>   iters++;
>  
> - if (iters >= get_steal_spins(paravirt))
> + if (iters >= get_steal_spins(paravirt, false))
>   break;
> + if (iters >= get_steal_spins(paravirt, true)) {

There's no indication of what true and false mean here, which makes it hard
to read. To me it feels like two separate functions would be clearer.
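E.g. (untested):

static __always_inline int get_steal_spins(bool paravirt)
{
	return STEAL_SPINS;
}

static __always_inline int get_remote_steal_spins(bool paravirt)
{
	return REMOTE_STEAL_SPINS;
}

Then the call sites document themselves.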


> + int cpu = get_owner_cpu(val);
> + if (numa_node_id() != cpu_to_node(cpu))

What about using node_distance() instead?
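For example (just a sketch; using LOCAL_DISTANCE as the cut-off is an
arbitrary choice):

	if (node_distance(numa_node_id(), cpu_to_node(cpu)) > LOCAL_DISTANCE)
		break;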


> + break;
> + }
>   }
>   spin_end();
>  
> @@ -588,6 +599,22 @@ static int steal_spins_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, 
> "%llu\n");
>  
> +static int remote_steal_spins_set(void *data, u64 val)
> +{
> + REMOTE_STEAL_SPINS = val;

REMOTE_STEAL_SPINS is int not u64.

> +
> + return 0;
> +}
> +
> +static int remote_steal_spins_get(void *data, u64 *val)
> +{
> + *val = REMOTE_STEAL_SPINS;
> +
> + return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, 
> remote_steal_spins_set, "%llu\n");
> +
>  static int head_spins_set(void *data, u64 val)
>  {
>   HEAD_SPINS = val;
> @@ -687,6 +714,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, 
> pv_prod_head_get, pv_prod_head_set, "
>  static __init int spinlock_debugfs_init(void)
>  {
>   debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
> &fops_steal_spins);
> + debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, 
> NULL, &fops_remote_steal_spins);
>   debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, 
> &fops_head_spins);
>   if (is_shared_processor()) {
>   debugfs_create_file("qspl_pv_yield_owner", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_owner);



Re: [PATCH 14/17] powerpc/qspinlock: use spin_begin/end API

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Use the spin_begin/spin_cpu_relax/spin_end APIs in qspinlock, which helps
> to prevent threads from issuing a lot of expensive priority nops that may
> not have much effect due to immediately executing low then medium priority.

Just a general comment regarding the spin_{begin,end} API: anything more
complicated than something like

spin_begin()
for(;;)
spin_cpu_relax()
spin_end()

becomes difficult to keep track of. Unfortunately, I don't have any good
suggestions for how to improve it. Hopefully with P10's wait instruction we
can move away from this.

It might be useful to comment the functions pre and post conditions regarding
expectations about spin_begin() and spin_end().
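For example, something like (wording is only a suggestion):

/*
 * Precondition: called with spin_begin() in effect.
 * Postcondition: returns with spin_begin() still in effect; may call
 * spin_end()/spin_begin() internally around yield_to_preempted().
 */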

> ---
>  arch/powerpc/lib/qspinlock.c | 35 +++
>  1 file changed, 31 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 277aef1fab0a..d4594c701f7d 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -233,6 +233,8 @@ static __always_inline void 
> __yield_to_locked_owner(struct qspinlock *lock, u32
>   if ((yield_count & 1) == 0)
>   goto relax; /* owner vcpu is running */
>  
> + spin_end();
> +
>   /*
>* Read the lock word after sampling the yield count. On the other side
>* there may be a wmb because the yield count update is done by the
> @@ -248,11 +250,13 @@ static __always_inline void 
> __yield_to_locked_owner(struct qspinlock *lock, u32
>   yield_to_preempted(owner, yield_count);
>   if (clear_mustq)
>   lock_set_mustq(lock);
> + spin_begin();
>   /* Don't relax if we yielded. Maybe we should? */
>   return;
>   }
> + spin_begin();
>  relax:
> - cpu_relax();
> + spin_cpu_relax();
>  }
>  
>  static __always_inline void yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt)
> @@ -315,14 +319,18 @@ static __always_inline void yield_to_prev(struct 
> qspinlock *lock, struct qnode *
>   if ((yield_count & 1) == 0)
>   goto yield_prev; /* owner vcpu is running */
>  
> + spin_end();
> +
>   smp_rmb();
>  
>   if (yield_cpu == node->yield_cpu) {
>   if (node->next && node->next->yield_cpu != yield_cpu)
>   node->next->yield_cpu = yield_cpu;
>   yield_to_preempted(yield_cpu, yield_count);
> + spin_begin();
>   return;
>   }
> + spin_begin();
>  
>  yield_prev:
>   if (!pv_yield_prev)
> @@ -332,15 +340,19 @@ static __always_inline void yield_to_prev(struct 
> qspinlock *lock, struct qnode *
>   if ((yield_count & 1) == 0)
>   goto relax; /* owner vcpu is running */
>  
> + spin_end();
> +
>   smp_rmb(); /* See yield_to_locked_owner comment */
>  
>   if (!node->locked) {
>   yield_to_preempted(prev_cpu, yield_count);
> + spin_begin();
>   return;
>   }
> + spin_begin();
>  
>  relax:
> - cpu_relax();
> + spin_cpu_relax();
>  }
>  
>  
> @@ -349,6 +361,7 @@ static __always_inline bool try_to_steal_lock(struct 
> qspinlock *lock, bool parav
>   int iters;
>  
>   /* Attempt to steal the lock */
> + spin_begin();
>   for (;;) {
>   u32 val = READ_ONCE(lock->val);
>  
> @@ -356,8 +369,10 @@ static __always_inline bool try_to_steal_lock(struct 
> qspinlock *lock, bool parav
>   break;
>  
>   if (unlikely(!(val & _Q_LOCKED_VAL))) {
> + spin_end();
>   if (trylock_with_tail_cpu(lock, val))
>   return true;
> + spin_begin();
>   continue;
>   }
>  
> @@ -368,6 +383,7 @@ static __always_inline bool try_to_steal_lock(struct 
> qspinlock *lock, bool parav
>   if (iters >= get_steal_spins(paravirt))
>   break;
>   }
> + spin_end();
>  
>   return false;
>  }
> @@ -418,8 +434,10 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>   WRITE_ONCE(prev->next, node);
>  
>   /* Wait for mcs node lock to be released */
> + spin_begin();
>   while (!node->locked)
>   yield_to_prev(lock, node, prev_cpu, paravirt);
> + spin_end();
>  
>   /* Clear out stale propagated yield_cpu */
>   if (paravirt && pv_yield_propagate_owner && node->yield_cpu != 
> -1)
> @@ -432,10 +450,12 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>   int set_yield_cpu = -1;
>  
>   /* We're at the head of the waitqueue, wait for the lock. */
> + spin_begin();
>   while ((val = R

Re: [PATCH 13/17] powerpc/qspinlock: trylock and initial lock attempt may steal

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> This gives trylock slightly more strength, and it also gives most
> of the benefit of passing 'val' back through the slowpath without
> the complexity.
> ---
>  arch/powerpc/include/asm/qspinlock.h | 39 +++-
>  arch/powerpc/lib/qspinlock.c |  9 +++
>  2 files changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/qspinlock.h 
> b/arch/powerpc/include/asm/qspinlock.h
> index 44601b261e08..d3d2039237b2 100644
> --- a/arch/powerpc/include/asm/qspinlock.h
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -5,6 +5,8 @@
>  #include 
>  #include 
>  
> +#define _Q_SPIN_TRY_LOCK_STEAL 1

Would this be a config option?

> +
>  static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
>  {
>   return READ_ONCE(lock->val);
> @@ -26,11 +28,12 @@ static __always_inline u32 
> queued_spin_get_locked_val(void)
>   return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
>  }
>  
> -static __always_inline int queued_spin_trylock(struct qspinlock *lock)
> +static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock 
> *lock)
>  {
>   u32 new = queued_spin_get_locked_val();
>   u32 prev;
>  
> + /* Trylock succeeds only when unlocked and no queued nodes */
>   asm volatile(
>  "1:  lwarx   %0,0,%1,%3  # queued_spin_trylock   \n"

s/queued_spin_trylock/__queued_spin_trylock_nosteal

>  "cmpwi   0,%0,0  \n"
> @@ -49,6 +52,40 @@ static __always_inline int queued_spin_trylock(struct 
> qspinlock *lock)
>   return 0;
>  }
>  
> +static __always_inline int __queued_spin_trylock_steal(struct qspinlock 
> *lock)
> +{
> + u32 new = queued_spin_get_locked_val();
> + u32 prev, tmp;
> +
> + /* Trylock may get ahead of queued nodes if it finds unlocked */
> + asm volatile(
> +"1:  lwarx   %0,0,%2,%5  # queued_spin_trylock   \n"

s/queued_spin_trylock/__queued_spin_trylock_steal

> +"andc.   %1,%0,%4\n"
> +"bne-2f  \n"
> +"and %1,%0,%4\n"
> +"or  %1,%1,%3\n"
> +"stwcx.  %1,0,%2 \n"
> +"bne-1b  \n"
> +"\t" PPC_ACQUIRE_BARRIER "   \n"
> +"2:  \n"

Just because there's a little bit more going on here...

_Q_TAIL_CPU_MASK = 0xFFFE0000
~_Q_TAIL_CPU_MASK = 0x0001FFFF


1:  lwarx   prev, 0, &lock->val, IS_ENABLED_PPC64
andc.   tmp, prev, _Q_TAIL_CPU_MASK (tmp = prev & ~_Q_TAIL_CPU_MASK)
bne-2f  (exit if locked)
and tmp, prev, _Q_TAIL_CPU_MASK (tmp = prev & _Q_TAIL_CPU_MASK)
or  tmp, tmp, new   (tmp |= new)

stwcx.  tmp, 0, &lock->val  

bne-1b  
PPC_ACQUIRE_BARRIER 
2:

... which seems correct.
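Roughly the same thing in C (just a sketch, ignoring that larx/stcx. makes
the read-modify-write atomic):

	do {
		prev = READ_ONCE(lock->val);
		if (prev & ~_Q_TAIL_CPU_MASK)	/* locked (or must queue): give up */
			break;
		/* keep the tail, set the owner and locked bits */
		tmp = (prev & _Q_TAIL_CPU_MASK) | new;
	} while (cmpxchg_acquire(&lock->val, prev, tmp) != prev);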


> + : "=&r" (prev), "=&r" (tmp)
> + : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK),
> +   "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
> + : "cr0", "memory");
> +
> + if (likely(!(prev & ~_Q_TAIL_CPU_MASK)))
> + return 1;
> + return 0;
> +}
> +
> +static __always_inline int queued_spin_trylock(struct qspinlock *lock)
> +{
> + if (!_Q_SPIN_TRY_LOCK_STEAL)
> + return __queued_spin_trylock_nosteal(lock);
> + else
> + return __queued_spin_trylock_steal(lock);
> +}
> +
>  void queued_spin_lock_slowpath(struct qspinlock *lock);
>  
>  static __always_inline void queued_spin_lock(struct qspinlock *lock)
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 3b10e31bcf0a..277aef1fab0a 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -24,7 +24,11 @@ struct qnodes {
>  
>  /* Tuning parameters */
>  static int STEAL_SPINS __read_mostly = (1<<5);
> +#if _Q_SPIN_TRY_LOCK_STEAL == 1
> +static const bool MAYBE_STEALERS = true;
> +#else
>  static bool MAYBE_STEALERS __read_mostly = true;
> +#endif
>  static int HEAD_SPINS __read_mostly = (1<<8);
>  
>  static bool pv_yield_owner __read_mostly = true;
> @@ -522,6 +526,10 @@ void pv_spinlocks_init(void)
>  #include 
>  static int steal_spins_set(void *data, u64 val)
>  {
> +#if _Q_SPIN_TRY_LOCK_STEAL == 1
> + /* MAYBE_STEAL remains true */
> + STEAL_SPINS = val;
> +#else
>   static DEFINE_MUTEX(lock);
>  
>   mutex_lock(&lock);
> @@ -539,6 +547,7 @@ static int steal_spins_set(

Re: [PATCH 12/17] powerpc/qspinlock: add ability to prod new queue head CPU

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> After the head of the queue acquires the lock, it releases the
> next waiter in the queue to become the new head. Add an option
> to prod the new head if its vCPU was preempted. This may only
> have an effect if queue waiters are yielding.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 29 -
>  1 file changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 28c85a2d5635..3b10e31bcf0a 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -12,6 +12,7 @@
>  struct qnode {
>   struct qnode*next;
>   struct qspinlock *lock;
> + int cpu;
>   int yield_cpu;
>   u8  locked; /* 1 if lock acquired */
>  };
> @@ -30,6 +31,7 @@ static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
>  static bool pv_yield_propagate_owner __read_mostly = true;
> +static bool pv_prod_head __read_mostly = false;
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -392,6 +394,7 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>   node = &qnodesp->nodes[idx];
>   node->next = NULL;
>   node->lock = lock;
> + node->cpu = smp_processor_id();

I suppose this could be used in some other places too.

For example change:
yield_to_prev(lock, node, prev, paravirt);

In yield_to_prev() it could then access prev->cpu.
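i.e. (untested, passing the prev qnode pointer instead of prev_cpu):

static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, struct qnode *prev, bool paravirt)
{
	int prev_cpu = prev->cpu;

	/* ... rest unchanged ... */
}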

>   node->yield_cpu = -1;
>   node->locked = 0;
>  
> @@ -483,7 +486,14 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>* this store to locked. The corresponding barrier is the smp_rmb()
>* acquire barrier for mcs lock, above.
>*/
> - WRITE_ONCE(next->locked, 1);
> + if (paravirt && pv_prod_head) {
> + int next_cpu = next->cpu;
> + WRITE_ONCE(next->locked, 1);
> + if (vcpu_is_preempted(next_cpu))
> + prod_cpu(next_cpu);
> + } else {
> + WRITE_ONCE(next->locked, 1);
> + }
>  
>  release:
>   qnodesp->count--; /* release the node */
> @@ -622,6 +632,22 @@ static int pv_yield_propagate_owner_get(void *data, u64 
> *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, 
> pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
>  
> +static int pv_prod_head_set(void *data, u64 val)
> +{
> + pv_prod_head = !!val;
> +
> + return 0;
> +}
> +
> +static int pv_prod_head_get(void *data, u64 *val)
> +{
> + *val = pv_prod_head;
> +
> + return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, 
> pv_prod_head_set, "%llu\n");
> +
>  static __init int spinlock_debugfs_init(void)
>  {
>   debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
> &fops_steal_spins);
> @@ -631,6 +657,7 @@ static __init int spinlock_debugfs_init(void)
>   debugfs_create_file("qspl_pv_yield_allow_steal", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
>   debugfs_create_file("qspl_pv_yield_prev", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_prev);
>   debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
> + debugfs_create_file("qspl_pv_prod_head", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_prod_head);
>   }
>  
>   return 0;



Re: [PATCH 11/17] powerpc/qspinlock: allow propagation of yield CPU down the queue

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Having all CPUs poll the lock word for the owner CPU that should be
> yielded to defeats most of the purpose of using MCS queueing for
> scalability. Yet it may be desirable for queued waiters to yield
> to a preempted owner.
> 
> s390 addresses this problem by having queued waiters sample the lock
> word to find the owner much less frequently. In this approach, the
> waiters never sample it directly, but the queue head propagates the
> owner CPU back to the next waiter if it ever finds the owner has
> been preempted. Queued waiters then subsequently propagate the owner
> CPU back to the next waiter, and so on.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 85 +++-
>  1 file changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 94f007f66942..28c85a2d5635 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -12,6 +12,7 @@
>  struct qnode {
>   struct qnode*next;
>   struct qspinlock *lock;
> + int yield_cpu;
>   u8  locked; /* 1 if lock acquired */
>  };
>  
> @@ -28,6 +29,7 @@ static int HEAD_SPINS __read_mostly = (1<<8);
>  static bool pv_yield_owner __read_mostly = true;
>  static bool pv_yield_allow_steal __read_mostly = false;
>  static bool pv_yield_prev __read_mostly = true;
> +static bool pv_yield_propagate_owner __read_mostly = true;

This also seems to be enabled by default.

>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -257,13 +259,66 @@ static __always_inline void 
> yield_head_to_locked_owner(struct qspinlock *lock, u
>   __yield_to_locked_owner(lock, val, paravirt, clear_mustq);
>  }
>  
> +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, 
> int *set_yield_cpu, bool paravirt)
> +{
> + struct qnode *next;
> + int owner;
> +
> + if (!paravirt)
> + return;
> + if (!pv_yield_propagate_owner)
> + return;
> +
> + owner = get_owner_cpu(val);
> + if (*set_yield_cpu == owner)
> + return;
> +
> + next = READ_ONCE(node->next);
> + if (!next)
> + return;
> +
> + if (vcpu_is_preempted(owner)) {

Is there a difference between using vcpu_is_preempted() here
and checking bit 0 of the yield count as in other places?


> + next->yield_cpu = owner;
> + *set_yield_cpu = owner;
> + } else if (*set_yield_cpu != -1) {

It might be worth giving the -1 CPU a #define.
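E.g. (the name is only a suggestion):

#define YIELD_CPU_NONE	(-1)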

> + next->yield_cpu = owner;
> + *set_yield_cpu = owner;
> + }
> +}

Does this need to pass set_yield_cpu by reference? Couldn't its new value be
returned? To me that makes it clearer that the function is used to change
set_yield_cpu. I think this would work:

int set_yield_cpu = -1;

static __always_inline int propagate_yield_cpu(struct qnode *node, u32 val, int 
set_yield_cpu, bool paravirt)
{
struct qnode *next;
int owner;

if (!paravirt)
goto out;
if (!pv_yield_propagate_owner)
goto out;

owner = get_owner_cpu(val);
if (set_yield_cpu == owner)
goto out;

next = READ_ONCE(node->next);
if (!next)
goto out;

if (vcpu_is_preempted(owner)) {
next->yield_cpu = owner;
return owner;
} else if (set_yield_cpu != -1) {
next->yield_cpu = owner;
return owner;
}

out:
return set_yield_cpu;
}

set_yield_cpu = propagate_yield_cpu(...  set_yield_cpu ...);



> +
>  static __always_inline void yield_to_prev(struct qspinlock *lock, struct 
> qnode *node, int prev_cpu, bool paravirt)
>  {
>   u32 yield_count;
> + int yield_cpu;
>  
>   if (!paravirt)
>   goto relax;
>  
> + if (!pv_yield_propagate_owner)
> + goto yield_prev;
> +
> + yield_cpu = READ_ONCE(node->yield_cpu);
> + if (yield_cpu == -1) {
> + /* Propagate back the -1 CPU */
> + if (node->next && node->next->yield_cpu != -1)
> + node->next->yield_cpu = yield_cpu;
> + goto yield_prev;
> + }
> +
> + yield_count = yield_count_of(yield_cpu);
> + if ((yield_count & 1) == 0)
> + goto yield_prev; /* owner vcpu is running */
> +
> + smp_rmb();
> +
> + if (yield_cpu == node->yield_cpu) {
> + if (node->next && node->next->yield_cpu != yield_cpu)
> + node->next->yield_cpu = yield_cpu;
> + yield_to_preempted(yield_cpu, yield_count);
> + return;
> + }
> +
> +yield_prev:
>   if (!pv_yield_prev)
>   goto relax;
>  
> @@ -337,6 +392,7 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>   node

Re: [PATCH 10/17] powerpc/qspinlock: allow stealing when head of queue yields

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> If the head of queue is preventing stealing but it finds the owner vCPU
> is preempted, it will yield its cycles to the owner which could cause it
> to become preempted. Add an option to re-allow stealers before yielding,
> and disallow them again after returning from the yield.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 56 ++--
>  1 file changed, 53 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index b39f8c5b329c..94f007f66942 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -26,6 +26,7 @@ static bool MAYBE_STEALERS __read_mostly = true;
>  static int HEAD_SPINS __read_mostly = (1<<8);
>  
>  static bool pv_yield_owner __read_mostly = true;
> +static bool pv_yield_allow_steal __read_mostly = false;

To me this one does read as a boolean, but if you go with those other changes
I'd make it pv_yield_steal_enable to be consistent.

>  static bool pv_yield_prev __read_mostly = true;
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
> @@ -173,6 +174,23 @@ static __always_inline u32 lock_set_mustq(struct 
> qspinlock *lock)
>   return prev;
>  }
>  
> +static __always_inline u32 lock_clear_mustq(struct qspinlock *lock)
> +{
> + u32 new = _Q_MUST_Q_VAL;
> + u32 prev;
> +
> + asm volatile(
> +"1:  lwarx   %0,0,%1 # lock_clear_mustq  \n"
> +"andc%0,%0,%2\n"
> +"stwcx.  %0,0,%1 \n"
> +"bne-1b  \n"
> + : "=&r" (prev)
> + : "r" (&lock->val), "r" (new)
> + : "cr0", "memory");
> +

This is pretty similar to the DEFINE_TESTOP() pattern again with the same llong 
caveat.


> + return prev;
> +}
> +
>  static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
>  {
>   int cpu = get_tail_cpu(val);
> @@ -188,7 +206,7 @@ static struct qnode *get_tail_qnode(struct qspinlock 
> *lock, u32 val)
>   BUG();
>  }
>  
> -static __always_inline void yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt)
> +static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt, bool clear_mustq)

 /* See yield_to_locked_owner comment */ comment needs to be updated now.


>  {
>   int owner;
>   u32 yield_count;
> @@ -217,7 +235,11 @@ static __always_inline void yield_to_locked_owner(struct 
> qspinlock *lock, u32 va
>   smp_rmb();
>  
>   if (READ_ONCE(lock->val) == val) {
> + if (clear_mustq)
> + lock_clear_mustq(lock);
>   yield_to_preempted(owner, yield_count);
> + if (clear_mustq)
> + lock_set_mustq(lock);
>   /* Don't relax if we yielded. Maybe we should? */
>   return;
>   }
> @@ -225,6 +247,16 @@ static __always_inline void yield_to_locked_owner(struct 
> qspinlock *lock, u32 va
>   cpu_relax();
>  }
>  
> +static __always_inline void yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt)
> +{
> + __yield_to_locked_owner(lock, val, paravirt, false);
> +}
> +
> +static __always_inline void yield_head_to_locked_owner(struct qspinlock 
> *lock, u32 val, bool paravirt, bool clear_mustq)
> +{

The check for pv_yield_allow_steal seems like it could go here instead of
being done by the caller.
__yield_to_locked_owner() checks for pv_yield_owner so it seems more
  consistent.



> + __yield_to_locked_owner(lock, val, paravirt, clear_mustq);
> +}
> +
>  static __always_inline void yield_to_prev(struct qspinlock *lock, struct 
> qnode *node, int prev_cpu, bool paravirt)
>  {
>   u32 yield_count;
> @@ -332,7 +364,7 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>   if (!MAYBE_STEALERS) {
>   /* We're at the head of the waitqueue, wait for the lock. */
>   while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> - yield_to_locked_owner(lock, val, paravirt);
> + yield_head_to_locked_owner(lock, val, paravirt, false);
>  
>   /* If we're the last queued, must clean up the tail. */
>   if ((val & _Q_TAIL_CPU_MASK) == tail) {
> @@ -350,7 +382,8 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>  again:
>   /* We're at the head of the waitqueue, wait for the lock. */
>   while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> - yield_to_locked_owner(lock, val, paravirt);
> + yield_head_to_locked_owner(lock, val, paravirt,
> + pv_yield_allow_steal && set_mustq);
>  
>   

Re: [PATCH 09/17] powerpc/qspinlock: implement option to yield to previous node

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Queued waiters which are not at the head of the queue don't spin on
> the lock word but their qnode lock word, waiting for the previous queued
> CPU to release them. Add an option which allows these waiters to yield
> to the previous CPU if its vCPU is preempted.
> 
> Disable this option by default for now, i.e., no logical change.
> ---
>  arch/powerpc/lib/qspinlock.c | 46 +++-
>  1 file changed, 45 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 55286ac91da5..b39f8c5b329c 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -26,6 +26,7 @@ static bool MAYBE_STEALERS __read_mostly = true;
>  static int HEAD_SPINS __read_mostly = (1<<8);
>  
>  static bool pv_yield_owner __read_mostly = true;
> +static bool pv_yield_prev __read_mostly = true;

Similar suggestion, maybe pv_yield_prev_enabled would read better.

Isn't this enabled by default contrary to the commit message?


>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -224,6 +225,31 @@ static __always_inline void yield_to_locked_owner(struct 
> qspinlock *lock, u32 va
>   cpu_relax();
>  }
>  
> +static __always_inline void yield_to_prev(struct qspinlock *lock, struct 
> qnode *node, int prev_cpu, bool paravirt)

yield_to_locked_owner() takes a raw val and works out the cpu to yield to.
I think for consistency yield_to_prev() should take the raw val and work it
out too.
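i.e. (untested):

static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
{
	int prev_cpu = get_tail_cpu(val);

	/* ... rest as before ... */
}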

> +{
> + u32 yield_count;
> +
> + if (!paravirt)
> + goto relax;
> +
> + if (!pv_yield_prev)
> + goto relax;
> +
> + yield_count = yield_count_of(prev_cpu);
> + if ((yield_count & 1) == 0)
> + goto relax; /* owner vcpu is running */
> +
> + smp_rmb(); /* See yield_to_locked_owner comment */
> +
> + if (!node->locked) {
> + yield_to_preempted(prev_cpu, yield_count);
> + return;
> + }
> +
> +relax:
> + cpu_relax();
> +}
> +
>  
>  static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool 
> paravirt)
>  {
> @@ -291,13 +317,14 @@ static __always_inline void 
> queued_spin_lock_mcs_queue(struct qspinlock *lock, b
>*/
>   if (old & _Q_TAIL_CPU_MASK) {
>   struct qnode *prev = get_tail_qnode(lock, old);
> + int prev_cpu = get_tail_cpu(old);

This could then be removed.

>  
>   /* Link @node into the waitqueue. */
>   WRITE_ONCE(prev->next, node);
>  
>   /* Wait for mcs node lock to be released */
>   while (!node->locked)
> - cpu_relax();
> + yield_to_prev(lock, node, prev_cpu, paravirt);

And would have this as:
yield_to_prev(lock, node, old, paravirt);


>  
>   smp_rmb(); /* acquire barrier for the mcs lock */
>   }
> @@ -448,12 +475,29 @@ static int pv_yield_owner_get(void *data, u64 *val)
>  
>  DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, 
> pv_yield_owner_set, "%llu\n");
>  
> +static int pv_yield_prev_set(void *data, u64 val)
> +{
> + pv_yield_prev = !!val;
> +
> + return 0;
> +}
> +
> +static int pv_yield_prev_get(void *data, u64 *val)
> +{
> + *val = pv_yield_prev;
> +
> + return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, 
> pv_yield_prev_set, "%llu\n");
> +
>  static __init int spinlock_debugfs_init(void)
>  {
>   debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
> &fops_steal_spins);
>   debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, 
> &fops_head_spins);
>   if (is_shared_processor()) {
>   debugfs_create_file("qspl_pv_yield_owner", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_owner);
> + debugfs_create_file("qspl_pv_yield_prev", 0600, 
> arch_debugfs_dir, NULL, &fops_pv_yield_prev);
>   }
>  
>   return 0;



Re: [PATCH 08/17] powerpc/qspinlock: paravirt yield to lock owner

2022-08-11 Thread Jordan Niethe
 On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Waiters spinning on the lock word should yield to the lock owner if the
> vCPU is preempted. This improves performance when the hypervisor has
> oversubscribed physical CPUs.
> ---
>  arch/powerpc/lib/qspinlock.c | 97 ++--
>  1 file changed, 83 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index aa26cfe21f18..55286ac91da5 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -5,6 +5,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define MAX_NODES4
>  
> @@ -24,14 +25,16 @@ static int STEAL_SPINS __read_mostly = (1<<5);
>  static bool MAYBE_STEALERS __read_mostly = true;
>  static int HEAD_SPINS __read_mostly = (1<<8);
>  
> +static bool pv_yield_owner __read_mostly = true;

Not macro case for these globals? To me the name does not make it super
clear this is a boolean. What about pv_yield_owner_enabled?

> +
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> -static __always_inline int get_steal_spins(void)
> +static __always_inline int get_steal_spins(bool paravirt)
>  {
>   return STEAL_SPINS;
>  }
>  
> -static __always_inline int get_head_spins(void)
> +static __always_inline int get_head_spins(bool paravirt)
>  {
>   return HEAD_SPINS;
>  }
> @@ -46,7 +49,11 @@ static inline int get_tail_cpu(u32 val)
>   return (val >> _Q_TAIL_CPU_OFFSET) - 1;
>  }
>  
> -/* Take the lock by setting the bit, no other CPUs may concurrently lock it. 
> */
> +static inline int get_owner_cpu(u32 val)
> +{
> + return (val & _Q_OWNER_CPU_MASK) >> _Q_OWNER_CPU_OFFSET;
> +}
> +
>  /* Take the lock by setting the lock bit, no other CPUs will touch it. */
>  static __always_inline void lock_set_locked(struct qspinlock *lock)
>  {
> @@ -180,7 +187,45 @@ static struct qnode *get_tail_qnode(struct qspinlock 
> *lock, u32 val)
>   BUG();
>  }
>  
> -static inline bool try_to_steal_lock(struct qspinlock *lock)
> +static __always_inline void yield_to_locked_owner(struct qspinlock *lock, 
> u32 val, bool paravirt)

This name doesn't seem correct for the non paravirt case.

> +{
> + int owner;
> + u32 yield_count;
> +
> + BUG_ON(!(val & _Q_LOCKED_VAL));
> +
> + if (!paravirt)
> + goto relax;
> +
> + if (!pv_yield_owner)
> + goto relax;
> +
> + owner = get_owner_cpu(val);
> + yield_count = yield_count_of(owner);
> +
> + if ((yield_count & 1) == 0)
> + goto relax; /* owner vcpu is running */

I wonder why not use vcpu_is_preempted()?
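i.e. (sketch):

	if (!vcpu_is_preempted(owner))
		goto relax;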

> +
> + /*
> +  * Read the lock word after sampling the yield count. On the other side
> +  * there may be a wmb because the yield count update is done by the
> +  * hypervisor preemption and the value update by the OS, however this
> +  * ordering might reduce the chance of out of order accesses and
> +  * improve the heuristic.
> +  */
> + smp_rmb();
> +
> + if (READ_ONCE(lock->val) == val) {
> + yield_to_preempted(owner, yield_count);
> + /* Don't relax if we yielded. Maybe we should? */
> + return;
> + }
> +relax:
> + cpu_relax();
> +}
> +
> +
> +static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool 
> paravirt)
>  {
>   int iters;
>  
> @@ -197,18 +242,18 @@ static inline bool try_to_steal_lock(struct qspinlock 
> *lock)
>   continue;
>   }
>  
> - cpu_relax();
> + yield_to_locked_owner(lock, val, paravirt);
>  
>   iters++;
>  
> - if (iters >= get_steal_spins())
> + if (iters >= get_steal_spins(paravirt))
>   break;
>   }
>  
>   return false;
>  }
>  
> -static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
> +static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock 
> *lock, bool paravirt)
>  {
>   struct qnodes *qnodesp;
>   struct qnode *next, *node;
> @@ -260,7 +305,7 @@ static inline void queued_spin_lock_mcs_queue(struct 
> qspinlock *lock)
>   if (!MAYBE_STEALERS) {
>   /* We're at the head of the waitqueue, wait for the lock. */
>   while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> - cpu_relax();
> + yield_to_locked_owner(lock, val, paravirt);
>  
>   /* If we're the last queued, must clean up the tail. */
>   if ((val & _Q_TAIL_CPU_MASK) == tail) {
> @@ -278,10 +323,10 @@ static inline void queued_spin_lock_mcs_queue(struct 
> qspinlock *lock)
>  again:
>   /* We're at the head of the waitqueue, wait for the lock. */
>   while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> - cpu_relax();
> + yield_to_locked_owner(lock, val, paravirt);
>  
>   iters++;

Re: [PATCH 07/17] powerpc/qspinlock: store owner CPU in lock word

2022-08-11 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> Store the owner CPU number in the lock word so it may be yielded to,
> as powerpc's paravirtualised simple spinlocks do.
> ---
>  arch/powerpc/include/asm/qspinlock.h   |  8 +++-
>  arch/powerpc/include/asm/qspinlock_types.h | 10 ++
>  arch/powerpc/lib/qspinlock.c   |  6 +++---
>  3 files changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/qspinlock.h 
> b/arch/powerpc/include/asm/qspinlock.h
> index 3ab354159e5e..44601b261e08 100644
> --- a/arch/powerpc/include/asm/qspinlock.h
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -20,9 +20,15 @@ static __always_inline int queued_spin_is_contended(struct 
> qspinlock *lock)
>   return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
>  }
>  
> +static __always_inline u32 queued_spin_get_locked_val(void)

Maybe this function should have "encode" in the name to match with
encode_tail_cpu().


> +{
> + /* XXX: make this use lock value in paca like simple spinlocks? */

Is that the paca's lock_token which is 0x8000?


> + return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
> +}
> +
>  static __always_inline int queued_spin_trylock(struct qspinlock *lock)
>  {
> - u32 new = _Q_LOCKED_VAL;
> + u32 new = queued_spin_get_locked_val();
>   u32 prev;
>  
>   asm volatile(
> diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
> b/arch/powerpc/include/asm/qspinlock_types.h
> index 8b20f5e22bba..35f9525381e6 100644
> --- a/arch/powerpc/include/asm/qspinlock_types.h
> +++ b/arch/powerpc/include/asm/qspinlock_types.h
> @@ -29,6 +29,8 @@ typedef struct qspinlock {
>   * Bitfields in the lock word:
>   *
>   * 0: locked bit
> + *  1-14: lock holder cpu
> + *    15: unused bit
>   *    16: must queue bit
>   * 17-31: tail cpu (+1)

So there is one more bit to store the tail cpu vs the lock holder cpu?

>   */
> @@ -39,6 +41,14 @@ typedef struct qspinlock {
>  #define _Q_LOCKED_MASK   _Q_SET_MASK(LOCKED)
>  #define _Q_LOCKED_VAL(1U << _Q_LOCKED_OFFSET)
>  
> +#define _Q_OWNER_CPU_OFFSET  1
> +#define _Q_OWNER_CPU_BITS14
> +#define _Q_OWNER_CPU_MASK_Q_SET_MASK(OWNER_CPU)
> +
> +#if CONFIG_NR_CPUS > (1U << _Q_OWNER_CPU_BITS)
> +#error "qspinlock does not support such large CONFIG_NR_CPUS"
> +#endif
> +
>  #define _Q_MUST_Q_OFFSET 16
>  #define _Q_MUST_Q_BITS   1
>  #define _Q_MUST_Q_MASK   _Q_SET_MASK(MUST_Q)
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index a906cc8f15fa..aa26cfe21f18 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -50,7 +50,7 @@ static inline int get_tail_cpu(u32 val)
>  /* Take the lock by setting the lock bit, no other CPUs will touch it. */
>  static __always_inline void lock_set_locked(struct qspinlock *lock)
>  {
> - u32 new = _Q_LOCKED_VAL;
> + u32 new = queued_spin_get_locked_val();
>   u32 prev;
>  
>   asm volatile(
> @@ -68,7 +68,7 @@ static __always_inline void lock_set_locked(struct 
> qspinlock *lock)
>  /* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
>  static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, 
> u32 old)
>  {
> - u32 new = _Q_LOCKED_VAL;
> + u32 new = queued_spin_get_locked_val();
>   u32 prev;
>  
>   BUG_ON(old & _Q_LOCKED_VAL);
> @@ -116,7 +116,7 @@ static __always_inline u32 __trylock_cmpxchg(struct 
> qspinlock *lock, u32 old, u3
>  /* Take lock, preserving tail, cmpxchg with val (which must not be locked) */
>  static __always_inline int trylock_with_tail_cpu(struct qspinlock *lock, u32 
> val)
>  {
> - u32 newval = _Q_LOCKED_VAL | (val & _Q_TAIL_CPU_MASK);
> + u32 newval = queued_spin_get_locked_val() | (val & _Q_TAIL_CPU_MASK);
>  
>   if (__trylock_cmpxchg(lock, val, newval) == val)
>   return 1;



Re: [PATCH v2 04/10] drm/simpledrm: Compute framebuffer stride if not set

2022-08-11 Thread Thomas Zimmermann



On 11.08.22 20:26, Thomas Zimmermann wrote:

Hi Daniel

On 11.08.22 19:23, Daniel Vetter wrote:
On Wed, 27 Jul 2022 at 09:53, Thomas Zimmermann  
wrote:


Hi

On 25.07.22 17:13, Javier Martinez Canillas wrote:

Hello Geert,

On 7/21/22 16:46, Geert Uytterhoeven wrote:

Hi Thomas,

On Wed, Jul 20, 2022 at 4:27 PM Thomas Zimmermann 
 wrote:

Compute the framebuffer's scanline stride length if not given by
the simplefb data.

Signed-off-by: Thomas Zimmermann 


Thanks for your patch!


--- a/drivers/gpu/drm/tiny/simpledrm.c
+++ b/drivers/gpu/drm/tiny/simpledrm.c
@@ -743,6 +743,9 @@ static struct simpledrm_device 
*simpledrm_device_create(struct drm_driver *drv,

  drm_err(dev, "no simplefb configuration found\n");
  return ERR_PTR(-ENODEV);
  }
+   if (!stride)
+   stride = format->cpp[0] * width;


DIV_ROUND_UP(drm_format_info_bpp(format) * width, 8)



I think you meant here:

DIV_ROUND_UP(drm_format_info_bpp(format, 0) * width, 8) ?


I guess, that's the right function. My original code is correct, but cpp
is also deprecated.


You all mean drm_format_info_min_pitch().


Thanks a lot. I wasn't even aware of this function, but I had almost 
written my own implementation of it.  I'll update the patch accordingly.


Arghh, too late. I merged that patch already.



Best regards
Thomas



I really don't want drivers to go grab any of the legacy format info
fields like bpp or depth. switch() statements on the fourcc code for
programming registers, or one of the real helper functions in
drm_fourcc.c (there might be some gaps), but not ever going through
legacy concepts. Anything else just leads to subtle bugs when new
formats get added and oops suddenly the assumptions don't hold.

Those should be strictly limited to legacy (i.e. not drm_fourcc aware)
interfaces. Heck I think even fbdev emulation should completely switch
over to drm_fourcc/drm_format_info, but alas that's a pile of work and
not much payoff.

I'm trying to volunteer Same to add a legacy_bpp tag to the above
helper and appropriately limit it, I think limiting to formats with
depth!=0 is probably the right thing. And then we should probably
remove a pile of the cargo-culted depth!=0 entries too.
-Daniel



Best regards
Thomas



With that change,

Acked-by: Javier Martinez Canillas 



--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev








--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev




Re: [PATCH v2 04/10] drm/simpledrm: Compute framebuffer stride if not set

2022-08-11 Thread Thomas Zimmermann

Hi Daniel

On 11.08.22 19:23, Daniel Vetter wrote:

On Wed, 27 Jul 2022 at 09:53, Thomas Zimmermann  wrote:


Hi

On 25.07.22 17:13, Javier Martinez Canillas wrote:

Hello Geert,

On 7/21/22 16:46, Geert Uytterhoeven wrote:

Hi Thomas,

On Wed, Jul 20, 2022 at 4:27 PM Thomas Zimmermann  wrote:

Compute the framebuffer's scanline stride length if not given by
the simplefb data.

Signed-off-by: Thomas Zimmermann 


Thanks for your patch!


--- a/drivers/gpu/drm/tiny/simpledrm.c
+++ b/drivers/gpu/drm/tiny/simpledrm.c
@@ -743,6 +743,9 @@ static struct simpledrm_device 
*simpledrm_device_create(struct drm_driver *drv,
  drm_err(dev, "no simplefb configuration found\n");
  return ERR_PTR(-ENODEV);
  }
+   if (!stride)
+   stride = format->cpp[0] * width;


DIV_ROUND_UP(drm_format_info_bpp(format) * width, 8)



I think you meant here:

DIV_ROUND_UP(drm_format_info_bpp(format, 0) * width, 8) ?


I guess, that's the right function. My original code is correct, but cpp
is also deprecated.


You all mean drm_format_info_min_pitch().


Thanks a lot. I wasn't even aware of this function, but I had almost 
written my own implementation of it.  I'll update the patch accordingly.


Best regards
Thomas



I really don't want drivers to go grab any of the legacy format info
fields like bpp or depth. switch() statements on the fourcc code for
programming registers, or one of the real helper functions in
drm_fourcc.c (there might be some gaps), but not ever going through
legacy concepts. Anything else just leads to subtle bugs when new
formats get added and oops suddenly the assumptions don't hold.

Those should be strictly limited to legacy (i.e. not drm_fourcc aware)
interfaces. Heck I think even fbdev emulation should completely switch
over to drm_fourcc/drm_format_info, but alas that's a pile of work and
not much payoff.

I'm trying to volunteer Same to add a legacy_bpp tag to the above
helper and appropriately limit it, I think limiting to formats with
depth!=0 is probably the right thing. And then we should probably
remove a pile of the cargo-culted depth!=0 entries too.
-Daniel



Best regards
Thomas



With that change,

Acked-by: Javier Martinez Canillas 



--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev






--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev




Re: [PATCH v2 04/10] drm/simpledrm: Compute framebuffer stride if not set

2022-08-11 Thread Daniel Vetter
On Wed, 27 Jul 2022 at 09:53, Thomas Zimmermann  wrote:
>
> Hi
>
> On 25.07.22 17:13, Javier Martinez Canillas wrote:
> > Hello Geert,
> >
> > On 7/21/22 16:46, Geert Uytterhoeven wrote:
> >> Hi Thomas,
> >>
> >> On Wed, Jul 20, 2022 at 4:27 PM Thomas Zimmermann  wrote:
> >>> Compute the framebuffer's scanline stride length if not given by
> >>> the simplefb data.
> >>>
> >>> Signed-off-by: Thomas Zimmermann 
> >>
> >> Thanks for your patch!
> >>
> >>> --- a/drivers/gpu/drm/tiny/simpledrm.c
> >>> +++ b/drivers/gpu/drm/tiny/simpledrm.c
> >>> @@ -743,6 +743,9 @@ static struct simpledrm_device *simpledrm_device_create(struct drm_driver *drv,
> >>>  drm_err(dev, "no simplefb configuration found\n");
> >>>  return ERR_PTR(-ENODEV);
> >>>  }
> >>> +   if (!stride)
> >>> +   stride = format->cpp[0] * width;
> >>
> >> DIV_ROUND_UP(drm_format_info_bpp(format) * width, 8)
> >>
> >
> > I think you meant here:
> >
> > DIV_ROUND_UP(drm_format_info_bpp(format, 0) * width, 8) ?
>
> I guess that's the right function. My original code is correct, but cpp
> is also deprecated.

You all mean drm_format_info_min_pitch().

I really don't want drivers to go grab any of the legacy format info
fields like bpp or depth. switch() statements on the fourcc code for
programming registers, or one of the real helper functions in
drm_fourcc.c (there might be some gaps), but not ever going through
legacy concepts. Anything else just leads to subtle bugs when new
formats get added and oops suddenly the assumptions don't hold.

Those should be strictly limited to legacy (i.e. not drm_fourcc aware)
interfaces. Heck I think even fbdev emulation should completely switch
over to drm_fourcc/drm_format_info, but alas that's a pile of work and
not much payoff.

> I'm trying to volunteer Sam to add a legacy_bpp tag to the above
helper and appropriately limit it, I think limiting to formats with
depth!=0 is probably the right thing. And then we should probably
remove a pile of the cargo-culted depth!=0 entries too.
-Daniel

>
> Best regards
> Thomas
>
> >
> > With that change,
> >
> > Acked-by: Javier Martinez Canillas 
> >
>
> --
> Thomas Zimmermann
> Graphics Driver Developer
> SUSE Software Solutions Germany GmbH
> Maxfeldstr. 5, 90409 Nürnberg, Germany
> (HRB 36809, AG Nürnberg)
> Geschäftsführer: Ivo Totev



-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
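
For readers following the thread: drm_format_info_min_pitch(), declared in
include/drm/drm_fourcc.h, already encapsulates the DIV_ROUND_UP(bpp * width, 8)
computation per plane. Below is a minimal sketch of the fallback being
discussed, wrapped in a hypothetical helper (the helper name and its placement
are illustrative; the final simpledrm patch may inline this directly in
simpledrm_device_create()):

    #include <drm/drm_device.h>
    #include <drm/drm_fourcc.h>
    #include <drm/drm_print.h>

    /*
     * Hypothetical helper mirroring the discussed fallback; not the
     * literal patch. Returns 0 if the format/width pair is invalid.
     */
    static u64 simpledrm_fallback_stride(struct drm_device *dev,
                                         const struct drm_format_info *format,
                                         unsigned int width, u64 stride)
    {
            if (!stride) {
                    /* minimum pitch in bytes of plane 0 for "width" pixels */
                    stride = drm_format_info_min_pitch(format, 0, width);
                    drm_WARN_ON(dev, !stride); /* malformed format info */
            }
            return stride;
    }

Unlike the cpp-based computation, this also handles formats whose per-pixel
size is not a whole number of bytes, which is the point Daniel raises above.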


[PATCH AUTOSEL 4.9 07/12] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 1bfe867c0b7b..84a293d45cc5 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 #define IN_INTERRUPT in_interrupt()
 
-- 
2.35.1
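
The class of warning this one-line fix addresses is generic C hygiene: a
global defined in one translation unit and referenced from another should be
declared in a header that both files include, so the compiler can check that
declaration and definition agree. A toy illustration with hypothetical file
and symbol names (not the vmx driver's actual layout):

    /* widget.h -- hypothetical shared header */
    #ifndef WIDGET_H
    #define WIDGET_H
    extern int widget_count;        /* declaration only */
    #endif

    /* widget.c -- defines the symbol; including the header lets the
     * compiler verify the definition against the declaration, which is
     * what adding #include "aesp8-ppc.h" achieves for p8_ghash_alg. */
    #include "widget.h"
    int widget_count;

    /* user.c -- any other file just includes the header */
    #include "widget.h"
    int widgets_left(void) { return widget_count; }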



[PATCH AUTOSEL 4.14 08/14] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 1bfe867c0b7b..84a293d45cc5 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 #define IN_INTERRUPT in_interrupt()
 
-- 
2.35.1



[PATCH AUTOSEL 4.19 08/14] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 2d1a8cd35509..b1b067203426 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 #define IN_INTERRUPT in_interrupt()
 
-- 
2.35.1



[PATCH AUTOSEL 5.4 15/25] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 14807ac2e3b9..e419be7381c9 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
 void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
-- 
2.35.1



[PATCH AUTOSEL 5.10 30/46] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 14807ac2e3b9..e419be7381c9 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
 void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
-- 
2.35.1



[PATCH AUTOSEL 5.15 48/69] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 5bc5710a6de0..77eca20bc7ac 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
 void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
-- 
2.35.1



Re: [PATCH v2 11/14] powerpc/64s: Clear/restore caller gprs in syscall interrupt/return

2022-08-11 Thread Segher Boessenkool
On Thu, Aug 11, 2022 at 03:39:58PM +, Christophe Leroy wrote:
> 
> 
> Le 11/08/2022 à 17:13, Segher Boessenkool a écrit :
> > Hi!
> > 
> > On Mon, Jul 25, 2022 at 04:31:11PM +1000, Rohan McLure wrote:
> >> +  /*
> >> +   * Zero user registers to prevent influencing speculative execution
> >> +   * state of kernel code.
> >> +   */
> >> +  NULLIFY_GPRS(5, 12)
> >> +  NULLIFY_NVGPRS()
> > 
> > "Nullify" means "invalidate", which is not what this does or is for :-(
> 
> Would "Zeroise" be more appropriate ?

That is probably a good compromise, yes.  It obviously is a verb, its
meaning is clear and unambiguous, and there is precedent for it even :-)

Thanks,


Segher


[PATCH AUTOSEL 5.18 58/93] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 5bc5710a6de0..77eca20bc7ac 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
 void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
-- 
2.35.1



Re: [PATCH v2 11/14] powerpc/64s: Clear/restore caller gprs in syscall interrupt/return

2022-08-11 Thread Christophe Leroy


Le 11/08/2022 à 17:13, Segher Boessenkool a écrit :
> Hi!
> 
> On Mon, Jul 25, 2022 at 04:31:11PM +1000, Rohan McLure wrote:
>> +/*
>> + * Zero user registers to prevent influencing speculative execution
>> + * state of kernel code.
>> + */
>> +NULLIFY_GPRS(5, 12)
>> +NULLIFY_NVGPRS()
> 
> "Nullify" means "invalidate", which is not what this does or is for :-(
> 

Would "Zeroise" be more appropriate ?

Christophe

[PATCH AUTOSEL 5.19 065/105] crypto: vmx - Fix warning on p8_ghash_alg

2022-08-11 Thread Sasha Levin
From: Herbert Xu 

[ Upstream commit cc8166bfc829043020b5cc3b7cdba02a17d03b6d ]

The compiler complains that p8_ghash_alg isn't declared which is
because the header file aesp8-ppc.h isn't included in ghash.c.
This patch fixes the warning.

Signed-off-by: Herbert Xu 
Acked-by: Breno Leitao 
Signed-off-by: Herbert Xu 
Signed-off-by: Sasha Levin 
---
 drivers/crypto/vmx/ghash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 5bc5710a6de0..77eca20bc7ac 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include "aesp8-ppc.h"
 
 void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
 void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
-- 
2.35.1



Re: [PATCH v2 11/14] powerpc/64s: Clear/restore caller gprs in syscall interrupt/return

2022-08-11 Thread Segher Boessenkool
Hi!

On Mon, Jul 25, 2022 at 04:31:11PM +1000, Rohan McLure wrote:
> + /*
> +  * Zero user registers to prevent influencing speculative execution
> +  * state of kernel code.
> +  */
> + NULLIFY_GPRS(5, 12)
> + NULLIFY_NVGPRS()

"Nullify" means "invalidate", which is not what this does or is for :-(


Segher


Re: [5.19.0-next-20220811] Build failure drivers/vdpa

2022-08-11 Thread Stefano Garzarella
On Thu, Aug 11, 2022 at 12:06 PM Sachin Sant  wrote:
>
> 5.19.0-next-20220811 linux-next fails to build on IBM Power with
> following error:
>
> drivers/vdpa/vdpa_sim/vdpa_sim_blk.c: In function 'vdpasim_blk_handle_req':
> drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:201:3: error: a label can only be part of a statement and a declaration is not a statement
>struct virtio_blk_discard_write_zeroes range;
>^~
> drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:202:3: error: expected expression before 'u32'
>u32 num_sectors, flags;
>^~~
> drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:224:3: error: 'num_sectors' undeclared (first use in this function); did you mean 'bio_sectors'?
>num_sectors = vdpasim32_to_cpu(vdpasim, range.num_sectors);
>^~~
>bio_sectors
> drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:224:3: note: each undeclared identifier is reported only once for each function it appears in
> drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:225:3: error: 'flags' undeclared (first use in this function); did you mean 'class'?
>flags = vdpasim32_to_cpu(vdpasim, range.flags);
>^
>class
> make[3]: *** [scripts/Makefile.build:250: drivers/vdpa/vdpa_sim/vdpa_sim_blk.o] Error 1
> make[2]: *** [scripts/Makefile.build:525: drivers/vdpa/vdpa_sim] Error 2
> make[1]: *** [scripts/Makefile.build:525: drivers/vdpa] Error 2
> make[1]: *** Waiting for unfinished jobs….
>
> Git bisect points to the following patch
>
> commit d79b32c2e4a4e66d5678410cd45815c1c2375196
> Date:   Wed Aug 10 11:43:47 2022 +0200
> vdpa_sim_blk: add support for discard and write-zeroes
>

Thanks for the report, I already re-sent a new series with that patch fixed:
https://lore.kernel.org/virtualization/20220811083632.77525-1-sgarz...@redhat.com/T/#t

And it looks like it's already in Michael's tree:
https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next

Thanks,
Stefano
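
For context, the first error reported above comes from a C language rule that
was only relaxed in C23: a label (including a case label) must be attached to
a statement, and a declaration is not a statement. One common cure, shown here
as a reduced, hypothetical reproduction rather than the actual vdpa fix, is to
wrap the case body in braces so the label precedes a compound statement:

    /* Rejected by gcc before C23 modes:
     *
     *     case 1:
     *             int doubled = type * 2;  <-- declaration after a label
     *
     * Braces make the body a compound statement, which is legal:
     */
    int classify(int type)
    {
            switch (type) {
            case 1: {
                    int doubled = type * 2; /* declaration inside a block */
                    return doubled;
            }
            default:
                    return 0;
            }
    }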



Re: [PATCHv3, resend] powerpc: mm: radix_tlb: rearrange the if-else block

2022-08-11 Thread Anders Roxell
On Thu, 11 Aug 2022 at 11:41, Michael Ellerman  wrote:
>
> Anders Roxell  writes:
> > Clang warns:
> >
> > arch/powerpc/mm/book3s64/radix_tlb.c:1191:23: error: variable 'hstart' is uninitialized when used here [-Werror,-Wuninitialized]
> > __tlbiel_va_range(hstart, hend, pid,
> >   ^~
> > arch/powerpc/mm/book3s64/radix_tlb.c:1175:23: note: initialize the variable 'hstart' to silence this warning
> > unsigned long hstart, hend;
> > ^
> >  = 0
> > arch/powerpc/mm/book3s64/radix_tlb.c:1191:31: error: variable 'hend' is uninitialized when used here [-Werror,-Wuninitialized]
> > __tlbiel_va_range(hstart, hend, pid,
> >   ^~~~
> > arch/powerpc/mm/book3s64/radix_tlb.c:1175:29: note: initialize the variable 'hend' to silence this warning
> > unsigned long hstart, hend;
> >   ^
> >= 0
> > 2 errors generated.
>
> Which version & config are you building?

clang-13, clang-14 and clang-nightly, configs are cell_defconfig and
maple_defconfig

I'm building with tuxmake [1] like this:
$ tuxmake --runtime podman --target-arch powerpc --toolchain clang-14
--kconfig cell_defconfig

Cheers,
Anders
[1] https://tuxmake.org/
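
The warning pattern here is a well-known clang behaviour: initialization and
use are guarded by conditions that are correlated at runtime but that the
flow analysis does not connect, so the variables look possibly uninitialized.
A reduced model with hypothetical names (not the kernel code itself); the
patch under discussion silences it by rearranging the if-else block so that
initialization and use share a branch, rather than by adding a dummy "= 0":

    extern void do_range(unsigned long start, unsigned long end);

    void flush(unsigned long start, unsigned long end, int huge)
    {
            unsigned long hstart, hend;
            int hflush = 0;

            if (huge) {
                    hstart = start & ~0xfffUL; /* initialized only here */
                    hend = end;
                    hflush = 1;
            }

            /* hflush is set only on the path that initialized hstart and
             * hend, but clang does not correlate the two conditions and
             * may warn that they are used uninitialized below. */
            if (hflush)
                    do_range(hstart, hend);
    }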


Re: [5.19.0-next-20220811] Build failure drivers/vdpa

2022-08-11 Thread Sachin Sant



> On 11-Aug-2022, at 3:45 PM, Stefano Garzarella  wrote:
> 
>> Date:   Wed Aug 10 11:43:47 2022 +0200
>>vdpa_sim_blk: add support for discard and write-zeroes
>> 
> 
> Thanks for the report, I already re-sent a new series with that patch fixed:
> https://lore.kernel.org/virtualization/20220811083632.77525-1-sgarz...@redhat.com/T/#t

Thanks. That patch works for me.

- Sachin



Re: [PATCH v2 11/14] powerpc/64s: Clear/restore caller gprs in syscall interrupt/return

2022-08-11 Thread Andrew Donnellan
On Mon, 2022-07-25 at 16:31 +1000, Rohan McLure wrote:
> Clear user state in gprs (assign to zero) to reduce the influence of
> user registers on speculation within kernel syscall handlers. Clears
> occur at the very beginning of the sc and scv 0 interrupt handlers,
> with restores occurring following the execution of the syscall handler.
> 
> One function of syscall_exit_prepare is to determine when non-volatile
> regs must be restored, and it still serves that purpose on 32-bit. Use
> it now for determining where to find XER, CTR, CR.

I'm not sure exactly how syscall_exit_prepare comes into this?

> 
> Signed-off-by: Rohan McLure 
> ---
> V1 -> V2: Update summary
> ---
>  arch/powerpc/kernel/interrupt_64.S | 22 ++
>  1 file changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/interrupt_64.S
> b/arch/powerpc/kernel/interrupt_64.S
> index 3e8a811e09c4..34167cfa5d60 100644
> --- a/arch/powerpc/kernel/interrupt_64.S
> +++ b/arch/powerpc/kernel/interrupt_64.S
> @@ -70,7 +70,7 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name)
> ld  r2,PACATOC(r13)
> mfcrr12
> li  r11,0
> -   /* Can we avoid saving r3-r8 in common case? */
> +   /* Save syscall parameters in r3-r8 */
> std r3,GPR3(r1)
> std r4,GPR4(r1)
> std r5,GPR5(r1)
> @@ -108,6 +108,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>  * but this is the best we can do.
>  */
>  
> +   /*
> +    * Zero user registers to prevent influencing speculative execution
> +    * state of kernel code.
> +    */
> +   NULLIFY_GPRS(5, 12)
> +   NULLIFY_NVGPRS()
> +
> /* Calling convention has r3 = orig r0, r4 = regs */
> mr  r3,r0
> bl  system_call_exception
> @@ -138,6 +145,7 @@ BEGIN_FTR_SECTION
> HMT_MEDIUM_LOW
>  END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>  
> +   REST_NVGPRS(r1)
> cmpdi   r3,0
> bne .Lsyscall_vectored_\name\()_restore_regs
>  
> @@ -180,7 +188,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> ld  r4,_LINK(r1)
> ld  r5,_XER(r1)
>  
> -   REST_NVGPRS(r1)
> ld  r0,GPR0(r1)
> mtcrr2
> mtctr   r3
> @@ -248,7 +255,7 @@ END_BTB_FLUSH_SECTION
> ld  r2,PACATOC(r13)
> mfcrr12
> li  r11,0
> -   /* Can we avoid saving r3-r8 in common case? */
> +   /* Save syscall parameters in r3-r8 */
> std r3,GPR3(r1)
> std r4,GPR4(r1)
> std r5,GPR5(r1)
> @@ -298,6 +305,13 @@ END_BTB_FLUSH_SECTION
> wrteei  1
>  #endif
>  
> +   /*
> +    * Zero user registers to prevent influencing speculative execution
> +    * state of kernel code.
> +    */
> +   NULLIFY_GPRS(5, 12)
> +   NULLIFY_NVGPRS()
> +
> /* Calling convention has r3 = orig r0, r4 = regs */
> mr  r3,r0
> bl  system_call_exception
> @@ -340,6 +354,7 @@ BEGIN_FTR_SECTION
> stdcx.  r0,0,r1 /* to clear the reservation */
>  END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
>  
> +   REST_NVGPRS(r1)
> cmpdi   r3,0
> bne .Lsyscall_restore_regs
> /* Zero volatile regs that may contain sensitive kernel data */
> @@ -367,7 +382,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>  .Lsyscall_restore_regs:
> ld  r3,_CTR(r1)
> ld  r4,_XER(r1)
> -   REST_NVGPRS(r1)
> mtctr   r3
> mtspr   SPRN_XER,r4
> REST_GPR(0, r1)

-- 
Andrew DonnellanOzLabs, ADL Canberra
a...@linux.ibm.com   IBM Australia Limited



[5.19.0-next-20220811] Build failure drivers/vdpa

2022-08-11 Thread Sachin Sant
5.19.0-next-20220811 linux-next fails to build on IBM Power with 
following error:

drivers/vdpa/vdpa_sim/vdpa_sim_blk.c: In function 'vdpasim_blk_handle_req':
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:201:3: error: a label can only be part of a statement and a declaration is not a statement
   struct virtio_blk_discard_write_zeroes range;
   ^~
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:202:3: error: expected expression before 'u32'
   u32 num_sectors, flags;
   ^~~
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:224:3: error: 'num_sectors' undeclared (first use in this function); did you mean 'bio_sectors'?
   num_sectors = vdpasim32_to_cpu(vdpasim, range.num_sectors);
   ^~~
   bio_sectors
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:224:3: note: each undeclared identifier is reported only once for each function it appears in
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c:225:3: error: 'flags' undeclared (first use in this function); did you mean 'class'?
   flags = vdpasim32_to_cpu(vdpasim, range.flags);
   ^
   class
make[3]: *** [scripts/Makefile.build:250: drivers/vdpa/vdpa_sim/vdpa_sim_blk.o] Error 1
make[2]: *** [scripts/Makefile.build:525: drivers/vdpa/vdpa_sim] Error 2
make[1]: *** [scripts/Makefile.build:525: drivers/vdpa] Error 2
make[1]: *** Waiting for unfinished jobs….

Git bisect points to the following patch

commit d79b32c2e4a4e66d5678410cd45815c1c2375196
Date:   Wed Aug 10 11:43:47 2022 +0200
vdpa_sim_blk: add support for discard and write-zeroes

Thanks
- Sachin

Re: [PATCHv3, resend] powerpc: mm: radix_tlb: rearrange the if-else block

2022-08-11 Thread Michael Ellerman
Anders Roxell  writes:
> Clang warns:
>
> arch/powerpc/mm/book3s64/radix_tlb.c:1191:23: error: variable 'hstart' is uninitialized when used here [-Werror,-Wuninitialized]
> __tlbiel_va_range(hstart, hend, pid,
>   ^~
> arch/powerpc/mm/book3s64/radix_tlb.c:1175:23: note: initialize the variable 'hstart' to silence this warning
> unsigned long hstart, hend;
> ^
>  = 0
> arch/powerpc/mm/book3s64/radix_tlb.c:1191:31: error: variable 'hend' is uninitialized when used here [-Werror,-Wuninitialized]
> __tlbiel_va_range(hstart, hend, pid,
>   ^~~~
> arch/powerpc/mm/book3s64/radix_tlb.c:1175:29: note: initialize the variable 'hend' to silence this warning
> unsigned long hstart, hend;
>   ^
>= 0
> 2 errors generated.

Which version & config are you building?

I don't see this warning in my test builds.

cheers


Re: [PATCH] MAINTAINERS: Remove myself as EEH maintainer

2022-08-11 Thread Oliver O'Halloran
On Thu, Aug 11, 2022 at 4:22 PM Michael Ellerman  wrote:
>
> Russell Currey  writes:
> > I haven't touched EEH in a long time and I don't have much knowledge of the
> > subsystem at this point either, so it's misleading to have me as a
> > maintainer.
>
> Thank you for your service.
>
> > I remain grateful to Oliver for picking up my slack over the years.
>
> Ack.
>
> But I wonder if he is still happy being listed as the only maintainer.
> Given the status is "Supported" that means "Someone is actually paid to
> look after this" - and I suspect Oracle are probably not paying him to
> do that?

I'm still happy to field questions and/or give reviews occasionally if
needed, but yeah I don't have the time, hardware, or inclination to do
any actual maintenance. IIRC Mahesh was supposed to take over
supporting EEH after I left IBM. If he's still around he should
probably be listed as a maintainer.