Re: [PATCH 05/13] rcu/nocb: Use the rcuog CPU's ->nocb_timer

2021-03-15 Thread Paul E. McKenney
On Wed, Mar 10, 2021 at 11:05:07PM +0100, Frederic Weisbecker wrote:
> On Tue, Mar 02, 2021 at 05:15:57PM -0800, Paul E. McKenney wrote:
> > The first question is of course: Did you try this with lockdep enabled?  ;-)
> 
> Yep I always do. But I may miss some configs on my testings. I usually
> test at least TREE01 on x86 and arm64.
> 
> > > @@ -1702,43 +1692,50 @@ bool rcu_is_nocb_cpu(int cpu)
> > >   return false;
> > >  }
> > >  
> > > -/*
> > > - * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
> > > - * and this function releases it.
> > > - */
> > > -static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
> > > -  unsigned long flags)
> > > - __releases(rdp->nocb_lock)
> > > +static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
> > > +struct rcu_data *rdp,
> > > +bool force, unsigned long flags)
> > > + __releases(rdp_gp->nocb_gp_lock)
> > >  {
> > >   bool needwake = false;
> > > - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
> > >  
> > > > - lockdep_assert_held(&rdp->nocb_lock);
> > > >   if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
> > > > - rcu_nocb_unlock_irqrestore(rdp, flags);
> > > > + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
> > >   trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> > >   TPS("AlreadyAwake"));
> > >   return false;
> > >   }
> > >  
> > > - if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
> > > - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
> > > > - del_timer(&rdp->nocb_timer);
> > > + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
> > 
> > So there are no longer any data races involving ->nocb_defer_wakeup?
> > 
> > (Yes, I could fire up KCSAN, but my KCSAN-capable system is otherwise
> > occupied for several more hours.)
> 
> To be more specific, there is no more unlocked write to the timer
> (queue/cancel) and its matching ->nocb_defer_wakeup state. And there is only
> one (on purpose) racy reader of ->nocb_defer_wakeup, which is the non-timer
> deferred wakeup.
> 
> So the writes to the timer keep their WRITE_ONCE() and only the reader in
> do_nocb_deferred_wakeup() keeps its READ_ONCE(). Other readers are protected
> by the ->nocb_gp_lock.
> 
> > > +
> > >   // Advance callbacks if helpful and low contention.
> > >   needwake_gp = false;
> > > >   if (!rcu_segcblist_restempty(&rdp->cblist,
> > > @@ -2178,11 +2182,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> > >   my_rdp->nocb_gp_bypass = bypass;
> > >   my_rdp->nocb_gp_gp = needwait_gp;
> > >   my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
> > > - if (bypass && !rcu_nocb_poll) {
> > > - // At least one child with non-empty ->nocb_bypass, so set
> > > - // timer in order to avoid stranding its callbacks.
> > > + if (bypass) {
> > > >   raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
> > > > - mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
> > > > + // Avoid race with first bypass CB.
> > > > + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
> > > > + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
> > > > + del_timer(&my_rdp->nocb_timer);
> > > + }
> > 
> > Given that the timer does not get queued if rcu_nocb_poll, why not move the
> > above "if" statement under the one following?
> 
> It's done later in the set.
> 
> > 
> > > > + if (!rcu_nocb_poll) {
> > > > + // At least one child with non-empty ->nocb_bypass, so set
> > > > + // timer in order to avoid stranding its callbacks.
> > > > + mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
> > > > + }
> > > >   raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
> > >   }
> > >   if (rcu_nocb_poll) {
> > > @@ -2385,7 +2399,10 @@ static void do_nocb_deferred_wakeup_timer(struct 
> > > timer_list *t)
> > >   */
> > >  static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
> > >  {
> > > - if (rcu_nocb_need_deferred_wakeup(rdp))
> > > + if (!rdp->nocb_gp_rdp)
> > > + return false;
> > 
> > This check was not necessary previously because each CPU used its own rdp,
> > correct?
> 
> Exactly!
> 
> > The theory is that this early return is taken only during boot,
> > and that the spawning of the kthreads will act as an implicit wakeup?
> 
> You guessed right! That probably deserves a comment.

OK, I have queued these for further review and testing.  Also to
look at the overall effect.  Thank you very much!

Thanx, Paul


Re: [PATCH 05/13] rcu/nocb: Use the rcuog CPU's ->nocb_timer

2021-03-10 Thread Frederic Weisbecker
On Tue, Mar 02, 2021 at 05:15:57PM -0800, Paul E. McKenney wrote:
> The first question is of course: Did you try this with lockdep enabled?  ;-)

Yep I always do. But I may miss some configs on my testings. I usually
test at least TREE01 on x86 and arm64.

> > @@ -1702,43 +1692,50 @@ bool rcu_is_nocb_cpu(int cpu)
> > return false;
> >  }
> >  
> > -/*
> > - * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
> > - * and this function releases it.
> > - */
> > -static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
> > -unsigned long flags)
> > -   __releases(rdp->nocb_lock)
> > +static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
> > +  struct rcu_data *rdp,
> > +  bool force, unsigned long flags)
> > +   __releases(rdp_gp->nocb_gp_lock)
> >  {
> > bool needwake = false;
> > -   struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
> >  
> > > -   lockdep_assert_held(&rdp->nocb_lock);
> > > if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
> > > -   rcu_nocb_unlock_irqrestore(rdp, flags);
> > > +   raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
> > trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> > TPS("AlreadyAwake"));
> > return false;
> > }
> >  
> > -   if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
> > -   WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
> > > -   del_timer(&rdp->nocb_timer);
> > +   if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
> 
> So there are no longer any data races involving ->nocb_defer_wakeup?
> 
> (Yes, I could fire up KCSAN, but my KCSAN-capable system is otherwise
> occupied for several more hours.)

To be more specific, there is no more unlocked write to the timer
(queue/cancel) and its matching ->nocb_defer_wakeup state. And there is only
one (on purpose) racy reader of ->nocb_defer_wakeup, which is the non-timer
deferred wakeup.

So the writes to the timer keep their WRITE_ONCE() and only the reader in
do_nocb_deferred_wakeup() keeps its READ_ONCE(). Other readers are protected
by the ->nocb_gp_lock.

> > +
> > // Advance callbacks if helpful and low contention.
> > needwake_gp = false;
> > > if (!rcu_segcblist_restempty(&rdp->cblist,
> > @@ -2178,11 +2182,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> > my_rdp->nocb_gp_bypass = bypass;
> > my_rdp->nocb_gp_gp = needwait_gp;
> > my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
> > -   if (bypass && !rcu_nocb_poll) {
> > -   // At least one child with non-empty ->nocb_bypass, so set
> > -   // timer in order to avoid stranding its callbacks.
> > +   if (bypass) {
> > > raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
> > > -   mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
> > > +   // Avoid race with first bypass CB.
> > > +   if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
> > > +   WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
> > > +   del_timer(&my_rdp->nocb_timer);
> > +   }
> 
> Given that the timer does not get queued if rcu_nocb_poll, why not move the
> above "if" statement under the one following?

It's done later in the set.

> 
> > > +   if (!rcu_nocb_poll) {
> > > +   // At least one child with non-empty ->nocb_bypass, so set
> > > +   // timer in order to avoid stranding its callbacks.
> > > +   mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
> > > +   }
> > > raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
> > }
> > if (rcu_nocb_poll) {
> > @@ -2385,7 +2399,10 @@ static void do_nocb_deferred_wakeup_timer(struct 
> > timer_list *t)
> >   */
> >  static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
> >  {
> > -   if (rcu_nocb_need_deferred_wakeup(rdp))
> > +   if (!rdp->nocb_gp_rdp)
> > +   return false;
> 
> This check was not necessary previously because each CPU used its own rdp,
> correct?

Exactly!

> The theory is that this early return is taken only during boot,
> and that the spawning of the kthreads will act as an implicit wakeup?

You guessed right! That probably deserves a comment.

Thanks!


Re: [PATCH 05/13] rcu/nocb: Use the rcuog CPU's ->nocb_timer

2021-03-03 Thread Paul E. McKenney
On Tue, Feb 23, 2021 at 01:10:03AM +0100, Frederic Weisbecker wrote:
> Currently each offline rdp has its own nocb_timer armed when the
> nocb_gp wakeup must be deferred. This layout has many drawbacks,
> compared to a solution based on a single timer per rdp group:
> 
> * There are a lot of timers to maintain.
> 
> * The per-rdp ->nocb_lock must be held to queue and cancel the timer
>   and this lock can already be quite contended.
> 
> * One timer firing doesn't cancel the other timers in the same group:
>   - These other timers can thus cause spurious wakeups
>   - Each rdp that queued a timer must lock both ->nocb_lock and then
> ->nocb_gp_lock upon exit from the kernel to idle/user/guest mode.
> 
> * We can't cancel all of them if we detect an unflushed bypass in
>   nocb_gp_wait(). In fact currently we only ever cancel the nocb_timer
>   of the leader group.
> 
> * The leader group's nocb_timer is cancelled without locking ->nocb_lock
>   in nocb_gp_wait().  This currently appears to be safe but is an
>   accident waiting to happen.
> 
> * Since the timer acquires ->nocb_lock, it requires extra care in the
>   NOCB (de-)offloading process, requiring that it be either enabled or
>   disabled and flushed.
> 
> This commit instead uses the rcuog kthread's CPU's ->nocb_timer.
> It is protected by nocb_gp_lock, which is _way_ less contended and
> remains so even after this change.  As a matter of fact, the nocb_timer
> almost never fires and the deferred wakeup is mostly carried out upon
> idle/user/guest entry.  Now the early check performed at this point in
> do_nocb_deferred_wakeup() is done on rdp_gp->nocb_defer_wakeup, which
> is of course racy.  However, this raciness is harmless because we only
> need the guarantee that the timer is queued if we were the last one to
> queue it.  Any other situation (another CPU has queued it and we either
> see it or not) is fine.
> 
> This solves all the issues listed above.
> 
> Signed-off-by: Frederic Weisbecker 
> Cc: Josh Triplett 
> Cc: Lai Jiangshan 
> Cc: Joel Fernandes 
> Cc: Neeraj Upadhyay 
> Cc: Boqun Feng 

I pulled in the previous three (2-4/13) with the usual commit-log wordsmithing,
thank you!  And I could not resist wordsmithing above.

I do very much like the general approach, but a few questions below.

The first question is of course: Did you try this with lockdep enabled?  ;-)

> ---
>  kernel/rcu/tree.h|   1 -
>  kernel/rcu/tree_plugin.h | 142 +--
>  2 files changed, 78 insertions(+), 65 deletions(-)
> 
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 71821d59d95c..b280a843bd2c 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -257,7 +257,6 @@ struct rcu_data {
>  };
>  
>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
> -#define RCU_NOCB_WAKE_OFF    -1
>  #define RCU_NOCB_WAKE_NOT    0
>  #define RCU_NOCB_WAKE        1
>  #define RCU_NOCB_WAKE_FORCE  2
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index 587df271d640..847636d3e93d 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -33,10 +33,6 @@ static inline bool rcu_current_is_nocb_kthread(struct 
> rcu_data *rdp)
>   return false;
>  }
>  
> -static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
> -{
> - return (timer_curr_running(&rdp->nocb_timer) && !in_irq());
> -}
>  #else
>  static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
>  {
> @@ -48,11 +44,6 @@ static inline bool rcu_current_is_nocb_kthread(struct 
> rcu_data *rdp)
>   return false;
>  }
>  
> -static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
> -{
> - return false;
> -}
> -
>  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
>  
>  static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
> @@ -72,8 +63,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
> rcu_lockdep_is_held_nocb(rdp) ||
> (rdp == this_cpu_ptr(&rcu_data) &&
>  !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
> -   rcu_current_is_nocb_kthread(rdp) ||
> -   rcu_running_nocb_timer(rdp)),
> +   rcu_current_is_nocb_kthread(rdp)),
>   "Unsafe read of RCU_NOCB offloaded state"
>   );
>  
> @@ -1702,43 +1692,50 @@ bool rcu_is_nocb_cpu(int cpu)
>   return false;
>  }
>  
> -/*
> - * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
> - * and this function releases it.
> - */
> -static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
> -  unsigned long flags)
> - __releases(rdp->nocb_lock)
> +static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
> +struct rcu_data *rdp,
> +bool force, unsigned long flags)
> + __releases(rdp_gp->nocb_gp_lock)
>  {
>   bool needwake = false;
> - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
>  
> - lockdep_assert_held(&rdp->nocb_lock);
>