Re: [PATCH 05/13] rcu/nocb: Use the rcuog CPU's ->nocb_timer
On Wed, Mar 10, 2021 at 11:05:07PM +0100, Frederic Weisbecker wrote: > On Tue, Mar 02, 2021 at 05:15:57PM -0800, Paul E. McKenney wrote: > > The first question is of course: Did you try this with lockdep enabled? ;-) > > Yep I always do. But I may miss some configs on my testings. I usually > test at least TREE01 on x86 and arm64. > > > > @@ -1702,43 +1692,50 @@ bool rcu_is_nocb_cpu(int cpu) > > > return false; > > > } > > > > > > -/* > > > - * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock > > > - * and this function releases it. > > > - */ > > > -static bool wake_nocb_gp(struct rcu_data *rdp, bool force, > > > - unsigned long flags) > > > - __releases(rdp->nocb_lock) > > > +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, > > > +struct rcu_data *rdp, > > > +bool force, unsigned long flags) > > > + __releases(rdp_gp->nocb_gp_lock) > > > { > > > bool needwake = false; > > > - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; > > > > > > - lockdep_assert_held(&rdp->nocb_lock); > > > if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { > > > - rcu_nocb_unlock_irqrestore(rdp, flags); > > > + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); > > > trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, > > > TPS("AlreadyAwake")); > > > return false; > > > } > > > > > > - if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) { > > > - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); > > > - del_timer(&rdp->nocb_timer); > > > + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { > > > > So there are no longer any data races involving ->nocb_defer_wakeup? > > > > (Yes, I could fire up KCSAN, but my KCSAN-capable system is otherwise > > occupied for several more hours.) > > To be more specific, there is no more unlocked write to the timer > (queue/cancel) > and its nocb_defer_wakeup matching state. And there is only one (on purpose) > racy > reader of ->nocb_defer_wakeup which is the non-timer deferred wakeup.
> > So the writes to the timer keep their WRITE_ONCE() and only the reader in > do_nocb_deferred_wakeup() keeps its READ_ONCE(). Other readers are protected > by the ->nocb_gp_lock. > > > > + > > > // Advance callbacks if helpful and low contention. > > > needwake_gp = false; > > > if (!rcu_segcblist_restempty(&rdp->cblist, > > > @@ -2178,11 +2182,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) > > > my_rdp->nocb_gp_bypass = bypass; > > > my_rdp->nocb_gp_gp = needwait_gp; > > > my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; > > > - if (bypass && !rcu_nocb_poll) { > > > - // At least one child with non-empty ->nocb_bypass, so set > > > - // timer in order to avoid stranding its callbacks. > > > + if (bypass) { > > > raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); > > > - mod_timer(&my_rdp->nocb_bypass_timer, j + 2); > > > + // Avoid race with first bypass CB. > > > + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { > > > + WRITE_ONCE(my_rdp->nocb_defer_wakeup, > > > RCU_NOCB_WAKE_NOT); > > > + del_timer(&my_rdp->nocb_timer); > > > + } > > > > Given that the timer does not get queued if rcu_nocb_poll, why not move the > > above "if" statement under the one following? > > It's done later in the set. > > > > > > + if (!rcu_nocb_poll) { > > > + // At least one child with non-empty ->nocb_bypass, so > > > set > > > + // timer in order to avoid stranding its callbacks. > > > + mod_timer(&my_rdp->nocb_bypass_timer, j + 2); > > > + } > > > raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); > > > } > > > if (rcu_nocb_poll) { > > > @@ -2385,7 +2399,10 @@ static void do_nocb_deferred_wakeup_timer(struct > > > timer_list *t) > > > */ > > > static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) > > > { > > > - if (rcu_nocb_need_deferred_wakeup(rdp)) > > > + if (!rdp->nocb_gp_rdp) > > > + return false; > > > > This check was not necessary previously because each CPU used its own rdp, > > correct? > > Exactly!
> > > The theory is that this early return is taken only during boot, > > and that the spawning of the kthreads will act as an implicit wakeup? > > You guessed right! That probably deserves a comment. OK, I have queued these for further review and testing. Also to look at the overall effect. Thank you very much! Thanx, Paul
Re: [PATCH 05/13] rcu/nocb: Use the rcuog CPU's ->nocb_timer
On Tue, Mar 02, 2021 at 05:15:57PM -0800, Paul E. McKenney wrote: > The first question is of course: Did you try this with lockdep enabled? ;-) Yep I always do. But I may miss some configs on my testings. I usually test at least TREE01 on x86 and arm64. > > @@ -1702,43 +1692,50 @@ bool rcu_is_nocb_cpu(int cpu) > > return false; > > } > > > > -/* > > - * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock > > - * and this function releases it. > > - */ > > -static bool wake_nocb_gp(struct rcu_data *rdp, bool force, > > -unsigned long flags) > > - __releases(rdp->nocb_lock) > > +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, > > + struct rcu_data *rdp, > > + bool force, unsigned long flags) > > + __releases(rdp_gp->nocb_gp_lock) > > { > > bool needwake = false; > > - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; > > > > - lockdep_assert_held(&rdp->nocb_lock); > > if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { > > - rcu_nocb_unlock_irqrestore(rdp, flags); > > + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); > > trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, > > TPS("AlreadyAwake")); > > return false; > > } > > > > - if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) { > > - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); > > - del_timer(&rdp->nocb_timer); > > + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { > > So there are no longer any data races involving ->nocb_defer_wakeup? > > (Yes, I could fire up KCSAN, but my KCSAN-capable system is otherwise > occupied for several more hours.) To be more specific, there is no more unlocked write to the timer (queue/cancel) and its nocb_defer_wakeup matching state. And there is only one (on purpose) racy reader of ->nocb_defer_wakeup which is the non-timer deferred wakeup. So the writes to the timer keep their WRITE_ONCE() and only the reader in do_nocb_deferred_wakeup() keeps its READ_ONCE(). Other readers are protected by the ->nocb_gp_lock.
> > + > > // Advance callbacks if helpful and low contention. > > needwake_gp = false; > > if (!rcu_segcblist_restempty(&rdp->cblist, > > @@ -2178,11 +2182,18 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) > > my_rdp->nocb_gp_bypass = bypass; > > my_rdp->nocb_gp_gp = needwait_gp; > > my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; > > - if (bypass && !rcu_nocb_poll) { > > - // At least one child with non-empty ->nocb_bypass, so set > > - // timer in order to avoid stranding its callbacks. > > + if (bypass) { > > raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); > > - mod_timer(&my_rdp->nocb_bypass_timer, j + 2); > > + // Avoid race with first bypass CB. > > + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { > > + WRITE_ONCE(my_rdp->nocb_defer_wakeup, > > RCU_NOCB_WAKE_NOT); > > + del_timer(&my_rdp->nocb_timer); > > + } > > Given that the timer does not get queued if rcu_nocb_poll, why not move the > above "if" statement under the one following? It's done later in the set. > > > + if (!rcu_nocb_poll) { > > + // At least one child with non-empty ->nocb_bypass, so > > set > > + // timer in order to avoid stranding its callbacks. > > + mod_timer(&my_rdp->nocb_bypass_timer, j + 2); > > + } > > raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); > > } > > if (rcu_nocb_poll) { > > @@ -2385,7 +2399,10 @@ static void do_nocb_deferred_wakeup_timer(struct > > timer_list *t) > > */ > > static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) > > { > > - if (rcu_nocb_need_deferred_wakeup(rdp)) > > + if (!rdp->nocb_gp_rdp) > > + return false; > > This check was not necessary previously because each CPU used its own rdp, > correct? Exactly! > The theory is that this early return is taken only during boot, > and that the spawning of the kthreads will act as an implicit wakeup? You guessed right! That probably deserves a comment. Thanks!
Re: [PATCH 05/13] rcu/nocb: Use the rcuog CPU's ->nocb_timer
On Tue, Feb 23, 2021 at 01:10:03AM +0100, Frederic Weisbecker wrote: > Currently each offline rdp has its own nocb_timer armed when the > nocb_gp wakeup must be deferred. This layout has many drawbacks, > compared to a solution based on a single timer per rdp group: > > * There are a lot of timers to maintain. > > * The per-rdp ->nocb_lock must be held to queue and cancel the timer > and this lock can already be quite contended. > > * One timer firing doesn't cancel the other timers in the same group: > - These other timers can thus cause spurious wakeups > - Each rdp that queued a timer must lock both ->nocb_lock and then > ->nocb_gp_lock upon exit from the kernel to idle/user/guest mode. > > * We can't cancel all of them if we detect an unflushed bypass in > nocb_gp_wait(). In fact currently we only ever cancel the nocb_timer > of the leader group. > > * The leader group's nocb_timer is cancelled without locking ->nocb_lock > in nocb_gp_wait(). This currently appears to be safe but is an > accident waiting to happen. > > * Since the timer acquires ->nocb_lock, it requires extra care in the > NOCB (de-)offloading process, requiring that it be either enabled or > disabled and flushed. > > This commit uses the rcuog kthread's CPU's ->nocb_timer instead. > It is protected by nocb_gp_lock, which is _way_ less contended and > remains so even after this change. As a matter of fact, the nocb_timer > almost never fires and the deferred wakeup is mostly carried out upon > idle/user/guest entry. Now the early check performed at this point in > do_nocb_deferred_wakeup() is done on rdp_gp->nocb_defer_wakeup, which > is of course racy. However, this raciness is harmless because we only > need the guarantee that the timer is queued if we were the last one to > queue it. Any other situation (another CPU has queued it and we either > see it or not) is fine. > > This solves all the issues listed above.
> > Signed-off-by: Frederic Weisbecker > Cc: Josh Triplett > Cc: Lai Jiangshan > Cc: Joel Fernandes > Cc: Neeraj Upadhyay > Cc: Boqun Feng I pulled in the previous three (2-4/13) with the usual commit-log wordsmithing, thank you! And I could not resist wordsmithing above. I do very much like the general approach, but a few questions below. The first question is of course: Did you try this with lockdep enabled? ;-) > --- > kernel/rcu/tree.h | 1 - > kernel/rcu/tree_plugin.h | 142 +-- > 2 files changed, 78 insertions(+), 65 deletions(-) > > diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h > index 71821d59d95c..b280a843bd2c 100644 > --- a/kernel/rcu/tree.h > +++ b/kernel/rcu/tree.h > @@ -257,7 +257,6 @@ struct rcu_data { > }; > > /* Values for nocb_defer_wakeup field in struct rcu_data. */ > -#define RCU_NOCB_WAKE_OFF -1 > #define RCU_NOCB_WAKE_NOT 0 > #define RCU_NOCB_WAKE 1 > #define RCU_NOCB_WAKE_FORCE 2 > diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h > index 587df271d640..847636d3e93d 100644 > --- a/kernel/rcu/tree_plugin.h > +++ b/kernel/rcu/tree_plugin.h > @@ -33,10 +33,6 @@ static inline bool rcu_current_is_nocb_kthread(struct > rcu_data *rdp) > return false; > } > > -static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) > -{ > - return (timer_curr_running(&rdp->nocb_timer) && !in_irq()); > -} > #else > static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) > { > @@ -48,11 +44,6 @@ static inline bool rcu_current_is_nocb_kthread(struct > rcu_data *rdp) > return false; > } > > -static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) > -{ > - return false; > -} > - > #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ > > static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) > @@ -72,8 +63,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) > rcu_lockdep_is_held_nocb(rdp) || > (rdp == this_cpu_ptr(&rcu_data) && > !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || > - rcu_current_is_nocb_kthread(rdp) || > - 
rcu_running_nocb_timer(rdp)), > + rcu_current_is_nocb_kthread(rdp)), > "Unsafe read of RCU_NOCB offloaded state" > ); > > @@ -1702,43 +1692,50 @@ bool rcu_is_nocb_cpu(int cpu) > return false; > } > > -/* > - * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock > - * and this function releases it. > - */ > -static bool wake_nocb_gp(struct rcu_data *rdp, bool force, > - unsigned long flags) > - __releases(rdp->nocb_lock) > +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, > +struct rcu_data *rdp, > +bool force, unsigned long flags) > + __releases(rdp_gp->nocb_gp_lock) > { > bool needwake = false; > - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; > > - lockdep_assert_held(>nocb_lock); >