Re: [RFC PATCH 4/8] rcu/nocb: Trigger self-IPI on late deferred wake up before user resume

2021-01-11 Thread Frederic Weisbecker
On Mon, Jan 11, 2021 at 01:04:24PM +0100, Peter Zijlstra wrote:
> > +static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
> > +   IRQ_WORK_INIT(late_wakeup_func);
> > +
> >  /**
> >   * rcu_user_enter - inform RCU that we are resuming userspace.
> >   *
> > @@ -692,9 +704,17 @@ noinstr void rcu_user_enter(void)
> > struct rcu_data *rdp = this_cpu_ptr(_data);
> >  
> > lockdep_assert_irqs_disabled();
> > -   do_nocb_deferred_wakeup(rdp);
> > +   /*
> > +* We may be past the last rescheduling opportunity in the entry code.
> > +* Trigger a self IPI that will fire and reschedule once we resume to
> > +* user/guest mode.
> > +*/
> > +   if (do_nocb_deferred_wakeup(rdp) && need_resched())
> > +   irq_work_queue(this_cpu_ptr(_wakeup_work));
> > +
> > rcu_eqs_enter(true);
> >  }
> 
> Do we have the guarantee that every architecture that supports NOHZ_FULL
> has arch_irq_work_raise() on?

Yes it's a requirement for NOHZ_FULL to work. But you make me realize this
is tacit and isn't constrained anywhere in the code. I'm going to add
HAVE_IRQ_WORK_RAISE and replace the weak definition with a config
based.

> 
> Also, can't you do the same thing you did earlier and do that wakeup
> thing before we complete exit_to_user_mode_prepare() ?

I do it for CONFIG_GENERIC_ENTRY but the other architectures have their own
exit to user loop that I would need to audit and make sure that interrupts 
aren't
ever re-enabled before resuming to user and there is no possible rescheduling
point. I could manage to handle arm and arm64 but the others scare me:

$ git grep HAVE_CONTEXT_TRACKING
arch/csky/Kconfig:  select HAVE_CONTEXT_TRACKING
arch/mips/Kconfig:  select HAVE_CONTEXT_TRACKING
arch/powerpc/Kconfig:   select HAVE_CONTEXT_TRACKINGif PPC64
arch/riscv/Kconfig: select HAVE_CONTEXT_TRACKING
arch/sparc/Kconfig: select HAVE_CONTEXT_TRACKING

:-s


Re: [RFC PATCH 4/8] rcu/nocb: Trigger self-IPI on late deferred wake up before user resume

2021-01-11 Thread Peter Zijlstra
On Sat, Jan 09, 2021 at 03:05:32AM +0100, Frederic Weisbecker wrote:
> Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP
> kthread (rcuog) to be serviced.
> 
> Unfortunately the call to rcu_user_enter() is already past the last
> rescheduling opportunity before we resume to userspace or to guest mode.
> We may escape there with the woken task ignored.
> 
> The ultimate resort to fix every callsites is to trigger a self-IPI
> (nohz_full depends on IRQ_WORK) that will trigger a reschedule on IRQ
> tail or guest exit.
> 
> Eventually every site that want a saner treatment will need to carefully
> place a call to rcu_nocb_flush_deferred_wakeup() before the last explicit
> need_resched() check upon resume.
> 
> Reported-by: Paul E. McKenney 
> Fixes: 96d3fd0d315a (rcu: Break call_rcu() deadlock involving scheduler and 
> perf)
> Cc: sta...@vger.kernel.org
> Cc: Rafael J. Wysocki 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar
> Signed-off-by: Frederic Weisbecker 
> ---
>  kernel/rcu/tree.c| 22 +-
>  kernel/rcu/tree.h|  2 +-
>  kernel/rcu/tree_plugin.h | 25 -
>  3 files changed, 38 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index b6e134e3..2920dfc9f58c 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -676,6 +676,18 @@ void rcu_idle_enter(void)
>  EXPORT_SYMBOL_GPL(rcu_idle_enter);
>  
>  #ifdef CONFIG_NO_HZ_FULL
> +
> +/*
> + * An empty function that will trigger a reschedule on
> + * IRQ tail once IRQs get re-enabled on userspace resume.
> + */
> +static void late_wakeup_func(struct irq_work *work)
> +{
> +}
> +
> +static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
> + IRQ_WORK_INIT(late_wakeup_func);
> +
>  /**
>   * rcu_user_enter - inform RCU that we are resuming userspace.
>   *
> @@ -692,9 +704,17 @@ noinstr void rcu_user_enter(void)
>   struct rcu_data *rdp = this_cpu_ptr(_data);
>  
>   lockdep_assert_irqs_disabled();
> - do_nocb_deferred_wakeup(rdp);
> + /*
> +  * We may be past the last rescheduling opportunity in the entry code.
> +  * Trigger a self IPI that will fire and reschedule once we resume to
> +  * user/guest mode.
> +  */
> + if (do_nocb_deferred_wakeup(rdp) && need_resched())
> + irq_work_queue(this_cpu_ptr(_wakeup_work));
> +
>   rcu_eqs_enter(true);
>  }

Do we have the guarantee that every architecture that supports NOHZ_FULL
has arch_irq_work_raise() on?

Also, can't you do the same thing you did earlier and do that wakeup
thing before we complete exit_to_user_mode_prepare() ?


[RFC PATCH 4/8] rcu/nocb: Trigger self-IPI on late deferred wake up before user resume

2021-01-08 Thread Frederic Weisbecker
Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP
kthread (rcuog) to be serviced.

Unfortunately the call to rcu_user_enter() is already past the last
rescheduling opportunity before we resume to userspace or to guest mode.
We may escape there with the woken task ignored.

The ultimate resort to fix every callsites is to trigger a self-IPI
(nohz_full depends on IRQ_WORK) that will trigger a reschedule on IRQ
tail or guest exit.

Eventually every site that want a saner treatment will need to carefully
place a call to rcu_nocb_flush_deferred_wakeup() before the last explicit
need_resched() check upon resume.

Reported-by: Paul E. McKenney 
Fixes: 96d3fd0d315a (rcu: Break call_rcu() deadlock involving scheduler and 
perf)
Cc: sta...@vger.kernel.org
Cc: Rafael J. Wysocki 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Ingo Molnar
Signed-off-by: Frederic Weisbecker 
---
 kernel/rcu/tree.c| 22 +-
 kernel/rcu/tree.h|  2 +-
 kernel/rcu/tree_plugin.h | 25 -
 3 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b6e134e3..2920dfc9f58c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -676,6 +676,18 @@ void rcu_idle_enter(void)
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
 #ifdef CONFIG_NO_HZ_FULL
+
+/*
+ * An empty function that will trigger a reschedule on
+ * IRQ tail once IRQs get re-enabled on userspace resume.
+ */
+static void late_wakeup_func(struct irq_work *work)
+{
+}
+
+static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+   IRQ_WORK_INIT(late_wakeup_func);
+
 /**
  * rcu_user_enter - inform RCU that we are resuming userspace.
  *
@@ -692,9 +704,17 @@ noinstr void rcu_user_enter(void)
struct rcu_data *rdp = this_cpu_ptr(_data);
 
lockdep_assert_irqs_disabled();
-   do_nocb_deferred_wakeup(rdp);
+   /*
+* We may be past the last rescheduling opportunity in the entry code.
+* Trigger a self IPI that will fire and reschedule once we resume to
+* user/guest mode.
+*/
+   if (do_nocb_deferred_wakeup(rdp) && need_resched())
+   irq_work_queue(this_cpu_ptr(_wakeup_work));
+
rcu_eqs_enter(true);
 }
+
 #endif /* CONFIG_NO_HZ_FULL */
 
 /**
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7708ed161f4a..9226f4021a36 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, 
struct rcu_head *rhp,
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
 unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_cpu_nocb_kthread(int cpu);
 static void __init rcu_spawn_nocb_kthreads(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d5b38c28abd1..384856e4d13e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1631,8 +1631,8 @@ bool rcu_is_nocb_cpu(int cpu)
  * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
  * and this function releases it.
  */
-static void wake_nocb_gp(struct rcu_data *rdp, bool force,
-  unsigned long flags)
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
+unsigned long flags)
__releases(rdp->nocb_lock)
 {
bool needwake = false;
@@ -1643,7 +1643,7 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("AlreadyAwake"));
rcu_nocb_unlock_irqrestore(rdp, flags);
-   return;
+   return false;
}
del_timer(>nocb_timer);
rcu_nocb_unlock_irqrestore(rdp, flags);
@@ -1656,6 +1656,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force,
raw_spin_unlock_irqrestore(_gp->nocb_gp_lock, flags);
if (needwake)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
+   return needwake;
 }
 
 /*
@@ -2152,20 +2154,23 @@ static int rcu_nocb_need_deferred_wakeup(struct 
rcu_data *rdp)
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
 {
unsigned long flags;
int ndw;
+   int ret;
 
rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
-   return;
+   return false;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
-   wake_nocb_gp(rdp, ndw ==