Hi Paul,

On Mon, Aug 10, 2020 at 09:19:45AM -0700, Paul E. McKenney wrote:
> On Fri, Aug 07, 2020 at 01:07:21PM -0400, Joel Fernandes (Google) wrote:
> > RCU has had deadlocks in the past related to synchronizing in a hotplug
> > notifier. Typically, this has occurred because timer callbacks did not get
> > migrated before the CPU hotplug notifier requesting RCU's services is
> > called. If RCU's grace period processing has a timer callback queued in
> > the meanwhile, it may never get called causing RCU stalls.
> > 
> > These issues have been fixed by removing such dependencies from grace
> > period processing, however there are no testing scenarios for such
> > cases.
> > 
> > This commit therefore reuses rcutorture's existing hotplug notifier to
> > invoke the flavor-specific synchronize callback. If anything locks up,
> > we expect stall warnings and/or other splats.
> > 
> > Obviously, we need not test for rcu_barrier from a notifier, since those
> > are not allowed from notifiers. This fact is already detailed in the
> > documentation as well.
> > 
> > Signed-off-by: Joel Fernandes (Google) <j...@joelfernandes.org>
> 
> Given that rcutorture_booster_init() is invoked on the CPU in question
> only after it is up and running, and that (if I remember correctly)
> rcutorture_booster_cleanup() is invoked on the outgoing CPU before it
> has really started going away, would this code really have caught that
> timer/CPU-hotplug/RCU bug?

You are right, it would not have caught that particular one, because the timer
callbacks would already have been migrated by the time
rcutorture_booster_init() is called.

I still thought it was a good idea anyway to test that the dynamic hotplug
notifiers do not have these issues.

Did you have a better idea for how to test the timer/hotplug/RCU bug?

thanks,

 - Joel



> >  kernel/rcu/rcutorture.c | 81 +++++++++++++++++++++--------------------
> >  1 file changed, 42 insertions(+), 39 deletions(-)
> > 
> > diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> > index 92cb79620939..083b65e4877d 100644
> > --- a/kernel/rcu/rcutorture.c
> > +++ b/kernel/rcu/rcutorture.c
> > @@ -1645,12 +1645,37 @@ rcu_torture_print_module_parms(struct 
> > rcu_torture_ops *cur_ops, const char *tag)
> >              read_exit_delay, read_exit_burst);
> >  }
> >  
> > -static int rcutorture_booster_cleanup(unsigned int cpu)
> > +static bool rcu_torture_can_boost(void)
> > +{
> > +   static int boost_warn_once;
> > +   int prio;
> > +
> > +   if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
> > +           return false;
> > +
> > +   prio = rcu_get_gp_kthreads_prio();
> > +   if (!prio)
> > +           return false;
> > +
> > +   if (prio < 2) {
> > +           if (boost_warn_once  == 1)
> > +                   return false;
> > +
> > +           pr_alert("%s: WARN: RCU kthread priority too low to test 
> > boosting.  Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on 
> > the kernel command line.\n", KBUILD_MODNAME);
> > +           boost_warn_once = 1;
> > +           return false;
> > +   }
> > +
> > +   return true;
> > +}
> > +
> > +static int rcutorture_hp_cleanup(unsigned int cpu)
> >  {
> >     struct task_struct *t;
> >  
> > -   if (boost_tasks[cpu] == NULL)
> > +   if (!rcu_torture_can_boost() || boost_tasks[cpu] == NULL)
> >             return 0;
> > +
> >     mutex_lock(&boost_mutex);
> >     t = boost_tasks[cpu];
> >     boost_tasks[cpu] = NULL;
> > @@ -1662,11 +1687,14 @@ static int rcutorture_booster_cleanup(unsigned int 
> > cpu)
> >     return 0;
> >  }
> >  
> > -static int rcutorture_booster_init(unsigned int cpu)
> > +static int rcutorture_hp_init(unsigned int cpu)
> >  {
> >     int retval;
> >  
> > -   if (boost_tasks[cpu] != NULL)
> > +   /* Force synchronizing from hotplug notifier to ensure it is safe. */
> > +   cur_ops->sync();
> > +
> > +   if (!rcu_torture_can_boost() || boost_tasks[cpu] != NULL)
> >             return 0;  /* Already created, nothing more to do. */
> >  
> >     /* Don't allow time recalculation while creating a new task. */
> > @@ -2336,30 +2364,6 @@ static void rcu_torture_barrier_cleanup(void)
> >     }
> >  }
> >  
> > -static bool rcu_torture_can_boost(void)
> > -{
> > -   static int boost_warn_once;
> > -   int prio;
> > -
> > -   if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
> > -           return false;
> > -
> > -   prio = rcu_get_gp_kthreads_prio();
> > -   if (!prio)
> > -           return false;
> > -
> > -   if (prio < 2) {
> > -           if (boost_warn_once  == 1)
> > -                   return false;
> > -
> > -           pr_alert("%s: WARN: RCU kthread priority too low to test 
> > boosting.  Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on 
> > the kernel command line.\n", KBUILD_MODNAME);
> > -           boost_warn_once = 1;
> > -           return false;
> > -   }
> > -
> > -   return true;
> > -}
> > -
> >  static bool read_exit_child_stop;
> >  static bool read_exit_child_stopped;
> >  static wait_queue_head_t read_exit_wq;
> > @@ -2503,8 +2507,7 @@ rcu_torture_cleanup(void)
> >              rcutorture_seq_diff(gp_seq, start_gp_seq));
> >     torture_stop_kthread(rcu_torture_stats, stats_task);
> >     torture_stop_kthread(rcu_torture_fqs, fqs_task);
> > -   if (rcu_torture_can_boost())
> > -           cpuhp_remove_state(rcutor_hp);
> > +   cpuhp_remove_state(rcutor_hp);
> >  
> >     /*
> >      * Wait for all RCU callbacks to fire, then do torture-type-specific
> > @@ -2773,21 +2776,21 @@ rcu_torture_init(void)
> >             if (firsterr)
> >                     goto unwind;
> >     }
> > +
> > +   firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
> > +                   rcutorture_hp_init,
> > +                   rcutorture_hp_cleanup);
> > +   if (firsterr < 0)
> > +           goto unwind;
> > +   rcutor_hp = firsterr;
> > +
> >     if (test_boost_interval < 1)
> >             test_boost_interval = 1;
> >     if (test_boost_duration < 2)
> >             test_boost_duration = 2;
> > -   if (rcu_torture_can_boost()) {
> > -
> > +   if (rcu_torture_can_boost())
> >             boost_starttime = jiffies + test_boost_interval * HZ;
> >  
> > -           firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
> > -                                        rcutorture_booster_init,
> > -                                        rcutorture_booster_cleanup);
> > -           if (firsterr < 0)
> > -                   goto unwind;
> > -           rcutor_hp = firsterr;
> > -   }
> >     shutdown_jiffies = jiffies + shutdown_secs * HZ;
> >     firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
> >     if (firsterr)
> > -- 
> > 2.28.0.236.gb10cc79966-goog
> > 

Reply via email to