On Thu, Oct 15, 2020 at 12:39:54AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 14, 2020 at 03:11:52PM -0700, Paul E. McKenney wrote:
> > On Wed, Oct 14, 2020 at 11:53:19PM +0200, Peter Zijlstra wrote:
> > > On Wed, Oct 14, 2020 at 11:34:05AM -0700, Paul E. McKenney wrote:
> > > > commit 7deaa04b02298001426730ed0e6214ac20d1a1c1
> > > > Author: Paul E. McKenney <paul...@kernel.org>
> > > > Date:   Tue Oct 13 12:39:23 2020 -0700
> > > > 
> > > >     rcu: Prevent lockdep-RCU splats on lock acquisition/release
> > > >     
> > > >     The rcu_cpu_starting() and rcu_report_dead() functions transition the
> > > >     current CPU between online and offline state from an RCU perspective.
> > > >     Unfortunately, this means that the rcu_cpu_starting() function's lock
> > > >     acquisition and the rcu_report_dead() function's lock releases happen
> > > >     while the CPU is offline from an RCU perspective, which can result in
> > > >     lockdep-RCU splats about using RCU from an offline CPU.  In reality,
> > > >     aside from the splats, both transitions are safe because a new grace
> > > >     period cannot start until these functions release their locks.
> > > 
> > > But we call the trace_* crud before we acquire the lock. Are you sure
> > > that's a false-positive? 
> > 
> > You lost me on this one.
> > 
> > I am assuming that you are talking about rcu_cpu_starting(), because
> > that is the one where RCU is not initially watching, that is, the
> > case where tracing before the lock acquisition would be a problem.
> > You cannot be talking about rcu_cpu_starting() itself, because it does
> > not do any tracing before acquiring the lock.  But if you are talking
> > about the caller of rcu_cpu_starting(), then that caller should put the
> > rcu_cpu_starting() before the tracing.  But that would be the other
> > patch earlier in this thread that was proposing moving the call to
> > rcu_cpu_starting() much earlier in CPU bringup.
> > 
> > So what am I missing here?
> 
> rcu_cpu_starting();
>   raw_spin_lock_irqsave();
>     local_irq_save();
>     preempt_disable();
>     spin_acquire()
>       lock_acquire()
>         trace_lock_acquire() <--- *whoopsie-doodle*
>         /* uses RCU for tracing */
>     arch_spin_lock_flags() <--- the actual spinlock

Gah!  Idiot here left out the most important part, so good catch!!!
Much easier this way than finding out about it the hard way...

I should have asked myself harder questions earlier today about moving
the counter from the rcu_node structure to the rcu_data structure.
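
To make the intent explicit: with the counter in the rcu_node structure,
an odd ->ofl_seq value means "online/offline transition in flight on this
leaf", which covers exactly the window in your call chain above, where
trace_lock_acquire() runs before the rcu_node lock is actually taken.
Illustration only, paraphrasing the two halves of the patch below (not a
drop-in):

        /* Transition side, as in rcu_cpu_starting()/rcu_report_dead(): */
        WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); /* Now odd: transition begins. */
        smp_mb();                                   /* Pair with the sampling side. */
        /* ... acquire the rcu_node lock, update ->qsmaskinitnext, release ... */
        smp_mb();
        WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); /* Now even: transition done. */

        /* Sampling side, as in rcu_lockdep_current_cpu_online(): */
        seq = READ_ONCE(rnp->ofl_seq) & ~0x1;          /* Round down to even. */
        if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || /* CPU's bit is set, or ... */
            seq != READ_ONCE(rnp->ofl_seq))            /* ... ->ofl_seq is odd. */
                ret = true;                            /* Count this CPU as online. */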

Perhaps something like the following untested patch on top of the
earlier patch?

                                                        Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 286dc0a..8b5215e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1159,8 +1159,8 @@ bool rcu_lockdep_current_cpu_online(void)
        preempt_disable_notrace();
        rdp = this_cpu_ptr(&rcu_data);
        rnp = rdp->mynode;
-       seq = READ_ONCE(rdp->ofl_seq) & ~0x1;
-       if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || seq != READ_ONCE(rdp->ofl_seq))
+       seq = READ_ONCE(rnp->ofl_seq) & ~0x1;
+       if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || seq != READ_ONCE(rnp->ofl_seq))
                ret = true;
        preempt_enable_notrace();
        return ret;
@@ -1982,6 +1982,7 @@ static void rcu_gp_fqs_loop(void)
 static void rcu_gp_cleanup(void)
 {
        int cpu;
+       unsigned long firstseq;
        bool needgp = false;
        unsigned long gp_duration;
        unsigned long new_gp_seq;
@@ -2019,6 +2020,12 @@ static void rcu_gp_cleanup(void)
        new_gp_seq = rcu_state.gp_seq;
        rcu_seq_end(&new_gp_seq);
        rcu_for_each_node_breadth_first(rnp) {
+               smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
+               firstseq = READ_ONCE(rnp->ofl_seq);
+               if (firstseq & 0x1)
+                       while (firstseq == smp_load_acquire(&rnp->ofl_seq))
+                               schedule_timeout_idle(1);  // Can't wake unless RCU is watching.
+               smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
                raw_spin_lock_irq_rcu_node(rnp);
                if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
                        dump_blkd_tasks(rnp, 10);
@@ -4067,8 +4074,9 @@ void rcu_cpu_starting(unsigned int cpu)
 
        rnp = rdp->mynode;
        mask = rdp->grpmask;
-       WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-       WARN_ON_ONCE(!(rdp->ofl_seq & 0x1));
+       WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+       WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+       smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
        WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
        newcpu = !(rnp->expmaskinitnext & mask);
@@ -4088,8 +4096,9 @@ void rcu_cpu_starting(unsigned int cpu)
        } else {
                raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        }
-       WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-       WARN_ON_ONCE(rdp->ofl_seq & 0x1);
+       smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+       WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+       WARN_ON_ONCE(rnp->ofl_seq & 0x1);
        smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
 }
 
@@ -4117,8 +4126,9 @@ void rcu_report_dead(unsigned int cpu)
 
        /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
        mask = rdp->grpmask;
-       WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-       WARN_ON_ONCE(!(rdp->ofl_seq & 0x1));
+       WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+       WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+       smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
        raw_spin_lock(&rcu_state.ofl_lock);
        raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
        rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4131,8 +4141,9 @@ void rcu_report_dead(unsigned int cpu)
        WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        raw_spin_unlock(&rcu_state.ofl_lock);
-       WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-       WARN_ON_ONCE(rdp->ofl_seq & 0x1);
+       smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+       WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+       WARN_ON_ONCE(rnp->ofl_seq & 0x1);
 
        rdp->cpu_started = false;
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf0198d..7708ed1 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,6 +56,7 @@ struct rcu_node {
                                /*  Initialized from ->qsmaskinitnext at the */
                                /*  beginning of each grace period. */
        unsigned long qsmaskinitnext;
+       unsigned long ofl_seq;  /* CPU-hotplug operation sequence count. */
                                /* Online CPUs for next grace period. */
        unsigned long expmask;  /* CPUs or groups that need to check in */
                                /*  to allow the current expedited GP */
@@ -250,7 +251,6 @@ struct rcu_data {
        unsigned long rcu_onl_gp_seq;   /* ->gp_seq at last online. */
        short rcu_onl_gp_flags;         /* ->gp_flags at last online. */
        unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
-       unsigned long ofl_seq;          /* CPU-hotplug operation sequence count. */
 
        int cpu;
 };
