On Thu, Aug 30, 2012 at 11:18:16AM -0700, Paul E. McKenney wrote:
> From: "Paul E. McKenney" <paul...@linux.vnet.ibm.com>
> 
> As the first step towards allowing grace-period initialization to be
> preemptible, this commit moves the RCU grace-period initialization
> into its own kthread.  This is needed to keep large-system scheduling
> latency at reasonable levels.
> 
> Reported-by: Mike Galbraith <mgalbra...@suse.de>
> Reported-by: Dimitri Sivanich <sivan...@sgi.com>
> Signed-off-by: Paul E. McKenney <paul...@linux.vnet.ibm.com>

Reviewed-by: Josh Triplett <j...@joshtriplett.org>
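
For anyone following along: the core of the change is a standard
kthread/waitqueue handoff.  Stripped of the RCU specifics, it boils
down to something like the following (a minimal sketch with made-up
names, not the actual rcutree code):

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
	static int my_flags;	/* command for the worker */

	static int my_kthread(void *unused)
	{
		for (;;) {
			/* Sleep until a requester sets my_flags. */
			for (;;) {
				wait_event_interruptible(my_wq, my_flags);
				if (my_flags)
					break;
				/* Spurious wakeup, e.g. pending signal. */
				flush_signals(current);
			}
			my_flags = 0;
			/* ... handle one command ... */
		}
		return 0;
	}

	/* Requester side: post the command, then wake the worker. */
	static void kick_my_kthread(void)
	{
		my_flags = 1;
		wake_up(&my_wq);
	}

The flush_signals() call is what keeps a pending signal from turning
the interruptible wait into a busy loop: wait_event_interruptible()
returns early when a signal is pending, so the signal has to be
flushed before going back to sleep.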

>  kernel/rcutree.c |  191 ++++++++++++++++++++++++++++++++++++------------------
>  kernel/rcutree.h |    3 +
>  2 files changed, 130 insertions(+), 64 deletions(-)
> 
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index f280e54..e1c5868 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -1040,6 +1040,103 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
>  }
>  
>  /*
> + * Body of kthread that handles grace periods.
> + */
> +static int rcu_gp_kthread(void *arg)
> +{
> +     unsigned long flags;
> +     struct rcu_data *rdp;
> +     struct rcu_node *rnp;
> +     struct rcu_state *rsp = arg;
> +
> +     for (;;) {
> +
> +             /* Handle grace-period start. */
> +             rnp = rcu_get_root(rsp);
> +             for (;;) {
> +                     wait_event_interruptible(rsp->gp_wq, rsp->gp_flags);
> +                     if (rsp->gp_flags)
> +                             break;
> +                     flush_signals(current);
> +             }
> +             raw_spin_lock_irqsave(&rnp->lock, flags);
> +             rsp->gp_flags = 0;
> +             rdp = this_cpu_ptr(rsp->rda);
> +
> +             if (rcu_gp_in_progress(rsp)) {
> +                     /*
> +                      * A grace period is already in progress, so
> +                      * don't start another one.
> +                      */
> +                     raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +                     continue;
> +             }
> +
> +             if (rsp->fqs_active) {
> +                     /*
> +                      * We need a grace period, but force_quiescent_state()
> +                      * is running.  Tell it to start one on our behalf.
> +                      */
> +                     rsp->fqs_need_gp = 1;
> +                     raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +                     continue;
> +             }
> +
> +             /* Advance to a new grace period and initialize state. */
> +             rsp->gpnum++;
> +             trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
> +             WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
> +             rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */
> +             rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
> +             record_gp_stall_check_time(rsp);
> +             raw_spin_unlock(&rnp->lock);  /* leave irqs disabled. */
> +
> +             /* Exclude any concurrent CPU-hotplug operations. */
> +             raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
> +
> +             /*
> +              * Set the quiescent-state-needed bits in all the rcu_node
> +              * structures for all currently online CPUs in breadth-first
> +              * order, starting from the root rcu_node structure.
> +              * This operation relies on the layout of the hierarchy
> +              * within the rsp->node[] array.  Note that other CPUs will
> +              * access only the leaves of the hierarchy, which still
> +              * indicate that no grace period is in progress, at least
> +              * until the corresponding leaf node has been initialized.
> +              * In addition, we have excluded CPU-hotplug operations.
> +              *
> +              * Note that the grace period cannot complete until
> +              * we finish the initialization process, as there will
> +              * be at least one qsmask bit set in the root node until
> +              * that time, namely the one corresponding to this CPU,
> +              * due to the fact that we have irqs disabled.
> +              */
> +             rcu_for_each_node_breadth_first(rsp, rnp) {
> +                     raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> +                     rcu_preempt_check_blocked_tasks(rnp);
> +                     rnp->qsmask = rnp->qsmaskinit;
> +                     rnp->gpnum = rsp->gpnum;
> +                     rnp->completed = rsp->completed;
> +                     if (rnp == rdp->mynode)
> +                             rcu_start_gp_per_cpu(rsp, rnp, rdp);
> +                     rcu_preempt_boost_start_gp(rnp);
> +                     trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
> +                                                 rnp->level, rnp->grplo,
> +                                                 rnp->grphi, rnp->qsmask);
> +                     raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> +             }
> +
> +             rnp = rcu_get_root(rsp);
> +             raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> +             /* force_quiescent_state() now OK. */
> +             rsp->fqs_state = RCU_SIGNAL_INIT;
> +             raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> +             raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
> +     }
> +     return 0;
> +}
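
The big comment above about relying on the layout of the rsp->node[]
array deserves a second read: the tree is stored level by level with
the root at node[0], so a plain linear walk of the array is
automatically a breadth-first traversal that visits every parent
before any of its children.  Conceptually (a sketch of the idea with
illustrative names, not the exact macro from rcutree.h):

	struct rcu_node *rnp;

	/* node[0] is the root; each level follows its parent's level. */
	for (rnp = &rsp->node[0]; rnp < &rsp->node[num_nodes]; rnp++)
		init_one_node(rnp);	/* num_nodes, init_one_node: illustrative */

That ordering is what lets other CPUs keep reading the leaves
concurrently: a leaf still shows no grace period in progress until the
walk reaches it, and the root's qsmask bit for this CPU keeps the new
grace period from completing before initialization finishes.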
> +
> +/*
>   * Start a new RCU grace period if warranted, re-initializing the hierarchy
>   * in preparation for detecting the next grace period.  The caller must hold
>   * the root node's ->lock, which is released before return.  Hard irqs must
> @@ -1056,77 +1153,20 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
>       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
>       struct rcu_node *rnp = rcu_get_root(rsp);
>  
> -     if (!rcu_scheduler_fully_active ||
> +     if (!rsp->gp_kthread ||
>           !cpu_needs_another_gp(rsp, rdp)) {
>               /*
> -              * Either the scheduler hasn't yet spawned the first
> -              * non-idle task or this CPU does not need another
> -              * grace period.  Either way, don't start a new grace
> -              * period.
> -              */
> -             raw_spin_unlock_irqrestore(&rnp->lock, flags);
> -             return;
> -     }
> -
> -     if (rsp->fqs_active) {
> -             /*
> -              * This CPU needs a grace period, but force_quiescent_state()
> -              * is running.  Tell it to start one on this CPU's behalf.
> +              * Either we have not yet spawned the grace-period
> +              * task or this CPU does not need another grace period.
> +              * Either way, don't start a new grace period.
>                */
> -             rsp->fqs_need_gp = 1;
>               raw_spin_unlock_irqrestore(&rnp->lock, flags);
>               return;
>       }
>  
> -     /* Advance to a new grace period and initialize state. */
> -     rsp->gpnum++;
> -     trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
> -     WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
> -     rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
> -     rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
> -     record_gp_stall_check_time(rsp);
> -     raw_spin_unlock(&rnp->lock);  /* leave irqs disabled. */
> -
> -     /* Exclude any concurrent CPU-hotplug operations. */
> -     raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
> -
> -     /*
> -      * Set the quiescent-state-needed bits in all the rcu_node
> -      * structures for all currently online CPUs in breadth-first
> -      * order, starting from the root rcu_node structure.  This
> -      * operation relies on the layout of the hierarchy within the
> -      * rsp->node[] array.  Note that other CPUs will access only
> -      * the leaves of the hierarchy, which still indicate that no
> -      * grace period is in progress, at least until the corresponding
> -      * leaf node has been initialized.  In addition, we have excluded
> -      * CPU-hotplug operations.
> -      *
> -      * Note that the grace period cannot complete until we finish
> -      * the initialization process, as there will be at least one
> -      * qsmask bit set in the root node until that time, namely the
> -      * one corresponding to this CPU, due to the fact that we have
> -      * irqs disabled.
> -      */
> -     rcu_for_each_node_breadth_first(rsp, rnp) {
> -             raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
> -             rcu_preempt_check_blocked_tasks(rnp);
> -             rnp->qsmask = rnp->qsmaskinit;
> -             rnp->gpnum = rsp->gpnum;
> -             rnp->completed = rsp->completed;
> -             if (rnp == rdp->mynode)
> -                     rcu_start_gp_per_cpu(rsp, rnp, rdp);
> -             rcu_preempt_boost_start_gp(rnp);
> -             trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
> -                                         rnp->level, rnp->grplo,
> -                                         rnp->grphi, rnp->qsmask);
> -             raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
> -     }
> -
> -     rnp = rcu_get_root(rsp);
> -     raw_spin_lock(&rnp->lock);              /* irqs already disabled. */
> -     rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
> -     raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
> -     raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
> +     rsp->gp_flags = 1;
> +     raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +     wake_up(&rsp->gp_wq);
>  }
>  
>  /*
> @@ -2627,6 +2667,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
>  }
>  
>  /*
> + * Spawn the kthread that handles this RCU flavor's grace periods.
> + */
> +static int __init rcu_spawn_gp_kthread(void)
> +{
> +     unsigned long flags;
> +     struct rcu_node *rnp;
> +     struct rcu_state *rsp;
> +     struct task_struct *t;
> +
> +     for_each_rcu_flavor(rsp) {
> +             t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
> +             BUG_ON(IS_ERR(t));
> +             rnp = rcu_get_root(rsp);
> +             raw_spin_lock_irqsave(&rnp->lock, flags);
> +             rsp->gp_kthread = t;
> +             raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +     }
> +     return 0;
> +}
> +early_initcall(rcu_spawn_gp_kthread);
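
A note on the spawning idiom: kthread_run() is kthread_create() plus
wake_up_process(), and on failure it returns an ERR_PTR() rather than
NULL, hence the IS_ERR() test.  BUG_ON() is defensible here since the
kernel cannot run without its grace-period kthreads, but for
reference, ordinary code would propagate the error (illustrative
sketch, made-up names):

	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/sched.h>

	static struct task_struct *worker;

	static int worker_fn(void *unused)
	{
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	static int __init spawn_worker(void)
	{
		worker = kthread_run(worker_fn, NULL, "my_worker");
		if (IS_ERR(worker))
			return PTR_ERR(worker);	/* rather than BUG_ON() */
		return 0;
	}
	early_initcall(spawn_worker);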
> +
> +/*
>   * This function is invoked towards the end of the scheduler's initialization
>   * process.  Before this is called, the idle task might contain
>   * RCU read-side critical sections (during which time, this idle
> @@ -2727,6 +2789,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
>       }
>  
>       rsp->rda = rda;
> +     init_waitqueue_head(&rsp->gp_wq);
>       rnp = rsp->level[rcu_num_lvls - 1];
>       for_each_possible_cpu(i) {
>               while (i > rnp->grphi)
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 4d29169..117a150 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -385,6 +385,9 @@ struct rcu_state {
>       u8      boost;                          /* Subject to priority boost. */
>       unsigned long gpnum;                    /* Current gp number. */
>       unsigned long completed;                /* # of last completed gp. */
> +     struct task_struct *gp_kthread;         /* Task for grace periods. */
> +     wait_queue_head_t gp_wq;                /* Where GP task waits. */
> +     int gp_flags;                           /* Commands for GP task. */
>  
>       /* End of fields guarded by root rcu_node's lock. */
>  
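
One last observation on the new fields: gp_flags sits in the section
guarded by the root rcu_node's ->lock, yet the kthread's
wait_event_interruptible() condition reads it locklessly.  That looks
fine, because the kthread re-acquires the root lock and clears the
flag before acting on anything, so a racy read can at worst cause one
extra harmless pass through the loop.  The general shape of that
peek-then-recheck pattern (sketch, generic names):

	/* Lockless peek just to decide whether to wake up. */
	wait_event_interruptible(wq, flags);

	/* Consume the command under the lock that guards it. */
	spin_lock_irqsave(&lock, f);
	cmd = flags;
	flags = 0;
	spin_unlock_irqrestore(&lock, f);
	if (cmd)
		do_work(cmd);	/* cmd, do_work: illustrative */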
> -- 
> 1.7.8
> 