On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:
> @@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
>  void unregister_fair_sched_group(struct task_group *tg)
>  {
>       unsigned long flags;
> -     struct rq *rq;
>       int cpu;
>  
>       for_each_possible_cpu(cpu) {
> +             struct rq *rq = cpu_rq(cpu);
> +             struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
> +
>               if (tg->se[cpu])
>                       remove_entity_load_avg(tg->se[cpu]);
>  
> -             /*
> -              * Only empty task groups can be destroyed; so we can 
> speculatively
> -              * check on_list without danger of it being re-added.
> -              */
> -             if (!tg->cfs_rq[cpu]->on_list)
> -                     continue;
> -
> -             rq = cpu_rq(cpu);
> -
>               raw_spin_lock_irqsave(&rq->lock, flags);
> -             list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
> +             list_del_leaf_cfs_rq(cfs_rq);
> +             cfs_rq->online = 0;
>               raw_spin_unlock_irqrestore(&rq->lock, flags);
>       }
>  }

It would be nice to be able to retain that on_list check and avoid
taking all those rq->lock's.

Since per the previous mail, those online/offline loops are protected by
rq->lock, all we need is to ensure an rcu_sched GP passes between here
and sched_free_group().

Which I think we can do differently, see below. Then we don't need the
->online thing and can keep using the ->on_list check.


> @@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
>  #ifdef CONFIG_SCHED_DEBUG
>  void print_cfs_stats(struct seq_file *m, int cpu)
>  {
> -     struct cfs_rq *cfs_rq;
> +     struct task_group *tg;
>  
>       rcu_read_lock();
> -     for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
> -             print_cfs_rq(m, cpu, cfs_rq);
> +     list_for_each_entry_rcu(tg, &task_groups, list)
> +             print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
>       rcu_read_unlock();
>  }

If you have a gazillion empty groups, do we really want to print them?



diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 74d9c3a1feee..8bad2c371b18 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -58,6 +58,11 @@ static inline void cond_synchronize_sched(unsigned long 
oldstate)
        might_sleep();
 }
 
+static inline bool should_synchronize_sched(unsigned long oldstate)
+{
+       return true;
+}
+
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 0bacb6b2af69..9918b632db27 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -78,6 +78,7 @@ unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 unsigned long get_state_synchronize_sched(void);
 void cond_synchronize_sched(unsigned long oldstate);
+bool should_synchronize_sched(unsigned long oldstate);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91fff49d5869..59eb0d115217 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3423,6 +3423,18 @@ void cond_synchronize_sched(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_sched);
 
+bool should_synchronize_sched(unsigned long oldstate)
+{
+       unsigned long newstate;
+
+       /*
+        * Ensure that this load happens before any RCU-destructive
+        * actions the caller might carry out after we return.
+        */
+       newstate = smp_load_acquire(&rcu_sched_state.completed);
+       return ULONG_CMP_GE(oldstate, newstate);
+}
+
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ecad8e2d2a29..d7630b34d197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6362,7 +6362,7 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void sched_free_group(struct task_group *tg)
+static void __sched_free_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
@@ -6370,6 +6370,22 @@ static void sched_free_group(struct task_group *tg)
        kmem_cache_free(task_group_cache, tg);
 }
 
+static void sched_free_group_rcu_sched(struct rcu_head *rhp)
+{
+       /* Now it should be safe to free those cfs_rqs: */
+       __sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+static void sched_free_group(struct task_group *tg)
+{
+       if (should_synchronize_sched(tg->rcu_stamp)) {
+               call_rcu_sched(&tg->rcu, sched_free_group_rcu_sched);
+               return;
+       }
+
+       __sched_free_group(tg);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -6434,6 +6450,8 @@ void sched_offline_group(struct task_group *tg)
        list_del_rcu(&tg->list);
        list_del_rcu(&tg->siblings);
        spin_unlock_irqrestore(&task_group_lock, flags);
+
+       tg->rcu_stamp = get_state_synchronize_sched();
 }
 
 static void sched_change_group(struct task_struct *tsk, int type)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2e5e9e64c86..1dbab563e798 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -302,6 +302,7 @@ struct task_group {
 #endif
 
        struct rcu_head rcu;
+       unsigned long rcu_stamp;
        struct list_head list;
 
        struct task_group *parent;

Reply via email to