On Fri, Jun 28, 2019 at 04:49:13PM -0400, Rik van Riel wrote:
> Flatten the hierarchical runqueues into just the per CPU rq.cfs runqueue.
> 
> Iteration of the sched_entity hierarchy is rate limited to once per jiffy
> per sched_entity, which is a smaller change than it seems, because load
> average adjustments were already rate limited to once per jiffy before this
> patch series.
> 
> This patch breaks CONFIG_CFS_BANDWIDTH. The plan for that is to park tasks
> from throttled cgroups onto their cgroup runqueues, and slowly (using the
> GENTLE_FAIR_SLEEPERS) wake them back up, in vruntime order, once the cgroup
> gets unthrottled, to prevent thundering herd issues.
> 
> Signed-off-by: Rik van Riel <r...@surriel.com>
> ---
>  include/linux/sched.h |   2 +
>  kernel/sched/fair.c   | 452 +++++++++++++++---------------------------
>  kernel/sched/pelt.c   |   6 +-
>  kernel/sched/pelt.h   |   2 +-
>  kernel/sched/sched.h  |   2 +-
>  5 files changed, 171 insertions(+), 293 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 84a6cc6f5c47..901c710363e7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -453,6 +453,8 @@ struct sched_entity {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	int				depth;
>  	unsigned long			enqueued_h_load;
> +	unsigned long			enqueued_h_weight;
> +	struct load_weight		h_load;
>  	struct sched_entity		*parent;
>  	/* rq on which this entity is (to be) queued: */
>  	struct cfs_rq			*cfs_rq;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6fea8849cc12..c31d3da081fb 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -450,6 +450,19 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
>  	}
>  }
>  
> +/* Add the cgroup cfs_rqs to the list, for update_blocked_averages */
> +static void enqueue_entity_cfs_rqs(struct sched_entity *se)
> +{
> +	SCHED_WARN_ON(!entity_is_task(se));
> +
> +	for_each_sched_entity(se) {
> +		struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
> +
> +		if (list_add_leaf_cfs_rq(cfs_rq))
> +			break;
> +	}
> +}
> +
>  #else /* !CONFIG_FAIR_GROUP_SCHED */
>  
>  static inline struct task_struct *task_of(struct sched_entity *se)
> @@ -672,8 +685,14 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
>   */
>  static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
>  {
> -	if (unlikely(se->load.weight != NICE_0_LOAD))
> +	if (task_se_in_cgroup(se)) {
> +		unsigned long h_weight = task_se_h_weight(se);
> +		if (h_weight != se->h_load.weight)
> +			update_load_set(&se->h_load, h_weight);
> +		delta = __calc_delta(delta, NICE_0_LOAD, &se->h_load);
> +	} else if (unlikely(se->load.weight != NICE_0_LOAD)) {
>  		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
> +	}
>  
>  	return delta;
>  }
> @@ -687,22 +706,16 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
>  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  	u64 slice = sysctl_sched_latency;
> +	struct load_weight *load = &cfs_rq->load;
> +	struct load_weight lw;
>  
> -	for_each_sched_entity(se) {
> -		struct load_weight *load;
> -		struct load_weight lw;
> -
> -		cfs_rq = cfs_rq_of(se);
> -		load = &cfs_rq->load;
> +	if (unlikely(!se->on_rq)) {
> +		lw = cfs_rq->load;
>  
> -		if (unlikely(!se->on_rq)) {
> -			lw = cfs_rq->load;
> -
> -			update_load_add(&lw, se->load.weight);
> -			load = &lw;
> -		}
> -		slice = __calc_delta(slice, se->load.weight, load);
> +		update_load_add(&lw, task_se_h_weight(se));
> +		load = &lw;
>  	}
> +	slice = __calc_delta(slice, task_se_h_weight(se), load);
>  
>  	/*
>  	 * To avoid cache thrashing, run at least sysctl_sched_min_granularity.
> @@ -2703,16 +2716,28 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
>  static void
>  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -	update_load_add(&cfs_rq->load, se->load.weight);
> -	if (!parent_entity(se))
> +	struct rq *rq;
> +
> +	if (task_se_in_cgroup(se)) {
> +		struct cfs_rq *cgroup_rq = group_cfs_rq_of_parent(se);
> +		unsigned long h_weight;
> +
> +		update_load_add(&cgroup_rq->load, se->load.weight);
> +		cgroup_rq->nr_running++;
> +
> +		/* Add the hierarchical weight to the CPU rq */
> +		h_weight = task_se_h_weight(se);
> +		se->enqueued_h_weight = h_weight;
> +		update_load_add(&rq_of(cfs_rq)->load, h_weight);
Ok I think this is where we need to handle the newly enqueued se's potential
weight.  Right now task_se_h_weight() is based on its weight _and_ its load,
so it's really a task_se_h_weighted_load_avg, and not really a
task_se_h_weight, right?

Instead we should separate these two things out: one helper that gives us our
hierarchical load, and one that gives us our hierarchical weight based purely
on the ratio of (our weight) / (total se weight on parent).  Then we can take
that number, max it with the hierarchical load avg, and use the result with
update_load_add().  Does that make sense?
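Completely untested, but roughly what I have in mind is something like the
sketch below.  The helper names are just placeholders to illustrate the
split, they're not from your series:

/*
 * Pure hierarchical weight: the task's weight scaled by its share of
 * each parent group's queued weight, with no load average mixed in.
 */
static unsigned long task_se_pure_h_weight(struct sched_entity *se)
{
	unsigned long h_weight = se->load.weight;

	for_each_sched_entity(se) {
		struct sched_entity *parent = parent_entity(se);
		struct cfs_rq *gcfs_rq = group_cfs_rq_of_parent(se);

		if (!parent || !gcfs_rq->load.weight)
			break;

		/*
		 * Our fraction of the parent group's queued weight,
		 * applied to the parent group se's own weight.
		 */
		h_weight = div64_ul((u64)h_weight * parent->load.weight,
				    gcfs_rq->load.weight);
	}

	return h_weight;
}

and then here in account_entity_enqueue() something like

	h_weight = max(task_se_pure_h_weight(se), task_se_h_load(se));
	se->enqueued_h_weight = h_weight;
	update_load_add(&rq_of(cfs_rq)->load, h_weight);

so a freshly enqueued se still gets its hierarchical weight reflected on the
CPU rq even when its load avg hasn't built up yet.

Thanks,

Josef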