When the user changes sched_rt_{runtime, period}_us, the new values are
validated via

	sched_rt_handler()
	--> sched_dl_global_validate()
	{
		new_bw = global_rt_runtime()/global_rt_period();

		for_each_possible_cpu(cpu) {
			dl_b = dl_bw_of(cpu);
			if (new_bw < dl_b->total_bw)
				ret = -EBUSY;
		}
	}

Under CONFIG_SMP, dl_bw is per root domain, not per CPU, so
dl_b->total_bw is the allocated bandwidth of the whole root domain.
The check above therefore compares a per-CPU value (new_bw) against a
per-root-domain total. Instead, dl_b->total_bw should be compared
against cpus * new_bw, where 'cpus' is the number of CPUs of the root
domain.

Also, the annotation below (in kernel/sched/sched.h) describes an
implementation that only existed in SCHED_DEADLINE v2 [1]. The deadline
scheduler kept evolving until it was merged (v9), but the annotation
was never updated, so it is now meaningless and misleading; correct it
as well.

 * With respect to SMP, the bandwidth is given on a per-CPU basis,
 * meaning that:
 *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
 *  - dl_total_bw array contains, in the i-eth element, the currently
 *    allocated bandwidth on the i-eth CPU.

[1] https://lkml.org/lkml/2010/2/28/119

Signed-off-by: Peng Liu <iwtba...@gmail.com>
---

In fact, I'm not 100% sure this is a bug, since it looks too 'obvious'
and is not newly introduced code. Also, the introduced #ifdef...#endif
pairs look ugly, and I have no idea how to eliminate them. Ideas and
comments are welcome. Thanks.
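To make the mismatch concrete, here is a rough illustration with
made-up numbers (not taken from any real system): suppose a root
domain spans 4 CPUs and dl_b->total_bw already holds the equivalent of
2.0 CPUs of allocated deadline bandwidth. Lowering the global limit to
new_bw = 60% per CPU should still be accepted, because

	4 CPUs * 0.60 = 2.4 CPUs  >  2.0 CPUs allocated

yet the current per-CPU check compares 0.60 against 2.0 and wrongly
returns -EBUSY. The change below compares new_bw * cpus against
dl_b->total_bw instead (in the real code both values are to_ratio()
fixed-point ratios; plain fractions are used here only for
illustration).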

 kernel/sched/deadline.c | 48 ++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h    | 17 +++++----------
 2 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3862a28cd05d..6524cb31148e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2511,33 +2511,43 @@ const struct sched_class dl_sched_class
 	.update_curr		= update_curr_dl,
 };
 
+#ifdef CONFIG_SMP
+static struct cpumask dl_local_possible_mask;
+#endif /* CONFIG_SMP */
+
 int sched_dl_global_validate(void)
 {
 	u64 runtime = global_rt_runtime();
 	u64 period = global_rt_period();
 	u64 new_bw = to_ratio(period, runtime);
 	struct dl_bw *dl_b;
-	int cpu, ret = 0;
+	int cpu, cpus = 1, ret = 0;
 	unsigned long flags;
-
+	cpumask_t *possible_mask = NULL;
+#ifdef CONFIG_SMP
+	cpumask_t *span;
+#endif /* CONFIG_SMP */
 	/*
 	 * Here we want to check the bandwidth not being set to some
 	 * value smaller than the currently allocated bandwidth in
 	 * any of the root_domains.
-	 *
-	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
-	 * cycling on root_domains... Discussion on different/better
-	 * solutions is welcome!
 	 */
-	for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+	possible_mask = &dl_local_possible_mask;
+	cpumask_copy(possible_mask, cpu_possible_mask);
+#endif /* CONFIG_SMP */
+	for_each_cpu(cpu, possible_mask) {
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
-
+#ifdef CONFIG_SMP
+		span = cpu_rq(cpu)->rd->span;
+		cpus = cpumask_weight(span);
+		cpumask_andnot(possible_mask, possible_mask, span);
+#endif /* CONFIG_SMP */
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
-		if (new_bw < dl_b->total_bw)
+		if (new_bw * cpus < dl_b->total_bw)
 			ret = -EBUSY;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
 		rcu_read_unlock_sched();
 
 		if (ret)
@@ -2566,6 +2576,10 @@ void sched_dl_do_global(void)
 	struct dl_bw *dl_b;
 	int cpu;
 	unsigned long flags;
+	cpumask_t *possible_mask = NULL;
+#ifdef CONFIG_SMP
+	cpumask_t *span;
+#endif /* CONFIG_SMP */
 
 	def_dl_bandwidth.dl_period = global_rt_period();
 	def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@ -2573,17 +2587,21 @@ void sched_dl_do_global(void)
 	if (global_rt_runtime() != RUNTIME_INF)
 		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
 
-	/*
-	 * FIXME: As above...
-	 */
-	for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+	possible_mask = &dl_local_possible_mask;
+	cpumask_copy(possible_mask, cpu_possible_mask);
+#endif /* CONFIG_SMP */
+	for_each_cpu(cpu, possible_mask) {
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
 		dl_b->bw = new_bw;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
+#ifdef CONFIG_SMP
+		span = cpu_rq(cpu)->rd->span;
+		cpumask_andnot(possible_mask, possible_mask, span);
+#endif /* CONFIG_SMP */
 		rcu_read_unlock_sched();
 		init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28709f6b0975..2602544e06ff 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -258,9 +258,9 @@ struct rt_bandwidth {
 void __dl_clear_params(struct task_struct *p);
 
 /*
- * To keep the bandwidth of -deadline tasks and groups under control
+ * To keep the bandwidth of -deadline tasks under control
  * we need some place where:
- *  - store the maximum -deadline bandwidth of the system (the group);
+ *  - store the maximum -deadline bandwidth of each root domain;
  *  - cache the fraction of that bandwidth that is currently allocated.
  *
  * This is all done in the data structure below. It is similar to the
@@ -269,17 +269,10 @@ void __dl_clear_params(struct task_struct *p);
  * do not decrease any runtime while the group "executes", neither we
  * need a timer to replenish it.
  *
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * With respect to SMP, the bandwidth is given on a per root domain basis,
  * meaning that:
- *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- *  - dl_total_bw array contains, in the i-eth element, the currently
- *    allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
+ *  - bw (< 100%) is the bandwidth of the system on each CPU;
+ *  - total_bw is the currently allocated bandwidth on each root domain.
  */
 struct dl_bandwidth {
 	raw_spinlock_t		dl_runtime_lock;
-- 
2.20.1