Change real-time bandwidth management so as to make dl_bw a sub-quota of rt_bw. This patch leaves rt_bw at its default value and sets dl_bw to 40% of rt_bw. It also removes the sched_dl_period_us control knob, using sched_rt_period_us as the common period for both rt_bw and dl_bw.
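
For reference, with the defaults this patch establishes (sched_rt_period_us = 1000000, sched_rt_runtime_us = 950000, sched_dl_runtime_us = 400000) the bandwidth actually usable by -deadline tasks works out to .95 * .40 = .38 of each CPU. The small user-space sketch below (illustrative only, not part of the patch) mirrors the integer arithmetic that actual_dl_runtime() performs in the kernel:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Defaults after this patch, all in microseconds. */
        uint64_t rt_period  = 1000000;  /* sched_rt_period_us: common period   */
        uint64_t rt_runtime =  950000;  /* sched_rt_runtime_us: 95% of the CPU */
        uint64_t dl_runtime =  400000;  /* sched_dl_runtime_us: 40% sub-quota  */

        /* dl_runtime/period * rt_runtime/period * period, in integer form. */
        uint64_t actual = dl_runtime * rt_runtime / rt_period;

        printf("-dl tasks may use %llu us every %llu us (%.0f%% of each CPU)\n",
               (unsigned long long)actual, (unsigned long long)rt_period,
               100.0 * (double)actual / (double)rt_period);
        return 0;
}

Built with any C99 compiler this prints 380000 us every 1000000 us, i.e. 38% of each CPU.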
Checks are made when the user tries to change the dl_bw sub-quota, so that it cannot fall below the bandwidth currently allocated. Since dl_bw now depends upon rt_bw, similar checks are performed when the user modifies rt_bw, and dl_bw is updated accordingly. Setting the rt_bw sysctl variable to -1 (which effectively disables rt throttling) disables the dl_bw checks as well.

Signed-off-by: Juri Lelli <juri.le...@gmail.com>
---
 include/linux/sched/sysctl.h |    2 -
 kernel/sched/core.c          |  285 ++++++++++++++++++++----------------------
 kernel/sched/deadline.c      |    5 +-
 kernel/sched/sched.h         |   24 ++--
 kernel/sysctl.c              |    7 --
 5 files changed, 147 insertions(+), 176 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 33fbf32..444a257 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -86,10 +86,8 @@ extern int sysctl_sched_rt_runtime;
 /*
  * control SCHED_DEADLINE reservations:
  *
- * /proc/sys/kernel/sched_dl_period_us
  * /proc/sys/kernel/sched_dl_runtime_us
  */
-extern unsigned int sysctl_sched_dl_period;
 extern int sysctl_sched_dl_runtime;
 
 #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee0ba8b..ddc1f49 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -297,13 +297,12 @@ __read_mostly int scheduler_running;
 int sysctl_sched_rt_runtime = 950000;
 
 /*
- * Maximum bandwidth available for all -deadline tasks and groups
- * (if group scheduling is configured) on each CPU.
+ * Sub-quota of rt bandwidth available for all -deadline tasks
+ * on each CPU.
  *
- * default: 5%
+ * default: 40%
  */
-unsigned int sysctl_sched_dl_period = 1000000;
-int sysctl_sched_dl_runtime = 50000;
+int sysctl_sched_dl_runtime = 400000;
 
@@ -6816,6 +6815,8 @@ LIST_HEAD(task_groups);
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 
+static u64 actual_dl_runtime(void);
+
 void __init sched_init(void)
 {
         int i, j;
@@ -6863,8 +6864,7 @@ void __init sched_init(void)
 
         init_rt_bandwidth(&def_rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
-        init_dl_bandwidth(&def_dl_bandwidth,
-                        global_dl_period(), global_dl_runtime());
+        init_dl_bandwidth(&def_dl_bandwidth, actual_dl_runtime());
 
 #ifdef CONFIG_RT_GROUP_SCHED
         init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -7264,6 +7264,93 @@ void sched_move_task(struct task_struct *tsk)
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
+static u64 actual_dl_runtime(void)
+{
+        u64 dl_runtime = global_dl_runtime();
+        u64 rt_runtime = global_rt_runtime();
+        u64 period = global_rt_period();
+
+        /*
+         * We want to calculate the sub-quota of rt_bw actually available
+         * for -dl tasks. It is a percentage of percentage. By default 95%
+         * of system bandwidth is allocated to -rt tasks; among this, a 40%
+         * quota is reserved for -dl tasks. To have the actual quota a simple
+         * multiplication is needed: .95 * .40 = .38 (38% of system bandwidth
+         * for deadline tasks).
+         * What follows is basically the same, but using unsigned integers.
+         *
+         *                    dl_runtime   rt_runtime
+         *   actual_runtime = ---------- * ---------- * period
+         *                      period       period
+         */
+        if (dl_runtime == RUNTIME_INF)
+                return RUNTIME_INF;
+
+        return div64_u64(dl_runtime * rt_runtime, period);
+}
+
+static int check_dl_bw(void)
+{
+        int i;
+        u64 period = global_rt_period();
+        u64 dl_actual_runtime = def_dl_bandwidth.dl_runtime;
+        u64 new_bw = to_ratio(period, dl_actual_runtime);
+
+        /*
+         * Here we want to check the bandwidth not being set to some
+         * value smaller than the currently allocated bandwidth in
+         * any of the root_domains.
+         *
+         * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+         * cycling on root_domains... Discussion on different/better
+         * solutions is welcome!
+         */
+        for_each_possible_cpu(i) {
+#ifdef CONFIG_SMP
+                struct dl_bw *dl_b = &cpu_rq(i)->rd->dl_bw;
+#else
+                struct dl_bw *dl_b = &cpu_rq(i)->dl.dl_bw;
+#endif
+                raw_spin_lock(&dl_b->lock);
+                if (new_bw < dl_b->total_bw) {
+                        raw_spin_unlock(&dl_b->lock);
+                        return -EBUSY;
+                }
+                raw_spin_unlock(&dl_b->lock);
+        }
+
+        return 0;
+}
+
+static void update_dl_bw(void)
+{
+        u64 new_bw;
+        int i;
+
+        def_dl_bandwidth.dl_runtime = actual_dl_runtime();
+        if (def_dl_bandwidth.dl_runtime == RUNTIME_INF ||
+            global_rt_runtime() == RUNTIME_INF)
+                new_bw = -1;
+        else {
+                new_bw = to_ratio(global_rt_period(),
+                                  def_dl_bandwidth.dl_runtime);
+        }
+        /*
+         * FIXME: As above...
+         */
+        for_each_possible_cpu(i) {
+#ifdef CONFIG_SMP
+                struct dl_bw *dl_b = &cpu_rq(i)->rd->dl_bw;
+#else
+                struct dl_bw *dl_b = &cpu_rq(i)->dl.dl_bw;
+#endif
+
+                raw_spin_lock(&dl_b->lock);
+                dl_b->bw = new_bw;
+                raw_spin_unlock(&dl_b->lock);
+        }
+}
+
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
@@ -7437,48 +7524,10 @@ static long sched_group_rt_period(struct task_group *tg)
         do_div(rt_period_us, NSEC_PER_USEC);
         return rt_period_us;
 }
-#endif /* CONFIG_RT_GROUP_SCHED */
 
-/*
- * Coupling of -rt and -deadline bandwidth.
- *
- * Here we check if the new -rt bandwidth value is consistent
- * with the system settings for the bandwidth available
- * to -deadline tasks.
- *
- * IOW, we want to enforce that
- *
- *   rt_bandwidth + dl_bandwidth <= 100%
- *
- * is always true.
- */
-static bool __sched_rt_dl_global_constraints(u64 rt_bw)
-{
-        unsigned long flags;
-        u64 dl_bw;
-        bool ret;
-
-        raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags);
-        if (global_rt_runtime() == RUNTIME_INF ||
-            global_dl_runtime() == RUNTIME_INF) {
-                ret = true;
-                goto unlock;
-        }
-
-        dl_bw = to_ratio(def_dl_bandwidth.dl_period,
-                         def_dl_bandwidth.dl_runtime);
-
-        ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
-unlock:
-        raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags);
-
-        return ret;
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
 static int sched_rt_global_constraints(void)
 {
-        u64 runtime, period, bw;
+        u64 runtime, period;
         int ret = 0;
 
         if (sysctl_sched_rt_period <= 0)
@@ -7493,9 +7542,13 @@ static int sched_rt_global_constraints(void)
         if (runtime > period && runtime != RUNTIME_INF)
                 return -EINVAL;
 
-        bw = to_ratio(period, runtime);
-        if (!__sched_rt_dl_global_constraints(bw))
-                return -EINVAL;
+        /*
+         * Check if changing rt_bw could have negative effects
+         * on dl_bw
+         */
+        ret = check_dl_bw();
+        if (ret)
+                return ret;
 
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
@@ -7519,18 +7572,27 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
         unsigned long flags;
-        int i, ret = 0;
-        u64 bw;
+        int i, ret;
 
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
 
+        /*
+         * There's always some RT tasks in the root group
+         * -- migration, kstopmachine etc..
+         */
+        if (sysctl_sched_rt_runtime == 0)
+                return -EBUSY;
+
+        /*
+         * Check if changing rt_bw could have negative effects
+         * on dl_bw
+         */
+        ret = check_dl_bw();
+        if (ret)
+                return ret;
+
         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
-        bw = to_ratio(global_rt_period(), global_rt_runtime());
-        if (!__sched_rt_dl_global_constraints(bw)) {
-                ret = -EINVAL;
-                goto unlock;
-        }
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
 
@@ -7539,48 +7601,12 @@ static int sched_rt_global_constraints(void)
                 rt_rq->rt_runtime = global_rt_runtime();
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
         }
-unlock:
         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-        return ret;
+        return 0;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-/*
- * Coupling of -dl and -rt bandwidth.
- *
- * Here we check, while setting the system wide bandwidth available
- * for -dl tasks and groups, if the new values are consistent with
- * the system settings for the bandwidth available to -rt entities.
- *
- * IOW, we want to enforce that
- *
- *   rt_bandwidth + dl_bandwidth <= 100%
- *
- * is always true.
- */
-static bool __sched_dl_rt_global_constraints(u64 dl_bw)
-{
-        u64 rt_bw;
-        bool ret;
-
-        raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock);
-        if (global_dl_runtime() == RUNTIME_INF ||
-            global_rt_runtime() == RUNTIME_INF) {
-                ret = true;
-                goto unlock;
-        }
-
-        rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period),
-                         def_rt_bandwidth.rt_runtime);
-
-        ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
-unlock:
-        raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock);
-
-        return ret;
-}
-
 static bool __sched_dl_global_constraints(u64 runtime, u64 period)
 {
         if (!period || (runtime != RUNTIME_INF && runtime > period))
@@ -7591,40 +7617,17 @@ static bool __sched_dl_global_constraints(u64 runtime, u64 period)
 
 static int sched_dl_global_constraints(void)
 {
-        u64 runtime = global_dl_runtime();
-        u64 period = global_dl_period();
-        u64 new_bw = to_ratio(period, runtime);
-        int ret, i;
+        u64 period = global_rt_period();
+        u64 dl_actual_runtime = def_dl_bandwidth.dl_runtime;
+        int ret;
 
-        ret = __sched_dl_global_constraints(runtime, period);
+        ret = __sched_dl_global_constraints(dl_actual_runtime, period);
         if (ret)
                 return ret;
 
-        if (!__sched_dl_rt_global_constraints(new_bw))
-                return -EINVAL;
-
-        /*
-         * Here we want to check the bandwidth not being set to some
-         * value smaller than the currently allocated bandwidth in
-         * any of the root_domains.
-         *
-         * FIXME: Cycling on all the CPUs is overdoing, but simpler than
-         * cycling on root_domains... Discussion on different/better
-         * solutions is welcome!
-         */
-        for_each_possible_cpu(i) {
-#ifdef CONFIG_SMP
-                struct dl_bw *dl_b = &cpu_rq(i)->rd->dl_bw;
-#else
-                struct dl_bw *dl_b = &cpu_rq(i)->dl.dl_bw;
-#endif
-                raw_spin_lock(&dl_b->lock);
-                if (new_bw < dl_b->total_bw) {
-                        raw_spin_unlock(&dl_b->lock);
-                        return -EBUSY;
-                }
-                raw_spin_unlock(&dl_b->lock);
-        }
+        ret = check_dl_bw();
+        if (ret)
+                return ret;
 
         return 0;
 }
@@ -7655,6 +7658,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
         int ret;
         int old_period, old_runtime;
         static DEFINE_MUTEX(mutex);
+        unsigned long flags;
 
         mutex_lock(&mutex);
         old_period = sysctl_sched_rt_period;
@@ -7664,6 +7668,8 @@ int sched_rt_handler(struct ctl_table *table, int write,
 
         if (!ret && write) {
                 ret = sched_rt_global_constraints();
+                raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock,
+                                      flags);
                 if (ret) {
                         sysctl_sched_rt_period = old_period;
                         sysctl_sched_rt_runtime = old_runtime;
@@ -7671,7 +7677,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
                         def_rt_bandwidth.rt_runtime = global_rt_runtime();
                         def_rt_bandwidth.rt_period =
                                 ns_to_ktime(global_rt_period());
+
+                        update_dl_bw();
                 }
+                raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock,
+                                           flags);
         }
         mutex_unlock(&mutex);
 
@@ -7683,12 +7693,11 @@ int sched_dl_handler(struct ctl_table *table, int write,
                 loff_t *ppos)
 {
         int ret;
-        int old_period, old_runtime;
+        int old_runtime;
         static DEFINE_MUTEX(mutex);
         unsigned long flags;
 
         mutex_lock(&mutex);
-        old_period = sysctl_sched_dl_period;
         old_runtime = sysctl_sched_dl_runtime;
 
         ret = proc_dointvec(table, write, buffer, lenp, ppos);
@@ -7699,33 +7708,9 @@ int sched_dl_handler(struct ctl_table *table, int write,
                 ret = sched_dl_global_constraints();
 
                 if (ret) {
-                        sysctl_sched_dl_period = old_period;
                         sysctl_sched_dl_runtime = old_runtime;
                 } else {
-                        u64 new_bw;
-                        int i;
-
-                        def_dl_bandwidth.dl_period = global_dl_period();
-                        def_dl_bandwidth.dl_runtime = global_dl_runtime();
-                        if (global_dl_runtime() == RUNTIME_INF)
-                                new_bw = -1;
-                        else
-                                new_bw = to_ratio(global_dl_period(),
-                                                  global_dl_runtime());
-                        /*
-                         * FIXME: As above...
-                         */
-                        for_each_possible_cpu(i) {
-#ifdef CONFIG_SMP
-                                struct dl_bw *dl_b = &cpu_rq(i)->rd->dl_bw;
-#else
-                                struct dl_bw *dl_b = &cpu_rq(i)->dl.dl_bw;
-#endif
-
-                                raw_spin_lock(&dl_b->lock);
-                                dl_b->bw = new_bw;
-                                raw_spin_unlock(&dl_b->lock);
-                        }
+                        update_dl_bw();
                 }
 
                 raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 35d7c2d..ca20110 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -48,10 +48,9 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
         return dl_rq->rb_leftmost == &dl_se->rb_node;
 }
 
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 runtime)
 {
         raw_spin_lock_init(&dl_b->dl_runtime_lock);
-        dl_b->dl_period = period;
         dl_b->dl_runtime = runtime;
 }
 
@@ -64,7 +63,7 @@ void init_dl_bw(struct dl_bw *dl_b)
         if (global_dl_runtime() == RUNTIME_INF)
                 dl_b->bw = -1;
         else
-                dl_b->bw = to_ratio(global_dl_period(), global_dl_runtime());
+                dl_b->bw = to_ratio(global_rt_period(), global_dl_runtime());
         raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
         dl_b->total_bw = 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 08931a6..bf4acf8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -142,20 +142,20 @@ struct rt_bandwidth {
         struct hrtimer          rt_period_timer;
 };
 /*
- * To keep the bandwidth of -deadline tasks and groups under control
- * we need some place where:
- *  - store the maximum -deadline bandwidth of the system (the group);
+ * To keep the bandwidth of -deadline tasks under control we need some
+ * place where:
+ *  - store the maximum -deadline bandwidth of the system;
  *  - cache the fraction of that bandwidth that is currently allocated.
  *
  * This is all done in the data structure below. It is similar to the
  * one used for RT-throttling (rt_bandwidth), with the main difference
  * that, since here we are only interested in admission control, we
- * do not decrease any runtime while the group "executes", neither we
+ * do not decrease any runtime while the task "executes", neither we
  * need a timer to replenish it.
  *
  * With respect to SMP, the bandwidth is given on a per-CPU basis,
  * meaning that:
- *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
+ *  - dl_bw (< 100%) is the bandwidth of the system on each CPU;
  *  - dl_total_bw array contains, in the i-eth element, the currently
  *    allocated bandwidth on the i-eth CPU.
  * Moreover, groups consume bandwidth on each CPU, while tasks only
@@ -168,7 +168,6 @@ struct rt_bandwidth {
 struct dl_bandwidth {
         raw_spinlock_t dl_runtime_lock;
         u64 dl_runtime;
-        u64 dl_period;
 };
 
 static inline int dl_bandwidth_enabled(void)
@@ -178,10 +177,12 @@
 
 struct dl_bw {
         raw_spinlock_t lock;
-        u64 bw, total_bw;
+        /* default value */
+        u64 bw;
+        /* allocated */
+        u64 total_bw;
 };
 
-static inline u64 global_dl_period(void);
 static inline u64 global_dl_runtime(void);
 
 extern struct mutex sched_domains_mutex;
@@ -924,11 +925,6 @@ static inline u64 global_rt_runtime(void)
         return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-static inline u64 global_dl_period(void)
-{
-        return (u64)sysctl_sched_dl_period * NSEC_PER_USEC;
-}
-
 static inline u64 global_dl_runtime(void)
 {
         if (sysctl_sched_dl_runtime < 0)
@@ -1192,7 +1188,7 @@ extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 
 extern struct dl_bandwidth def_dl_bandwidth;
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
 unsigned long to_ratio(u64 period, u64 runtime);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f0cf6d6..8159e7e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -415,13 +415,6 @@ static struct ctl_table kern_table[] = {
                 .proc_handler   = sched_rr_handler,
         },
         {
-                .procname       = "sched_dl_period_us",
-                .data           = &sysctl_sched_dl_period,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = sched_dl_handler,
-        },
-        {
                 .procname       = "sched_dl_runtime_us",
                 .data           = &sysctl_sched_dl_runtime,
                 .maxlen         = sizeof(int),
-- 
1.7.9.5