Tasks without a user-defined clamp value are considered not clamped and by default their utilization can be any value in the [0..SCHED_CAPACITY_SCALE] range. Tasks with a user-defined clamp value are allowed to request any value in that range, and we currently unconditionally enforce the required clamps. However, a "System Management Software" could be interested in unconditionally limiting the range of clamp values allowed for all tasks.
Let's fix this by explicitly adding a privileged interface to define a system default configuration via: /proc/sys/kernel/sched_uclamp_util_{min,max} which works as an unconditional clamp range restriction for all tasks. If a task specific value is not compliant with the system default range, it will be forced to the corresponding system default value. Signed-off-by: Patrick Bellasi <patrick.bell...@arm.com> Cc: Ingo Molnar <mi...@redhat.com> Cc: Peter Zijlstra <pet...@infradead.org> Cc: Tejun Heo <t...@kernel.org> Cc: Paul Turner <p...@google.com> Cc: Suren Baghdasaryan <sur...@google.com> Cc: Todd Kjos <tk...@google.com> Cc: Joel Fernandes <joe...@google.com> Cc: Steve Muckle <smuc...@google.com> Cc: Juri Lelli <juri.le...@redhat.com> Cc: Quentin Perret <quentin.per...@arm.com> Cc: Dietmar Eggemann <dietmar.eggem...@arm.com> Cc: Morten Rasmussen <morten.rasmus...@arm.com> Cc: linux-kernel@vger.kernel.org Cc: linux...@vger.kernel.org --- The current restriction could be too aggressive since, for example, if a task has a util_min which is higher than the system default max, it will be forced to the system default min unconditionally. We should probably better restrict util_min to the maximum system default value, but that would make the code more complex and we leave it for a future update. Changes in v5: Other: - rebased on v4.19 Changes in v4: Message-ID: <20180820122728.GM2960@e110439-lin> - fix unwanted reset of clamp values on refcount success Others: - by default all tasks have a UCLAMP_NOT_VALID task specific clamp - always use: p->uclamp[clamp_id].effective.value to track the actual clamp value the task has been refcounted into. 
This matches with the usage of p->uclamp[clamp_id].effective.group_id - rebased on v4.19-rc1 --- include/linux/sched.h | 5 ++ include/linux/sched/sysctl.h | 11 +++ kernel/sched/core.c | 131 ++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 16 +++++ 4 files changed, 154 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab1cbd4e3b1..ec6783ea4e7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -614,6 +614,11 @@ struct uclamp_se { unsigned int group_id : order_base_2(UCLAMP_GROUPS); unsigned int mapped : 1; unsigned int active : 1; + /* Clamp group and value actually used by a RUNNABLE task */ + struct { + unsigned int value : SCHED_CAPACITY_SHIFT + 1; + unsigned int group_id : order_base_2(UCLAMP_GROUPS); + } effective; }; #endif /* CONFIG_UCLAMP_TASK */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index a9c32daeb9d8..445fb54eaeff 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +#ifdef CONFIG_UCLAMP_TASK +extern unsigned int sysctl_sched_uclamp_util_min; +extern unsigned int sysctl_sched_uclamp_util_max; +#endif + #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif @@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_UCLAMP_TASK +extern int sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b49062439f3..8421ef96ec97 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -729,6 +729,23 @@ static void 
set_load_weight(struct task_struct *p, bool update_load) */ static DEFINE_MUTEX(uclamp_mutex); +/* + * Minimum utilization for all tasks + * default: 0 + */ +unsigned int sysctl_sched_uclamp_util_min; + +/* + * Maximum utilization for all tasks + * default: 1024 + */ +unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; + +/* + * Tasks' clamp values are required to be within this range + */ +static struct uclamp_se uclamp_default[UCLAMP_CNT]; + /** * uclamp_map: reference count utilization clamp groups * @value: the utilization "clamp value" tracked by this clamp group @@ -857,6 +874,55 @@ static inline void uclamp_cpu_update(struct rq *rq, unsigned int clamp_id, rq->uclamp.value[clamp_id] = max_value; } +/** + * uclamp_effective_group_id: get the effective clamp group index of a task + * @p: the task to get the effective clamp value for + * @clamp_id: the clamp index to consider + * + * The effective clamp group index of a task depends on: + * - the task specific clamp value, explicitly requested from userspace + * - the system default clamp value, defined by the sysadmin + * and task-specific clamp values are always restricted by system + * default clamp values. + * + * This method returns the effective group index for a task, depending on its + * status and a proper aggregation of the clamp values listed above. + * Moreover, it ensures that the task's effective value: + * task_struct::uclamp::effective::value + * is updated to represent the clamp value corresponding to the task's effective + * group index. 
+ */ +static inline unsigned int uclamp_effective_group_id(struct task_struct *p, + unsigned int clamp_id) +{ + unsigned int clamp_value; + unsigned int group_id; + + /* Task currently refcounted into a CPU clamp group */ + if (p->uclamp[clamp_id].active) + return p->uclamp[clamp_id].effective.group_id; + + /* Task specific clamp value */ + clamp_value = p->uclamp[clamp_id].value; + group_id = p->uclamp[clamp_id].group_id; + + /* System default restriction */ + if (unlikely(clamp_value < uclamp_default[UCLAMP_MIN].value || + clamp_value > uclamp_default[UCLAMP_MAX].value)) { + /* + * Unconditionally enforce system defaults, which is a simpler + * solution compared to a proper clamping. + */ + clamp_value = uclamp_default[clamp_id].value; + group_id = uclamp_default[clamp_id].group_id; + } + + p->uclamp[clamp_id].effective.value = clamp_value; + p->uclamp[clamp_id].effective.group_id = group_id; + + return group_id; +} + /** * uclamp_cpu_get_id(): increase reference count for a clamp group on a CPU * @p: the task being enqueued on a CPU @@ -869,16 +935,17 @@ static inline void uclamp_cpu_update(struct rq *rq, unsigned int clamp_id, static inline void uclamp_cpu_get_id(struct task_struct *p, struct rq *rq, unsigned int clamp_id) { - unsigned int clamp_value; + unsigned int effective; unsigned int group_id; if (unlikely(!p->uclamp[clamp_id].mapped)) return; - group_id = p->uclamp[clamp_id].group_id; + group_id = uclamp_effective_group_id(p, clamp_id); p->uclamp[clamp_id].active = true; rq->uclamp.group[clamp_id][group_id].tasks += 1; + effective = p->uclamp[clamp_id].effective.value; if (unlikely(rq->uclamp.flags & UCLAMP_FLAG_IDLE)) { /* @@ -889,16 +956,15 @@ static inline void uclamp_cpu_get_id(struct task_struct *p, struct rq *rq, */ if (clamp_id == UCLAMP_MAX) rq->uclamp.flags &= ~UCLAMP_FLAG_IDLE; - rq->uclamp.value[clamp_id] = p->uclamp[clamp_id].value; + rq->uclamp.value[clamp_id] = effective; } /* CPU's clamp groups track the max effective clamp value */ - 
clamp_value = p->uclamp[clamp_id].value; - if (clamp_value > rq->uclamp.group[clamp_id][group_id].value) - rq->uclamp.group[clamp_id][group_id].value = clamp_value; + if (effective > rq->uclamp.group[clamp_id][group_id].value) + rq->uclamp.group[clamp_id][group_id].value = effective; - if (rq->uclamp.value[clamp_id] < p->uclamp[clamp_id].value) - rq->uclamp.value[clamp_id] = p->uclamp[clamp_id].value; + if (rq->uclamp.value[clamp_id] < effective) + rq->uclamp.value[clamp_id] = effective; } /** @@ -922,7 +988,7 @@ static inline void uclamp_cpu_put_id(struct task_struct *p, struct rq *rq, if (unlikely(!p->uclamp[clamp_id].mapped)) return; - group_id = p->uclamp[clamp_id].group_id; + group_id = uclamp_effective_group_id(p, clamp_id); p->uclamp[clamp_id].active = false; if (likely(rq->uclamp.group[clamp_id][group_id].tasks)) @@ -1172,6 +1238,50 @@ static void uclamp_group_get(struct task_struct *p, struct uclamp_se *uc_se, uc_se->mapped = true; } +int sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_min, old_max; + int result = 0; + + mutex_lock(&uclamp_mutex); + + old_min = sysctl_sched_uclamp_util_min; + old_max = sysctl_sched_uclamp_util_max; + + result = proc_dointvec(table, write, buffer, lenp, ppos); + if (result) + goto undo; + if (!write) + goto done; + + if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + result = -EINVAL; + goto undo; + } + + if (old_min != sysctl_sched_uclamp_util_min) { + uclamp_group_get(NULL, &uclamp_default[UCLAMP_MIN], + UCLAMP_MIN, sysctl_sched_uclamp_util_min); + } + if (old_max != sysctl_sched_uclamp_util_max) { + uclamp_group_get(NULL, &uclamp_default[UCLAMP_MAX], + UCLAMP_MAX, sysctl_sched_uclamp_util_max); + } + goto done; + +undo: + sysctl_sched_uclamp_util_min = old_min; + sysctl_sched_uclamp_util_max = old_max; + +done: + mutex_unlock(&uclamp_mutex); + + return result; +} + static 
int __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { @@ -1268,6 +1378,9 @@ static void __init init_uclamp(void) for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) { uc_se = &init_task.uclamp[clamp_id]; uclamp_group_get(NULL, uc_se, clamp_id, uclamp_none(clamp_id)); + + uc_se = &uclamp_default[clamp_id]; + uclamp_group_get(NULL, uc_se, clamp_id, uclamp_none(clamp_id)); } } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc02050fd0c4..378ea57e5fc5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -445,6 +445,22 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_uclamp_util_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_uclamp_handler, + }, + { + .procname = "sched_uclamp_util_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_uclamp_handler, + }, +#endif #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", -- 2.18.0