Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com>
---
 kernel/sched/core.c    |   2 +-
 kernel/sched/cpuacct.c | 224 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+), 1 deletion(-)
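A note for reviewers on the arithmetic in fixup_vcpustat_delta_usage()
below: when a vcpu's usage delta falls short of its target, each of
USER/NICE/SYSTEM is topped up in proportion to that field's share of the
remainder; when it overshoots, each field is trimmed in proportion to its
share of the vcpu's own usage. The standalone sketch below models that
rule in userspace and is illustrative only, not part of the patch:
fixup_usage(), the F_* indices, and the sample numbers are hypothetical
stand-ins for the kernel helpers, the CPUTIME_* indices, and real cpustat
deltas.

/*
 * Userspace model of the fixup_vcpustat_delta_usage() scaling rule;
 * fixup_usage() and the F_* indices are hypothetical stand-ins, not
 * kernel interfaces.
 */
#include <stdio.h>
#include <stdint.h>

enum { F_USER, F_NICE, F_SYSTEM, F_MAX };

static void fixup_usage(int64_t *cur, int64_t *rem, int ind,
			uint64_t cur_usage, uint64_t target_usage,
			uint64_t rem_usage)
{
	uint32_t scale_pct = 0;
	int64_t scaled_val;

	/* top up from the remainder, or shave off the excess, in
	 * proportion to the field's share of the respective total */
	if (cur_usage < target_usage) {
		if (rem_usage > 0)
			scale_pct = 100 * (uint64_t)rem[ind] / rem_usage;
	} else {
		if (cur_usage > 0)
			scale_pct = 100 * (uint64_t)cur[ind] / cur_usage;
	}

	scaled_val = (int64_t)scale_pct *
		     ((int64_t)target_usage - (int64_t)cur_usage) / 100;

	cur[ind] += scaled_val;
	if (cur[ind] < 0)
		cur[ind] = 0;
	rem[ind] -= scaled_val;
	if (rem[ind] < 0)
		rem[ind] = 0;
}

int main(void)
{
	/* a vcpu ran 60 units (40 user + 20 system); 30 excess units
	 * (20 user + 10 nice) sit in the remainder; the cap is 80 */
	int64_t cur[F_MAX] = { 40, 0, 20 };
	int64_t rem[F_MAX] = { 20, 10, 0 };
	uint64_t cur_usage = 60, rem_usage = 30, max_usage = 80;
	uint64_t target = cur_usage + rem_usage < max_usage ?
			  cur_usage + rem_usage : max_usage;
	int i;

	for (i = 0; i < F_MAX; i++)
		fixup_usage(cur, rem, i, cur_usage, target, rem_usage);

	for (i = 0; i < F_MAX; i++)
		printf("field %d: cur=%lld rem=%lld\n",
		       i, (long long)cur[i], (long long)rem[i]);
	return 0;
}

With these numbers the vcpu ran 60 of a possible 80 units, so the 20-unit
shortfall is topped up from the 30-unit remainder according to its split:
user gains 66% of 20 = 13, nice gains 33% of 20 = 6, and the remaining
1 unit is lost to integer-percentage truncation (the kernel code rounds
the same way).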
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88bc46d163b3..e381085eb771 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6569,7 +6569,7 @@ void sched_move_task(struct task_struct *tsk)
 	task_rq_unlock(rq, tsk, &rf);
 }
 
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct task_group, css) : NULL;
 }
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..9f0ec721aec7 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@
  * Based on the work by Paul Menage (men...@google.com) and Balbir Singh
  * (bal...@in.ibm.com).
  */
+#include <linux/kernel_stat.h>
 #include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -373,3 +374,226 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.legacy_cftypes	= files,
 	.early_init	= true,
 };
+
+extern inline struct task_group *css_tg(struct cgroup_subsys_state *css);
+
+static struct task_group *ve_root_tg(struct task_group *tg)
+{
+	struct cgroup *cg;
+
+	if (!tg)
+		return NULL;
+
+	cg = cgroup_get_ve_root1(tg->css.cgroup);
+	return cg ? css_tg(&cg->self) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+	unsigned int cpu_rate = 0;
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		cpu_rate = tg->cpu_rate;
+#endif
+	return cpu_rate;
+}
+
+static unsigned int tg_nr_cpus(struct task_group *tg)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		nr_cpus = tg->nr_cpus;
+#endif
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup_subsys_state *css, int cpu)
+{
+	return per_cpu_ptr(css_ca(css)->cpustat, cpu);
+}
+
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+				       struct kernel_cpustat *rem, int ind,
+				       u64 cur_usage, u64 target_usage,
+				       u64 rem_usage)
+{
+	s64 scaled_val;
+	u32 scale_pct = 0;
+
+	/* distribute the delta among USER, NICE, and SYSTEM proportionally */
+	if (cur_usage < target_usage) {
+		if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * rem->cpustat[ind],
+					      rem_usage);
+	} else {
+		if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * cur->cpustat[ind],
+					      cur_usage);
+	}
+
+	scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+	cur->cpustat[ind] += scaled_val;
+	if ((s64)cur->cpustat[ind] < 0)
+		cur->cpustat[ind] = 0;
+
+	rem->cpustat[ind] -= scaled_val;
+	if ((s64)rem->cpustat[ind] < 0)
+		rem->cpustat[ind] = 0;
+}
+
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+				     int ind, u64 cur_idle, u64 target_idle)
+{
+	/* distribute target_idle between IDLE and IOWAIT proportionally to
+	 * what we initially had on this vcpu */
+	if ((s64)cur_idle > 0) {
+		u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+		cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+	} else {
+		cur->cpustat[ind] = ind == CPUTIME_IDLE ?
+					target_idle : 0;
+	}
+}
+
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+				 struct kernel_cpustat *rem,
+				 u64 max_usage)
+{
+	u64 cur_usage, target_usage, rem_usage;
+	u64 cur_idle, target_idle;
+
+	cur_usage = kernel_cpustat_total_usage(cur);
+	rem_usage = kernel_cpustat_total_usage(rem);
+
+	target_usage = min(cur_usage + rem_usage, max_usage);
+
+	if (cur_usage != target_usage) {
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
+				cur_usage, target_usage, rem_usage);
+	}
+
+	cur_idle = kernel_cpustat_total_idle(cur);
+	target_idle = max_usage - target_usage;
+
+	if (cur_idle != target_idle) {
+		calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
+					 cur_idle, target_idle);
+		calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
+					 cur_idle, target_idle);
+	}
+
+	/* do not show steal time inside ve */
+	cur->cpustat[CPUTIME_STEAL] = 0;
+}
+
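+/*
+ * Rebuild the per-vcpu cpustat view of a task group from the per-pcpu
+ * cpuacct stats: pcpu j is folded into vcpu (j % nr_vcpus).  Each
+ * vcpu's usage delta is clamped to max_usage, the time a single vcpu
+ * could have run at the configured rate since the last update; pass
+ * one clips the vcpus that exceed it and collects the excess in
+ * stat_rem, pass two spreads that excess over the vcpus that stayed
+ * below the limit.
+ */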
+static void cpu_cgroup_update_vcpustat(struct cgroup_subsys_state *cpu_css,
+				       struct cgroup_subsys_state *cpuacct_css)
+{
+	int i, j;
+	int nr_vcpus;
+	int vcpu_rate;
+	ktime_t now;
+	u64 max_usage;
+	struct kernel_cpustat stat_delta, stat_rem;
+	struct task_group *tg = css_tg(cpu_css);
+	int first_pass = 1;
+
+	spin_lock(&tg->vcpustat_lock);
+
+	now = ktime_get();
+	nr_vcpus = tg_nr_cpus(tg);
+	vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
+	if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+		vcpu_rate = MAX_CPU_RATE;
+
+	if (!ktime_to_ns(tg->vcpustat_last_update)) {
+		/* on the first read initialize vcpu i stat as a sum of stats
+		 * over pcpus j such that j % nr_vcpus == i */
+		for (i = 0; i < nr_vcpus; i++) {
+			for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+				if (!cpu_possible(j))
+					continue;
+				kernel_cpustat_add(tg->vcpustat + i,
+					cpuacct_cpustat(cpuacct_css, j),
+					tg->vcpustat + i);
+			}
+		}
+		goto out_update_last;
+	}
+
+	max_usage = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+	max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
+	/* don't update stats too often, to avoid calculation errors */
+	if (max_usage < 10)
+		goto out_unlock;
+
+	/* temporarily copy the per-cpu usage delta to tg->cpustat_last */
+	for_each_possible_cpu(i)
+		kernel_cpustat_sub(cpuacct_cpustat(cpuacct_css, i),
+				   tg->cpustat_last + i,
+				   tg->cpustat_last + i);
+
+	/* proceed to calculating the per-vcpu deltas */
+	kernel_cpustat_zero(&stat_rem);
+
+again:
+	for (i = 0; i < nr_vcpus; i++) {
+		int exceeds_max;
+
+		kernel_cpustat_zero(&stat_delta);
+		for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+			if (!cpu_possible(j))
+				continue;
+			kernel_cpustat_add(&stat_delta,
+				tg->cpustat_last + j, &stat_delta);
+		}
+
+		exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+				max_usage;
+		/*
+		 * On the first pass calculate the delta for vcpus with usage
+		 * >= max_usage in order to accumulate the excess in stat_rem.
+		 *
+		 * Once the remainder is accumulated, proceed to the rest of
+		 * the vcpus so that it is distributed among them.
+		 */
+		if (exceeds_max != first_pass)
+			continue;
+
+		fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+		kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+				   tg->vcpustat + i);
+	}
+
+	if (first_pass) {
+		first_pass = 0;
+		goto again;
+	}
+out_update_last:
+	for_each_possible_cpu(i)
+		tg->cpustat_last[i] = *cpuacct_cpustat(cpuacct_css, i);
+	tg->vcpustat_last_update = now;
+out_unlock:
+	spin_unlock(&tg->vcpustat_lock);
+}
-- 
2.28.0

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel