Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com>
---
 kernel/sched/core.c    |   2 +-
 kernel/sched/cpuacct.c | 215 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+), 1 deletion(-)
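
Not part of the change itself, but as a quick illustration of what
cpu_cgroup_update_vcpustat() computes: per-pcpu usage deltas are folded onto
vcpu i over the pcpus j with j % nr_vcpus == i, and each vcpu's usage for the
interval is capped at elapsed * vcpu_rate / MAX_CPU_RATE; the excess is
collected into a remainder which the kernel code then redistributes to the
under-used vcpus on a second pass, split proportionally across
USER/NICE/SYSTEM.  The userspace sketch below shows only the folding and
capping step; all names and values in it are stand-ins, not the kernel
implementation.

/*
 * Minimal userspace sketch of the pcpu -> vcpu folding and rate capping
 * done in cpu_cgroup_update_vcpustat().  All values here are stand-ins;
 * MAX_CPU_RATE in particular is not the kernel's constant.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_CPU_RATE 1024		/* stand-in value */

int main(void)
{
	/* per-pcpu usage deltas (ns) accumulated since the last update */
	uint64_t pcpu_delta[] = { 9000000, 1000000, 5000000, 3000000 };
	int nr_pcpus = 4, nr_vcpus = 2;
	uint64_t elapsed = 10000000;	/* ns since the last update */
	unsigned int vcpu_rate = 512;	/* i.e. half a cpu per vcpu */
	uint64_t max_usage = elapsed * vcpu_rate / MAX_CPU_RATE;
	uint64_t rem = 0;
	int i, j;

	for (i = 0; i < nr_vcpus; i++) {
		uint64_t usage = 0;

		/* fold pcpus onto vcpu i: all j with j % nr_vcpus == i */
		for (j = i; j < nr_pcpus; j += nr_vcpus)
			usage += pcpu_delta[j];

		/* cap at what one vcpu could have run during the interval */
		if (usage > max_usage) {
			rem += usage - max_usage;
			usage = max_usage;
		}
		printf("vcpu%d: usage %llu ns, idle %llu ns\n", i,
		       (unsigned long long)usage,
		       (unsigned long long)(max_usage - usage));
	}
	printf("remainder to redistribute: %llu ns\n",
	       (unsigned long long)rem);
	return 0;
}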

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88bc46d163b3..e381085eb771 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6569,7 +6569,7 @@ void sched_move_task(struct task_struct *tsk)
        task_rq_unlock(rq, tsk, &rf);
 }
 
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
        return css ? container_of(css, struct task_group, css) : NULL;
 }
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..9f0ec721aec7 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@
  * Based on the work by Paul Menage (men...@google.com) and Balbir Singh
  * (bal...@in.ibm.com).
  */
+#include <linux/kernel_stat.h>
 #include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -373,3 +374,217 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
        .legacy_cftypes = files,
        .early_init     = true,
 };
+
+extern inline struct task_group *css_tg(struct cgroup_subsys_state *css);
+
+static struct task_group *ve_root_tg(struct task_group *tg) {
+       struct cgroup *cg;
+
+       if (!tg)
+               return NULL;
+
+       cg = cgroup_get_ve_root1(tg->css.cgroup);
+       return cg ? css_tg(&cg->self) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+       unsigned int cpu_rate = 0;
+#ifdef CONFIG_CFS_CPULIMIT
+       tg = ve_root_tg(tg);
+       if (tg)
+               cpu_rate = tg->cpu_rate;
+#endif
+       return cpu_rate;
+}
+
+static unsigned int tg_nr_cpus(struct task_group *tg)
+{
+       unsigned int nr_cpus = 0;
+       unsigned int max_nr_cpus = num_online_cpus();
+
+#ifdef CONFIG_CFS_CPULIMIT
+       tg = ve_root_tg(tg);
+       if (tg)
+               nr_cpus = tg->nr_cpus;
+#endif
+       if (!nr_cpus || nr_cpus > max_nr_cpus)
+               nr_cpus = max_nr_cpus;
+
+       return nr_cpus;
+}
+
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup_subsys_state *css, int cpu)
+{
+       return per_cpu_ptr(css_ca(css)->cpustat, cpu);
+}
+
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+                                      struct kernel_cpustat *rem, int ind,
+                                      u64 cur_usage, u64 target_usage,
+                                      u64 rem_usage)
+{
+       s64 scaled_val;
+       u32 scale_pct = 0;
+
+       /* distribute the delta among USER, NICE, and SYSTEM proportionally */
+       if (cur_usage < target_usage) {
+               if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+                       scale_pct = div64_u64(100 * rem->cpustat[ind],
+                                             rem_usage);
+       } else {
+               if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+                       scale_pct = div64_u64(100 * cur->cpustat[ind],
+                                             cur_usage);
+       }
+
+       scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+       cur->cpustat[ind] += scaled_val;
+       if ((s64)cur->cpustat[ind] < 0)
+               cur->cpustat[ind] = 0;
+
+       rem->cpustat[ind] -= scaled_val;
+       if ((s64)rem->cpustat[ind] < 0)
+               rem->cpustat[ind] = 0;
+}
+
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+                                    int ind, u64 cur_idle, u64 target_idle)
+{
+       /* distribute target_idle between IDLE and IOWAIT proportionally to
+        * what we initially had on this vcpu */
+       if ((s64)cur_idle > 0) {
+               u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+               cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+       } else {
+               cur->cpustat[ind] = ind == CPUTIME_IDLE ? target_idle : 0;
+       }
+}
+
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+                                struct kernel_cpustat *rem,
+                                u64 max_usage)
+{
+       u64 cur_usage, target_usage, rem_usage;
+       u64 cur_idle, target_idle;
+
+       cur_usage = kernel_cpustat_total_usage(cur);
+       rem_usage = kernel_cpustat_total_usage(rem);
+
+       target_usage = min(cur_usage + rem_usage,
+                       max_usage);
+
+       if (cur_usage != target_usage) {
+               fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
+                               cur_usage, target_usage, rem_usage);
+               fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
+                               cur_usage, target_usage, rem_usage);
+               fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
+                               cur_usage, target_usage, rem_usage);
+       }
+
+       cur_idle = kernel_cpustat_total_idle(cur);
+       target_idle = max_usage - target_usage;
+
+       if (cur_idle != target_idle) {
+               calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
+                                        cur_idle, target_idle);
+               calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
+                                        cur_idle, target_idle);
+       }
+
+       /* do not show steal time inside ve */
+       cur->cpustat[CPUTIME_STEAL] = 0;
+}
+
+static void cpu_cgroup_update_vcpustat(struct cgroup_subsys_state *cpu_css,
+                                      struct cgroup_subsys_state *cpuacct_css)
+{
+       int i, j;
+       int nr_vcpus;
+       int vcpu_rate;
+       ktime_t now;
+       u64 max_usage;
+       struct kernel_cpustat stat_delta, stat_rem;
+       struct task_group *tg = css_tg(cpu_css);
+       int first_pass = 1;
+
+       spin_lock(&tg->vcpustat_lock);
+
+       now = ktime_get();
+       nr_vcpus = tg_nr_cpus(tg);
+       vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
+       if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+               vcpu_rate = MAX_CPU_RATE;
+
+       if (!ktime_to_ns(tg->vcpustat_last_update)) {
+               /* on the first read, initialize vcpu i stats as a sum of stats
+                * over pcpus j such that j % nr_vcpus == i */
+               for (i = 0; i < nr_vcpus; i++) {
+                       for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+                               if (!cpu_possible(j))
+                                       continue;
+                               kernel_cpustat_add(tg->vcpustat + i,
+                                               cpuacct_cpustat(cpuacct_css, j),
+                                               tg->vcpustat + i);
+                       }
+               }
+               goto out_update_last;
+       }
+
+       max_usage = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+       max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
+       /* do not update the stats too often to avoid calculation errors */
+       if (max_usage < 10)
+               goto out_unlock;
+
+       /* temporarily copy per cpu usage delta to tg->cpustat_last */
+       for_each_possible_cpu(i)
+               kernel_cpustat_sub(cpuacct_cpustat(cpuacct_css, i),
+                                  tg->cpustat_last + i,
+                                  tg->cpustat_last + i);
+
+       /* proceed to calculating per vcpu delta */
+       kernel_cpustat_zero(&stat_rem);
+
+again:
+       for (i = 0; i < nr_vcpus; i++) {
+               int exceeds_max;
+
+               kernel_cpustat_zero(&stat_delta);
+               for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+                       if (!cpu_possible(j))
+                               continue;
+                       kernel_cpustat_add(&stat_delta,
+                                          tg->cpustat_last + j, &stat_delta);
+               }
+
+               exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+                             max_usage;
+               /*
+                * On the first pass calculate delta for vcpus with usage >
+                * max_usage in order to accumulate excess in stat_rem.
+                *
+                * Once the remainder is accumulated, proceed to the rest of
+                * vcpus so that it will be distributed among them.
+                */
+               if (exceeds_max != first_pass)
+                       continue;
+
+               fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+               kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+                                  tg->vcpustat + i);
+       }
+
+       if (first_pass) {
+               first_pass = 0;
+               goto again;
+       }
+out_update_last:
+       for_each_possible_cpu(i)
+               tg->cpustat_last[i] = *cpuacct_cpustat(cpuacct_css, i);
+       tg->vcpustat_last_update = now;
+out_unlock:
+       spin_unlock(&tg->vcpustat_lock);
+}
-- 
2.28.0
