Re: [Devel] [PATCH vz8 1/3] arch/x86: introduce cpuid override

2020-11-03 Thread Kirill Tkhai
On 30.10.2020 14:45, Andrey Ryabinin wrote:
> From: Vladimir Davydov 
> 
> Port diff-arch-x86-introduce-cpuid-override
> 
> Recent Intel CPUs dropped CPUID masking, which is required for flex
> migration, in favor of CPUID faulting. So we need to support it in
> the kernel.
> 
> This patch adds user writable file /proc/vz/cpuid_override, which
> contains CPUID override table. Each table entry must have the following
> format:
> 
>   op[ count]: eax ebx ecx edx
> 
> where @op and optional @count define a CPUID function, whose output one
> would like to override (@op and @count are loaded to EAX and ECX
> registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx -
> the desired CPUID output for the specified function. All values must be
> in HEX, 0x prefix is optional.
> 
> Notes:
> 
>  - the file is only present on hosts that support CPUID faulting;
>  - CPUID faulting is always enabled if it is supported;
>  - CPUID output is overridden on all present CPUs;
>  - the maximal number of entries one can override equals 16;
>  - each write(2) to the file removes all existing entries before adding
>new ones, so the whole table must be written in one write(2); in
>particular writing an empty line to the file removes all existing
>rules.
> 
> Example:
> 
> Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP
> (CPUID.80000001H:EDX:27). Then we should execute the following sequence:
> 
>  - get the current cpuid value:
> 
>    # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
>    0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbfebfbff
>    0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x2c100800
> 
>  - clear the feature bits we want to mask out and write the result to
>    /proc/vz/cpuid_override:
> 
>    # cat >/proc/vz/cpuid_override <<EOF
>    0x00000001: 0x000306e4 0x00200800 0x7fbee3ff 0xbbebfbff
>    0x80000001: 0x00000000 0x00000000 0x00000001 0x24100800
>    EOF
> 
>  - check that cpuid output was overridden:
> 
>    # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
>    0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbbebfbff
>    0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x24100800
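
For reference, a rough sketch of the lookup such an override table implies,
reusing the cpuid_override_table / cpuid_override_entry definitions quoted in
the follow-up patches of this thread (the helper name and the exact RCU
handling below are assumptions, not the actual cpuid_fault.c code):

/* Illustrative sketch only, not the real cpuid_fault.c code. */
static bool cpuid_override_match(unsigned int op, unsigned int count,
				 unsigned int *eax, unsigned int *ebx,
				 unsigned int *ecx, unsigned int *edx)
{
	const struct cpuid_override_table *t;
	bool found = false;
	int i;

	rcu_read_lock();
	t = rcu_dereference(cpuid_override);
	for (i = 0; t && i < t->size; i++) {
		const struct cpuid_override_entry *e = &t->entries[i];

		if (e->op != op)
			continue;
		if (e->has_count && e->count != count)
			continue;
		/* Matching entry: report the overridden register values. */
		*eax = e->eax;
		*ebx = e->ebx;
		*ecx = e->ecx;
		*edx = e->edx;
		found = true;
		break;
	}
	rcu_read_unlock();
	return found;
}

A leaf with no matching entry simply falls back to the CPU's real CPUID
output.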
> 
> https://jira.sw.ru/browse/PSBM-28682
> 
> Signed-off-by: Vladimir Davydov 
> 
> Acked-by: Cyrill Gorcunov 
> =
> 
> https://jira.sw.ru/browse/PSBM-33638
> 
> Signed-off-by: Vladimir Davydov 
> Rebase:
> Signed-off-by: Kirill Tkhai 
> 
> https://jira.sw.ru/browse/PSBM-121823
> [aryabinin: vz8 rebase]
> Signed-off-by: Andrey Ryabinin 

For the series:

Reviewed-by: Kirill Tkhai 

> ---
>  arch/x86/include/asm/msr-index.h |   1 +
>  arch/x86/include/asm/traps.h |   2 +
>  arch/x86/kernel/Makefile |   1 +
>  arch/x86/kernel/cpu/proc.c   |   4 +
>  arch/x86/kernel/cpuid_fault.c| 258 +++
>  arch/x86/kernel/traps.c  |  24 +++
>  6 files changed, 290 insertions(+)
>  create mode 100644 arch/x86/kernel/cpuid_fault.c
> 
> diff --git a/arch/x86/include/asm/msr-index.h 
> b/arch/x86/include/asm/msr-index.h
> index 6a21c227775c..9668ec6a064d 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -114,6 +114,7 @@
>  
>  #define MSR_IA32_BBL_CR_CTL		0x00000119
>  #define MSR_IA32_BBL_CR_CTL3		0x0000011e
> +#define MSR_MISC_FEATURES_ENABLES	0x00000140
>  
>  #define MSR_IA32_TSX_CTRL		0x00000122
>  #define TSX_CTRL_RTM_DISABLE		BIT(0)	/* Disable RTM feature */
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 0ae298ea01a1..0282c81719e7 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -124,6 +124,8 @@ void __noreturn handle_stack_overflow(const char *message,
> unsigned long fault_address);
>  #endif
>  
> +void do_cpuid_fault(struct pt_regs *);
> +
>  /* Interrupts/Exceptions */
>  enum {
>   X86_TRAP_DE = 0,/*  0, Divide-by-zero */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 431d8c6e641d..b9451b653b04 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -63,6 +63,7 @@ obj-y   += pci-iommu_table.o
>  obj-y+= resource.o
>  obj-y+= irqflags.o
>  obj-y+= spec_ctrl.o
> +obj-y+= cpuid_fault.o
>  
>  obj-y+= process.o
>  obj-y+= fpu/
> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
> index 2c8522a39ed5..d6b17a60acf6 100644
> --- a/arch/x86/kernel/cpu/proc.c
> +++ b/arch/x86/kernel/cpu/proc.c
> @@ -54,6 +54,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct 
> cpuinfo_x86 *c)
>  }
>  #endif

Re: [Devel] [PATCH vz8 v2 1/2] x86, cpuinfo: Fix race on parallel /proc/cpuinfo read

2020-11-03 Thread Kirill Tkhai
On 02.11.2020 20:13, Andrey Ryabinin wrote:
> If several threads read /proc/cpuinfo, some can see in 'flags' the
> values from c->x86_capability taken before __do_cpuid_fault() is
> called and the masks are applied. Fix this by forming 'flags' on the
> stack first and copying it into per_cpu(cpu_flags, cpu) as the last step.
> 
> https://jira.sw.ru/browse/PSBM-121823
> Signed-off-by: Andrey Ryabinin 
> ---
> Changes since v1:
>  - none
>  
>  arch/x86/kernel/cpu/proc.c | 17 +
>  1 file changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
> index 4fe1577d5e6f..4cc2951e34fb 100644
> --- a/arch/x86/kernel/cpu/proc.c
> +++ b/arch/x86/kernel/cpu/proc.c
> @@ -69,11 +69,11 @@ static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
>  static void init_cpu_flags(void *dummy)
>  {
>   int cpu = smp_processor_id();
> - struct cpu_flags *flags = &per_cpu(cpu_flags, cpu);
> + struct cpu_flags flags;
>   struct cpuinfo_x86 *c = &cpu_data(cpu);
>   unsigned int eax, ebx, ecx, edx;
>  
> - memcpy(flags->val, c->x86_capability, NCAPINTS * sizeof(u32));
> + memcpy(&flags, c->x86_capability, sizeof(flags));
>  
>   /*
>* Clear feature bits masked using cpuid masking/faulting.
> @@ -81,26 +81,27 @@ static void init_cpu_flags(void *dummy)
>  
>   if (c->cpuid_level >= 0x0001) {
>   __do_cpuid_fault(0x0001, 0, &eax, &ebx, &ecx, &edx);
> - flags->val[4] &= ecx;
> - flags->val[0] &= edx;
> + flags.val[4] &= ecx;
> + flags.val[0] &= edx;
>   }
>  
>   if (c->cpuid_level >= 0x0007) {
>   __do_cpuid_fault(0x0007, 0, &eax, &ebx, &ecx, &edx);
> - flags->val[9] &= ebx;
> + flags.val[9] &= ebx;
>   }
>  
>   if ((c->extended_cpuid_level & 0x) == 0x8000 &&
>   c->extended_cpuid_level >= 0x8001) {
>   __do_cpuid_fault(0x8001, 0, &eax, &ebx, &ecx, &edx);
> - flags->val[6] &= ecx;
> - flags->val[1] &= edx;
> + flags.val[6] &= ecx;
> + flags.val[1] &= edx;
>   }
>  
>   if (c->cpuid_level >= 0x000d) {
>   __do_cpuid_fault(0x000d, 1, &eax, &ebx, &ecx, &edx);
> - flags->val[10] &= eax;
> + flags.val[10] &= eax;
>   }
> + memcpy(&per_cpu(cpu_flags, cpu), &flags, sizeof(flags));

This is still racy, since memcpy() is not atomic. Maybe we should add some lock 
on top of this?
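
One way the suggested locking could look is a per-cpu seqcount taken around
the final copy, so that readers retry if they race with an update. This is
only a sketch under that assumption; the seqcount and both helpers below are
hypothetical, not existing code:

/* Illustrative sketch of the suggested locking, not a merged patch. */
#include <linux/seqlock.h>

static DEFINE_PER_CPU(seqcount_t, cpu_flags_seq);

static void publish_cpu_flags(int cpu, const struct cpu_flags *src)
{
	seqcount_t *seq = &per_cpu(cpu_flags_seq, cpu);

	/* Runs in IPI context on @cpu, so per-cpu writers are serialized. */
	write_seqcount_begin(seq);
	memcpy(&per_cpu(cpu_flags, cpu), src, sizeof(*src));
	write_seqcount_end(seq);
}

static void read_cpu_flags(int cpu, struct cpu_flags *dst)
{
	seqcount_t *seq = &per_cpu(cpu_flags_seq, cpu);
	unsigned int start;

	do {
		start = read_seqcount_begin(seq);
		memcpy(dst, &per_cpu(cpu_flags, cpu), sizeof(*dst));
	} while (read_seqcount_retry(seq, start));
}

show_cpuinfo() would then read through read_cpu_flags() instead of peeking at
per_cpu(cpu_flags, cpu) directly.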

>  }
>  
>  static int show_cpuinfo(struct seq_file *m, void *v)
> 



Re: [Devel] [PATCH vz8 v2 2/2] x86: don't enable cpuid faults if /proc/vz/cpuid_override unused

2020-11-03 Thread Kirill Tkhai
On 02.11.2020 20:13, Andrey Ryabinin wrote:
> We don't need to enable cpuid faults if /proc/vz/cpuid_override
> was never used. If a task was attached to a ve before a write to
> 'cpuid_override', it will not get cpuid faults now. That shouldn't
> be a problem, since proper use of 'cpuid_override' requires
> stopping all containers.
> 
> https://jira.sw.ru/browse/PSBM-121823
> Signed-off-by: Andrey Ryabinin 

Reviewed-by: Kirill Tkhai 

> ---
> 
> Changes since v1:
>  - git add include/linux/cpuid_override.h 
>  
>  arch/x86/kernel/cpuid_fault.c  | 21 ++---
>  include/linux/cpuid_override.h | 30 ++
>  kernel/ve/ve.c |  5 -
>  3 files changed, 36 insertions(+), 20 deletions(-)
>  create mode 100644 include/linux/cpuid_override.h
> 
> diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
> index 1e8ffacc4412..cb6c2216fa8a 100644
> --- a/arch/x86/kernel/cpuid_fault.c
> +++ b/arch/x86/kernel/cpuid_fault.c
> @@ -1,3 +1,4 @@
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -9,25 +10,7 @@
>  #include 
>  #include 
>  
> -struct cpuid_override_entry {
> - unsigned int op;
> - unsigned int count;
> - bool has_count;
> - unsigned int eax;
> - unsigned int ebx;
> - unsigned int ecx;
> - unsigned int edx;
> -};
> -
> -#define MAX_CPUID_OVERRIDE_ENTRIES   16
> -
> -struct cpuid_override_table {
> - struct rcu_head rcu_head;
> - int size;
> - struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
> -};
> -
> -static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
> +struct cpuid_override_table __rcu *cpuid_override __read_mostly;
>  static DEFINE_SPINLOCK(cpuid_override_lock);
>  
>  static void cpuid_override_update(struct cpuid_override_table *new_table)
> diff --git a/include/linux/cpuid_override.h b/include/linux/cpuid_override.h
> new file mode 100644
> index ..ea0fa7af3d3c
> --- /dev/null
> +++ b/include/linux/cpuid_override.h
> @@ -0,0 +1,30 @@
> +#ifndef __CPUID_OVERRIDE_H
> +#define __CPUID_OVERRIDE_H
> +
> +#include 
> +
> +struct cpuid_override_entry {
> + unsigned int op;
> + unsigned int count;
> + bool has_count;
> + unsigned int eax;
> + unsigned int ebx;
> + unsigned int ecx;
> + unsigned int edx;
> +};
> +
> +#define MAX_CPUID_OVERRIDE_ENTRIES   16
> +
> +struct cpuid_override_table {
> + struct rcu_head rcu_head;
> + int size;
> + struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
> +};
> +
> +extern struct cpuid_override_table __rcu *cpuid_override;
> +
> +static inline bool cpuid_override_on(void)
> +{
> + return rcu_access_pointer(cpuid_override);
> +}
> +#endif
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index aad8ce69ca1f..0d4d0ab70369 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -9,6 +9,7 @@
>   * 've.c' helper file performing VE sub-system initialization
>   */
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -801,6 +802,7 @@ static void ve_attach(struct cgroup_taskset *tset)
>  {
>   struct cgroup_subsys_state *css;
>   struct task_struct *task;
> + extern struct cpuid_override_table __rcu *cpuid_override;
>  
>   cgroup_taskset_for_each(task, css, tset) {
>   struct ve_struct *ve = css_to_ve(css);
> @@ -816,7 +818,8 @@ static void ve_attach(struct cgroup_taskset *tset)
>   /* Leave parent exec domain */
>   task->parent_exec_id--;
>  
> - set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
> + if (cpuid_override_on())
> + set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
>   task->task_ve = ve;
>   }
>  }
> 



[Devel] [PATCH RHEL8 COMMIT] kernel/stat: Introduce kernel_cpustat operation wrappers

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit 851926bdeb67516d0673e8582accc6a396c716ee
Author: Konstantin Khorenko 
Date:   Wed Oct 28 14:08:19 2020 +0300

kernel/stat: Introduce kernel_cpustat operation wrappers

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 include/linux/kernel_stat.h | 36 
 1 file changed, 36 insertions(+)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 7ee2bb43b251..47b75b4be3d5 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -35,6 +35,42 @@ struct kernel_cpustat {
u64 cpustat[NR_STATS];
 };
 
+static inline u64 kernel_cpustat_total_usage(const struct kernel_cpustat *p)
+{
+   return p->cpustat[CPUTIME_USER] + p->cpustat[CPUTIME_NICE] +
+  p->cpustat[CPUTIME_SYSTEM];
+}
+
+static inline u64 kernel_cpustat_total_idle(const struct kernel_cpustat *p)
+{
+   return p->cpustat[CPUTIME_IDLE] + p->cpustat[CPUTIME_IOWAIT];
+}
+
+static inline void kernel_cpustat_zero(struct kernel_cpustat *p)
+{
+   memset(p, 0, sizeof(*p));
+}
+
+static inline void kernel_cpustat_add(const struct kernel_cpustat *lhs,
+ const struct kernel_cpustat *rhs,
+ struct kernel_cpustat *res)
+{
+   int i;
+
+   for (i = 0; i < NR_STATS; i++)
+   res->cpustat[i] = lhs->cpustat[i] + rhs->cpustat[i];
+}
+
+static inline void kernel_cpustat_sub(const struct kernel_cpustat *lhs,
+ const struct kernel_cpustat *rhs,
+ struct kernel_cpustat *res)
+{
+   int i;
+
+   for (i = 0; i < NR_STATS; i++)
+   res->cpustat[i] = lhs->cpustat[i] - rhs->cpustat[i];
+}
+
 struct kernel_stat {
unsigned long irqs_sum;
unsigned int softirqs[NR_SOFTIRQS];
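
As a usage illustration of the wrappers above (the caller below is
hypothetical and not part of the patch), a delta between two snapshots
combines kernel_cpustat_sub() with kernel_cpustat_total_usage():

/* Hypothetical caller, shown only to illustrate the wrappers. */
static u64 kernel_cpustat_busy_delta(const struct kernel_cpustat *now,
				     const struct kernel_cpustat *prev)
{
	struct kernel_cpustat delta;

	kernel_cpustat_sub(now, prev, &delta);		/* delta = now - prev */
	return kernel_cpustat_total_usage(&delta);	/* user + nice + system */
}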


[Devel] [PATCH RHEL8 COMMIT] ve/cgroup: export cgroup_get_ve_root1() + cleanup

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit ddd749df382cda3fc10fa5c1ab5aecfe872b21bc
Author: Konstantin Khorenko 
Date:   Fri Oct 23 19:12:55 2020 +0300

ve/cgroup: export cgroup_get_ve_root1() + cleanup

Will be used by later patches.

Fixes: 7afc6e2134c3 ("ve/cgroups: Introduce subgroups_limit control")

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 include/linux/ve.h | 6 ++
 kernel/cgroup/cgroup.c | 6 +++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index 2f9204cbd0f4..447a91dbd4d4 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -152,6 +152,8 @@ extern void monotonic_ve_to_abs(clockid_t which_clock, 
struct timespec64 *tp);
 extern bool current_user_ns_initial(void);
 struct user_namespace *ve_init_user_ns(void);
 
+extern struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp);
+
 #define ve_uevent_seqnum   (get_exec_env()->_uevent_seqnum)
 
 extern int vz_security_family_check(struct net *net, int family);
@@ -178,6 +180,10 @@ static inline struct user_namespace *ve_init_user_ns(void)
return &init_user_ns;
 }
 
+static inline struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp)
+{
+   return NULL;
+}
 #define ve_uevent_seqnum uevent_seqnum
 
 static inline int vz_security_family_check(struct net *net, int family) { 
return 0; }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4ee3eb24b0d1..97bf375ae334 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1901,7 +1901,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
spin_unlock_irq(&css_set_lock);
 }
 
-static struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp)
+struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp)
 {
struct cgroup *ve_root = NULL;
 
@@ -1913,9 +1913,9 @@ static struct cgroup *cgroup_get_ve_root1(struct cgroup 
*cgrp)
}
cgrp = cgroup_parent(cgrp);
} while (cgrp);
-rcu_read_unlock();
+   rcu_read_unlock();
 
-return ve_root;
+   return ve_root;
 }
 
 static bool subgroup_limit_reached(struct cgroup *cgroup)


[Devel] [PATCH RHEL8 COMMIT] ve/sched/stat: Introduce functions to calculate vcpustat data

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit 82f2b4c771019330ed36773c00a77acb70f38204
Author: Konstantin Khorenko 
Date:   Wed Oct 28 15:26:59 2020 +0300

ve/sched/stat: Introduce functions to calculate vcpustat data

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 kernel/sched/core.c|   2 +-
 kernel/sched/cpuacct.c | 373 +
 2 files changed, 374 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88bc46d163b3..e381085eb771 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6569,7 +6569,7 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, tsk, &rf);
 }
 
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
return css ? container_of(css, struct task_group, css) : NULL;
 }
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..aafaee1f0722 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@
  * Based on the work by Paul Menage (men...@google.com) and Balbir Singh
  * (bal...@in.ibm.com).
  */
+#include 
 #include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -373,3 +374,375 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
.legacy_cftypes = files,
.early_init = true,
 };
+
+extern inline struct task_group *css_tg(struct cgroup_subsys_state *css);
+
+static struct task_group *ve_root_tg(struct task_group *tg) {
+   struct cgroup *cg;
+
+   if (!tg)
+   return NULL;
+
+   cg = cgroup_get_ve_root1(tg->css.cgroup);
+   return cg ? css_tg(&cg->self) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+   unsigned int cpu_rate = 0;
+#ifdef CONFIG_CFS_CPULIMIT
+   tg = ve_root_tg(tg);
+   if (tg)
+   cpu_rate = tg->cpu_rate;
+#endif
+   return cpu_rate;
+}
+
+static unsigned int tg_nr_cpus(struct task_group *tg)
+{
+   unsigned int nr_cpus = 0;
+   unsigned int max_nr_cpus = num_online_cpus();
+
+#ifdef CONFIG_CFS_CPULIMIT
+   tg = ve_root_tg(tg);
+   if (tg)
+   nr_cpus = tg->nr_cpus;
+#endif
+   if (!nr_cpus || nr_cpus > max_nr_cpus)
+   nr_cpus = max_nr_cpus;
+
+   return nr_cpus;
+}
+
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup_subsys_state *css, int 
cpu)
+{
+   return per_cpu_ptr(css_ca(css)->cpustat, cpu);
+}
+
+static void cpu_cgroup_update_stat(struct cgroup_subsys_state *cpu_css,
+  struct cgroup_subsys_state *cpuacct_css,
+  int i)
+{
+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
+   struct task_group *tg = css_tg(cpu_css);
+   struct sched_entity *se = tg->se[i];
+   u64 *cpustat = cpuacct_cpustat(cpuacct_css, i)->cpustat;
+   u64 now = cpu_clock(i);
+   u64 delta, idle, iowait, steal;
+
+   /* root_task_group has not sched entities */
+   if (tg == &root_task_group)
+   return;
+
+   iowait = se->statistics.iowait_sum;
+   idle = se->statistics.sum_sleep_runtime;
+   steal = se->statistics.wait_sum;
+
+   if (idle > iowait)
+   idle -= iowait;
+   else
+   idle = 0;
+
+   if (se->statistics.sleep_start) {
+   delta = now - se->statistics.sleep_start;
+   if ((s64)delta > 0)
+   idle += delta;
+   } else if (se->statistics.block_start) {
+   delta = now - se->statistics.block_start;
+   if ((s64)delta > 0)
+   iowait += delta;
+   } else if (se->statistics.wait_start) {
+   delta = now - se->statistics.wait_start;
+   if ((s64)delta > 0)
+   steal += delta;
+   }
+
+   cpustat[CPUTIME_IDLE]   = max(cpustat[CPUTIME_IDLE], idle);
+   cpustat[CPUTIME_IOWAIT] = max(cpustat[CPUTIME_IOWAIT], iowait);
+   cpustat[CPUTIME_STEAL]  = steal;
+#endif
+}
+
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+  struct kernel_cpustat *rem, int ind,
+  u64 cur_usage, u64 target_usage,
+  u64 rem_usage)
+{
+   s64 scaled_val;
+   u32 scale_pct = 0;
+
+   /* distribute the delta among USER, NICE, and SYSTEM proportionally */
+   if (cur_usage < target_usage) {
+   if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+   scale_pct = div64_u64(100 * rem->cpustat[ind],
+ rem_usage);
+   } else {
+   if ((s64)cur_usage > 0) /* sanity c

[Devel] [PATCH RHEL8 COMMIT] ve/sched/stat: Add basic infrastructure for vcpu statistics

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit 679f623fb05fec8e3c8316794a9fc7a081acaa92
Author: Konstantin Khorenko 
Date:   Wed Oct 28 14:26:14 2020 +0300

ve/sched/stat: Add basic infrastructure for vcpu statistics

Container might have a limit on CPUs in its config.
Previously we had a fair vcpu primitive and a fair two-level
cpu scheduler.

Nowadays we do not have fair vcpus for Containers, but we still have
to emulate this behavior.
The most important part here is to provide correct statistics:
they should be collected across all cpus in the system and spread
equally among "vcpus" in the output.

The current patch introduces vcpu related necessary fields in
task_group and their allocation/deallocation.

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 kernel/sched/core.c  | 15 +++
 kernel/sched/sched.h |  5 +
 2 files changed, 20 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0116742de578..88bc46d163b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6398,6 +6398,8 @@ static void sched_free_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
+   kvfree(tg->cpustat_last);
+   kvfree(tg->vcpustat);
kmem_cache_free(task_group_cache, tg);
 }
 
@@ -6416,6 +6418,19 @@ struct task_group *sched_create_group(struct task_group 
*parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
 
+   tg->cpustat_last = kvzalloc(nr_cpu_ids * sizeof(struct kernel_cpustat),
+   GFP_KERNEL);
+   if (!tg->cpustat_last)
+   goto err;
+
+   tg->vcpustat = kvzalloc(nr_cpu_ids * sizeof(struct kernel_cpustat),
+   GFP_KERNEL);
+   if (!tg->vcpustat)
+   goto err;
+
+   tg->vcpustat_last_update = 0;
+   spin_lock_init(&tg->vcpustat_lock);
+
/* start_timespec is saved CT0 uptime */
tg->start_time = ktime_get_boot_ns();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f1e5ba43910..d8331e5b4c4f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -412,6 +412,11 @@ struct task_group {
/* Monotonic time in nsecs: */
u64 start_time;
 
+   struct kernel_cpustat   *cpustat_last;
+   struct kernel_cpustat   *vcpustat;
+   u64 vcpustat_last_update;
+   spinlock_t  vcpustat_lock;
+
struct cfs_bandwidthcfs_bandwidth;
 #ifdef CONFIG_CFS_CPULIMIT
 #define MAX_CPU_RATE 1024


[Devel] [PATCH RHEL8 COMMIT] ve/proc/stat: Wire virtualized /proc/stat handler

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit c5852f7b4a086208cb39c1f0762041f94a942910
Author: Konstantin Khorenko 
Date:   Wed Oct 28 15:43:19 2020 +0300

ve/proc/stat: Wire virtualized /proc/stat handler

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 fs/proc/stat.c | 10 ++
 include/linux/ve.h |  2 ++
 kernel/ve/ve.c | 17 +
 3 files changed, 29 insertions(+)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index dfcd5280a7c6..937790fd02cf 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifndef arch_irq_stat_cpu
 #define arch_irq_stat_cpu(cpu) 0
@@ -113,6 +114,15 @@ static int show_stat(struct seq_file *p, void *v)
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec64 boottime;
+   struct ve_struct *ve;
+
+   ve = get_exec_env();
+   if (!ve_is_super(ve)) {
+   int ret;
+   ret = ve_show_cpu_stat(ve, p);
+   if (ret != -ENOSYS)
+   return ret;
+   }
 
user = nice = system = idle = iowait =
irq = softirq = steal = 0;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 447a91dbd4d4..d88e4715a222 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -199,8 +199,10 @@ static inline void monotonic_ve_to_abs(clockid_t 
which_clock,
 struct seq_file;
 
 #if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
+int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
 int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
 #else
+static inline int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p) { 
return -ENOSYS; }
 static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { 
return -ENOSYS; }
 #endif
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 2cacb673c0f3..eeb1947a7d53 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -1395,6 +1395,23 @@ int vz_security_protocol_check(struct net *net, int 
protocol)
 EXPORT_SYMBOL_GPL(vz_security_protocol_check);
 
 #ifdef CONFIG_CGROUP_SCHED
+int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
+struct cgroup_subsys_state *cpuacct_css,
+struct seq_file *p);
+
+int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p)
+{
+   struct cgroup_subsys_state *cpu_css, *cpuacct_css;
+   int err;
+
+   cpu_css = ve_get_init_css(ve, cpu_cgrp_id);
+   cpuacct_css = ve_get_init_css(ve, cpuacct_cgrp_id);
+   err = cpu_cgroup_proc_stat(cpu_css, cpuacct_css, p);
+   css_put(cpuacct_css);
+   css_put(cpu_css);
+   return err;
+}
+
 int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css,
struct seq_file *p);
 


[Devel] [PATCH RHEL8 COMMIT] sched/stat: account ctxsw per task group

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit b8f0da164311bd1acf6b78e3cbaa73ab8c2875f3
Author: Vladimir Davydov 
Date:   Thu Mar 14 21:00:44 2013 +0400

sched/stat: account ctxsw per task group

This is a backport of diff-sched-account-ctxsw-per-task-group:

 Subject: sched: account ctxsw per task group
 Date: Fri, 28 Dec 2012 15:09:45 +0400

* [sched] the number of context switches should be reported correctly
inside a CT in /proc/stat (PSBM-18113)

For /proc/stat:ctxt to be correct inside containers.

https://jira.sw.ru/browse/PSBM-18113

Signed-off-by: Vladimir Davydov 

(cherry picked from vz7 commit d388f0bf64adb74cd62c4deff58e181bd63d62ac)
Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 kernel/sched/cpuacct.c |  4 +++-
 kernel/sched/fair.c| 14 --
 kernel/sched/sched.h   |  3 +++
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index aafaee1f0722..8756560d0b4f 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -647,6 +647,7 @@ int cpu_cgroup_proc_stat(struct cgroup_subsys_state 
*cpu_css,
struct kernel_cpustat *kcpustat;
unsigned long tg_nr_running = 0;
unsigned long tg_nr_iowait = 0;
+   unsigned long long tg_nr_switches = 0;
 
getboottime64(&boottime);
 
@@ -665,6 +666,7 @@ int cpu_cgroup_proc_stat(struct cgroup_subsys_state 
*cpu_css,
 #ifdef CONFIG_FAIR_GROUP_SCHED
tg_nr_running += tg->cfs_rq[i]->h_nr_running;
tg_nr_iowait  += tg->cfs_rq[i]->nr_iowait;
+   tg_nr_switches += tg->cfs_rq[i]->nr_switches;
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
tg_nr_running += tg->rt_rq[i]->rt_nr_running;
@@ -738,7 +740,7 @@ int cpu_cgroup_proc_stat(struct cgroup_subsys_state 
*cpu_css,
   "processes %lu\n"
   "procs_running %lu\n"
   "procs_blocked %lu\n",
-  nr_context_switches(),
+  tg_nr_switches,
   (unsigned long long)boot_sec,
   total_forks,
   tg_nr_running,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6546d8511417..0b9bb108625a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4153,6 +4153,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
 
clear_buddies(cfs_rq, se);
 
+   if (cfs_rq->prev == se)
+   cfs_rq->prev = NULL;
+
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
@@ -4167,8 +4170,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se, int flags)
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
 
-   /* return excess runtime on last dequeue */
-   return_cfs_rq_runtime(cfs_rq);
+   if (!cfs_rq->nr_running) {
+   /* return excess runtime on last dequeue */
+   return_cfs_rq_runtime(cfs_rq);
+   /* account switch to idle task */
+   cfs_rq->nr_switches++;
+   }
 
update_cfs_group(se);
 
@@ -4242,6 +4249,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
+   if (cfs_rq->prev != se)
+   cfs_rq->nr_switches++;
 
/*
 * Track our maximum slice length, if the CPU's load is at
@@ -4341,6 +4350,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *prev)
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
+   cfs_rq->prev = prev;
}
cfs_rq->curr = NULL;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d8331e5b4c4f..3d55b45f1ea6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -542,6 +542,9 @@ struct cfs_rq {
struct sched_entity *next;
struct sched_entity *last;
struct sched_entity *skip;
+   struct sched_entity *prev;
+
+   u64 nr_switches;
 
 #ifdef CONFIG_SCHED_DEBUG
unsigned intnr_spread_over;


[Devel] [PATCH RHEL8 COMMIT] sched: Fix task_group "iowait_sum" statistic accounting

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit 56c2e80177276c8044704140d098d206edfd2694
Author: Konstantin Khorenko 
Date:   Wed Oct 28 15:47:31 2020 +0300

sched: Fix task_group "iowait_sum" statistic accounting

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23a2f2452474..6546d8511417 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1053,7 +1053,8 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, 
struct sched_entity *se)
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
-   }
+   } else
+   __schedstat_add(se->statistics.iowait_sum, delta);
}
 }
 


[Devel] [PATCH RHEL8 COMMIT] ve/sched/stat: Introduce handler for getting CT cpu statistics

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit a890bf5d21b8e99fef319ce72cfdfb1416e5138b
Author: Konstantin Khorenko 
Date:   Fri Oct 30 12:26:34 2020 +0300

ve/sched/stat: Introduce handler for getting CT cpu statistics

It will be used later in
  * idle cpu stat virtualization in /proc/loadavg
  * /proc/vz/vestat output
  * VZCTL_GET_CPU_STAT ioctl

The patch is based on following vz7 commits:
  ecdce58b214c ("sched: Export per task_group statistics_work")
  75fc174adc36 ("sched: Port cpustat related patches")
  a58fb58bff1c ("Use ve init task's css instead of opening cgroup via vfs")

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 include/linux/ve.h |  2 ++
 kernel/sched/cpuacct.c | 24 
 kernel/ve/ve.c | 18 ++
 3 files changed, 44 insertions(+)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index d88e4715a222..656ee43e383e 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -201,9 +201,11 @@ struct seq_file;
 #if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
 int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
 int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
+int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat);
 #else
 static inline int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p) { 
return -ENOSYS; }
 static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { 
return -ENOSYS; }
+static inline int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat 
*kstat) { return -ENOSYS; }
 #endif
 
 #endif /* _LINUX_VE_H */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 8756560d0b4f..ed59607f2157 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -748,3 +748,27 @@ int cpu_cgroup_proc_stat(struct cgroup_subsys_state 
*cpu_css,
 
return 0;
 }
+
+int cpu_cgroup_get_stat(struct cgroup_subsys_state *cpu_css,
+   struct cgroup_subsys_state *cpuacct_css,
+   struct kernel_cpustat *kstat)
+{
+   struct task_group *tg = css_tg(cpu_css);
+   int nr_vcpus = tg_nr_cpus(tg);
+   int i;
+
+   kernel_cpustat_zero(kstat);
+
+   if (tg == &root_task_group)
+   return -ENOENT;
+
+   for_each_possible_cpu(i)
+   cpu_cgroup_update_stat(cpu_css, cpuacct_css, i);
+
+   cpu_cgroup_update_vcpustat(cpu_css, cpuacct_css);
+
+   for (i = 0; i < nr_vcpus; i++)
+   kernel_cpustat_add(tg->vcpustat + i, kstat, kstat);
+
+   return 0;
+}
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index eeb1947a7d53..10cebe10beab 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -1425,4 +1425,22 @@ int ve_show_loadavg(struct ve_struct *ve, struct 
seq_file *p)
css_put(css);
return err;
 }
+
+int cpu_cgroup_get_stat(struct cgroup_subsys_state *cpu_css,
+   struct cgroup_subsys_state *cpuacct_css,
+   struct kernel_cpustat *kstat);
+
+int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat)
+{
+   struct cgroup_subsys_state *cpu_css, *cpuacct_css;
+   int err;
+
+   cpu_css = ve_get_init_css(ve, cpu_cgrp_id);
+   cpuacct_css = ve_get_init_css(ve, cpuacct_cgrp_id);
+   err = cpu_cgroup_get_stat(cpu_css, cpuacct_css, kstat);
+   css_put(cpuacct_css);
+   css_put(cpu_css);
+   return err;
+}
+EXPORT_SYMBOL(ve_get_cpu_stat);
 #endif /* CONFIG_CGROUP_SCHED */


[Devel] [PATCH RHEL8 COMMIT] ve/vestat: Introduce /proc/vz/vestat

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit 9644a237d401cdf127046eda0f4d71107e341008
Author: Konstantin Khorenko 
Date:   Fri Oct 30 16:03:05 2020 +0300

ve/vestat: Introduce /proc/vz/vestat

The patch is based on following vz7 commits:

  f997bf6c613a ("ve: initial patch")
  75fc174adc36 ("sched: Port cpustat related patches")
  09e1cb4a7d4d ("ve/proc: restricted proc-entries scope")
  a58fb58bff1c ("Use ve init task's css instead of opening cgroup via vfs")

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 kernel/ve/vecalls.c | 98 +
 1 file changed, 98 insertions(+)

diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
index 78773c21b8db..3258b49b15b2 100644
--- a/kernel/ve/vecalls.c
+++ b/kernel/ve/vecalls.c
@@ -30,6 +30,11 @@
 #include 
 #include 
 
+static u64 ve_get_uptime(struct ve_struct *ve)
+{
+   return ktime_get_boot_ns() - ve->real_start_time;
+}
+
 /**
  **
  *
@@ -38,6 +43,74 @@
  **
  **/
 #ifdef CONFIG_PROC_FS
+#if BITS_PER_LONG == 32
+#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21)
+#define VESTAT_LINE_FMT "%10s %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu 
%20Lu %20Lu %10lu\n"
+#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s 
%20s %10s\n"
+#else
+#define VESTAT_LINE_WIDTH (12 * 21)
+#define VESTAT_LINE_FMT "%20s %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu 
%20Lu %20Lu %20lu\n"
+#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s 
%20s %20s\n"
+#endif
+
+static int vestat_seq_show(struct seq_file *m, void *v)
+{
+   struct list_head *entry;
+   struct ve_struct *ve;
+   struct ve_struct *curve;
+   int ret;
+   unsigned long user_ve, nice_ve, system_ve;
+   unsigned long long uptime;
+   u64 uptime_cycles, idle_time, strv_time, used;
+   struct kernel_cpustat kstat;
+
+   entry = (struct list_head *)v;
+   ve = list_entry(entry, struct ve_struct, ve_list);
+
+   curve = get_exec_env();
+   if (entry == ve_list_head.next ||
+   (!ve_is_super(curve) && ve == curve)) {
+   /* print header */
+   seq_printf(m, "%-*s\n",
+  VESTAT_LINE_WIDTH - 1,
+  "Version: 2.2");
+   seq_printf(m, VESTAT_HEAD_FMT, "VEID",
+  "user", "nice", "system",
+  "uptime", "idle",
+  "strv", "uptime", "used",
+  "maxlat", "totlat", "numsched");
+   }
+
+   if (ve == get_ve0())
+   return 0;
+
+   ret = ve_get_cpu_stat(ve, &kstat);
+   if (ret)
+   return ret;
+
+   strv_time   = 0;
+   user_ve = nsecs_to_jiffies(kstat.cpustat[CPUTIME_USER]);
+   nice_ve = nsecs_to_jiffies(kstat.cpustat[CPUTIME_NICE]);
+   system_ve   = nsecs_to_jiffies(kstat.cpustat[CPUTIME_SYSTEM]);
+   used= kstat.cpustat[CPUTIME_USED];
+   idle_time   = kstat.cpustat[CPUTIME_IDLE];
+
+   uptime_cycles = ve_get_uptime(ve);
+   uptime = get_jiffies_64() - ve->start_jiffies;
+
+   seq_printf(m, VESTAT_LINE_FMT, ve_name(ve),
+  user_ve, nice_ve, system_ve,
+  (unsigned long long)uptime,
+  (unsigned long long)idle_time,
+  (unsigned long long)strv_time,
+  (unsigned long long)uptime_cycles,
+  (unsigned long long)used,
+  (unsigned long long)ve->sched_lat_ve.last.maxlat,
+  (unsigned long long)ve->sched_lat_ve.last.totlat,
+  ve->sched_lat_ve.last.count);
+   return 0;
+}
+
 static void *ve_seq_start(struct seq_file *m, loff_t *pos)
 {
struct ve_struct *curve;
@@ -66,6 +139,25 @@ static void ve_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&ve_list_lock);
 }
 
+static struct seq_operations vestat_seq_op = {
+   .start  = ve_seq_start,
+   .next   = ve_seq_next,
+   .stop   = ve_seq_stop,
+   .show   = vestat_seq_show
+};
+
+static int vestat_open(struct inode *inode, struct file *file)
+{
+   return seq_open(file, &vestat_seq_op);
+}
+
+static struct file_operations proc_vestat_operations = {
+   .open   = vestat_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release = seq_release
+};
+
 static int devperms_seq_show(struct seq_file *m, void *v)
 {
struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
@@

[Devel] [PATCH RHEL8 COMMIT] ve/proc/stat: Introduce CPUTIME_USED field in cpustat statistic

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit c15bffe2e2cdc0b2c7bd52f5c0dec58524e115e8
Author: Konstantin Khorenko 
Date:   Tue Nov 3 14:51:13 2020 +0300

ve/proc/stat: Introduce CPUTIME_USED field in cpustat statistic

It will be shown later in /proc/vestat file.

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 include/linux/kernel_stat.h | 1 +
 kernel/sched/cpuacct.c  | 6 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 47b75b4be3d5..5a3851b1d771 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ enum cpu_usage_stat {
CPUTIME_IRQ,
CPUTIME_IDLE,
CPUTIME_IOWAIT,
+   CPUTIME_USED,
CPUTIME_STEAL,
CPUTIME_GUEST,
CPUTIME_GUEST_NICE,
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index ed59607f2157..646bbd257110 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -428,7 +428,7 @@ static void cpu_cgroup_update_stat(struct 
cgroup_subsys_state *cpu_css,
struct sched_entity *se = tg->se[i];
u64 *cpustat = cpuacct_cpustat(cpuacct_css, i)->cpustat;
u64 now = cpu_clock(i);
-   u64 delta, idle, iowait, steal;
+   u64 delta, idle, iowait, steal, used;
 
/* root_task_group has not sched entities */
if (tg == &root_task_group)
@@ -437,6 +437,7 @@ static void cpu_cgroup_update_stat(struct 
cgroup_subsys_state *cpu_css,
iowait = se->statistics.iowait_sum;
idle = se->statistics.sum_sleep_runtime;
steal = se->statistics.wait_sum;
+   used = se->sum_exec_runtime;
 
if (idle > iowait)
idle -= iowait;
@@ -460,6 +461,7 @@ static void cpu_cgroup_update_stat(struct 
cgroup_subsys_state *cpu_css,
cpustat[CPUTIME_IDLE]   = max(cpustat[CPUTIME_IDLE], idle);
cpustat[CPUTIME_IOWAIT] = max(cpustat[CPUTIME_IOWAIT], iowait);
cpustat[CPUTIME_STEAL]  = steal;
+   cpustat[CPUTIME_USED]   = used;
 #endif
 }
 
@@ -538,6 +540,8 @@ static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
 cur_idle, target_idle);
}
 
+   cur->cpustat[CPUTIME_USED] = target_usage;
+
/* do not show steal time inside ve */
cur->cpustat[CPUTIME_STEAL] = 0;
 }


[Devel] [PATCH RHEL8 COMMIT] ve/time/stat: idle time virtualization in /proc/loadavg

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit de23bc4787594c1a8a989eb7d0ae3159a9e518bc
Author: Konstantin Khorenko 
Date:   Fri Oct 30 13:37:09 2020 +0300

ve/time/stat: idle time virtualization in /proc/loadavg

The patch is based on following vz7 commits:
  a58fb58bff1c ("Use ve init task's css instead of opening cgroup via vfs")
  75fc174adc36 ("sched: Port cpustat related patches")

Fixes: a3c4d1d8f383 ("ve/time: Customize VE uptime")

TODO: to separate FIXME hunks from a3c4d1d8f383 ("ve/time: Customize VE
uptime") and merge them into this commit

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Andrey Ryabinin 
---
 fs/proc/uptime.c | 27 +++
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index bc07d42ce9f5..dae407953903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -23,37 +23,24 @@ static inline void get_ve0_idle(struct timespec64 *idle)
idle->tv_nsec = rem;
 }
 
-static inline void get_veX_idle(struct timespec *idle, struct cgroup* cgrp)
+static inline void get_veX_idle(struct ve_struct *ve, struct timespec64 *idle)
 {
-#if 0
-FIXME: to be reworked anyway in
-   "Use ve init task's css instead of opening cgroup via vfs"
-
struct kernel_cpustat kstat;
 
-   cpu_cgroup_get_stat(cgrp, &kstat);
-   *idle = ns_to_timespec(kstat.cpustat[CPUTIME_IDLE]);
-#endif
+   ve_get_cpu_stat(ve, &kstat);
+   *idle = ns_to_timespec64(kstat.cpustat[CPUTIME_IDLE]);
 }
 
 static int uptime_proc_show(struct seq_file *m, void *v)
 {
struct timespec uptime, offset;
struct timespec64 idle;
+   struct ve_struct *ve = get_exec_env();
 
-   if (ve_is_super(get_exec_env()))
-   get_ve0_idle(&idle);
-   else {
+   if (ve_is_super(ve))
get_ve0_idle(&idle);
-#if 0
-FIXME:  to be reworked anyway in
-"Use ve init task's css instead of opening cgroup via vfs"
-
-   rcu_read_lock();
-   get_veX_idle(&idle, task_cgroup(current, cpu_cgroup_subsys_id));
-   rcu_read_unlock();
-#endif
-   }
+   else
+   get_veX_idle(ve, &idle);
 
get_monotonic_boottime(&uptime);
 #ifdef CONFIG_VE


Re: [Devel] [PATCH rh8 5/8] ve/proc/stat: Introduce /proc/stat virtualized handler for Containers

2020-11-03 Thread Konstantin Khorenko

has been merged into
[PATCH RHEL8 COMMIT] ve/sched/stat: Introduce functions to calculate vcpustat 
data

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 10/28/2020 06:57 PM, Konstantin Khorenko wrote:

vz8 rebase notes:
  * "swap 0 0" line has been dropped
  * extra empty line between "intr" and "ctxt" has been dropped

Known issues:
 - it's known "procs_blocked" is shown incorrectly inside a CT
TODO: to fix

Signed-off-by: Konstantin Khorenko 

Commit messages of related patches in vz7:
==
sched: use cpuacct->cpustat for showing cpu stats

In contrast to RH6, where tg->cpustat was used, now cpu stats are
accounted in the cpuacct cgroup. So zap tg->cpustat and use
cpuacct->cpustat for showing cpu.proc.stat instead. Fortunately cpu and
cpuacct cgroups are always mounted together (even by systemd by
default), so this will work.

Related to https://jira.sw.ru/browse/PSBM-33642

Signed-off-by: Vladimir Davydov 

+++
Use ve init task's css instead of opening cgroup via vfs

Currently, whenever we need to get cpu or devices cgroup corresponding
to a ve, we open it using cgroup_kernel_open(). This is inflexible,
because it relies on the fact that all container cgroups are located at
a specific location which can never change (at the top level). Since we
want to move container cgroups to machine.slice, we need to rework this.

This patch does the trick. It makes each ve remember its init task at
container start, and use css corresponding to init task whenever we need
to get a corresponding cgroup. Note, that after this patch is applied,
we don't need to mount cpu and devices cgroup in kernel.

https://jira.sw.ru/browse/PSBM-48629

Signed-off-by: Vladimir Davydov 

+++
ve/cpustat: don't try to update vcpustats for root_task_group

root_task_group doesn't have vcpu stats. Attempt to update them leads
to NULL-ptr deref:

BUG: unable to handle kernel NULL pointer dereference at   
(null)
IP: [] cpu_cgroup_update_vcpustat+0x13c/0x620
...
Call Trace:
 [] cpu_cgroup_get_stat+0x7b/0x180
 [] ve_get_cpu_stat+0x27/0x70
 [] fill_cpu_stat+0x91/0x1e0 [vzmon]
 [] vzcalls_ioctl+0x2bb/0x430 [vzmon]
 [] vzctl_ioctl+0x45/0x60 [vzdev]
 [] do_vfs_ioctl+0x255/0x4f0
 [] SyS_ioctl+0x54/0xa0
 [] system_call_fastpath+0x16/0x1b

So, return -ENOENT if we asked for vcpu stats of root_task_group.

https://jira.sw.ru/browse/PSBM-48721

Signed-off-by: Andrey Ryabinin 
Reviewed-by: Vladimir Davydov 

+++
sched: Port cpustat related patches

This patch ports:

diff-sched-rework-_proc_stat-output
diff-sched-fix-output-of-vestat-idle
diff-sched-make-allowance-for-vcpu-rate-in-_proc_stat
diff-sched-hide-steal-time-from-inside-CT
diff-sched-cpu.proc.stat-always-count-nr_running-and-co-on-all-cpus

Author: Vladimir Davydov
Email: vdavy...@parallels.com
Subject: sched: rework /proc/stat output
Date: Thu, 29 May 2014 11:40:50 +0400

Initially we mapped usage pct on physical cpu i to vcpu (i % nr_vcpus).
Obviously, if a CT is running on physical cpus equal mod nr_vcpus, we'll
miss usage on one or more vcpus. F.e., if there is a 2-vcpus CT with
several cpu-eaters running on physical cpus 0, 2, we'll get vcpu 0
200%-busy, and vcpu 1 idling.

To fix that, we changed behavior so that vcpu i usage equals total cpu
usage divided by nr_vcpus. That led to customers' dissatisfaction,
because such an algorithm reveals the fake.

So, now we're going to use the first algorithm, but if the usage of one
of vcpus turns out to be greater than abs time delta, we'll "move" the
usage excess to other vcpus, so that one vcpu will never consume more
than one pcpu. F.e., in the situation described above, we'll move 100%
of vcpu 0 time to vcpu 1, so that both vcpus will be 100%-busy.

To achieve that, we serialize access to /proc/stat and make readers
update stats based on the pcpu usage delta, so that they can fix up
per-vcpu usage to be <= 100% and calculate idle time accordingly.
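
A simplified sketch of that redistribution idea (illustrative only: the
helper below is made up, and the real cpu_cgroup_update_vcpustat() logic
is more involved):

/*
 * Sketch of the vcpu usage redistribution described above: fold pcpu i
 * into vcpu (i % nr_vcpus), cap each vcpu at the wall-clock delta, and
 * move any excess to vcpus that still have headroom.
 */
static void spread_vcpu_usage(u64 *vusage, int nr_vcpus,
			      const u64 *pusage, int nr_pcpus, u64 max_delta)
{
	u64 excess = 0;
	int i;

	for (i = 0; i < nr_vcpus; i++)
		vusage[i] = 0;
	for (i = 0; i < nr_pcpus; i++)
		vusage[i % nr_vcpus] += pusage[i];

	/* No vcpu may consume more than one pcpu worth of time... */
	for (i = 0; i < nr_vcpus; i++) {
		if (vusage[i] > max_delta) {
			excess += vusage[i] - max_delta;
			vusage[i] = max_delta;
		}
	}

	/* ...so the excess is spread over vcpus that are not yet 100% busy. */
	for (i = 0; i < nr_vcpus && excess; i++) {
		u64 room = max_delta - vusage[i];
		u64 add = min(room, excess);

		vusage[i] += add;
		excess -= add;
	}
}

Idle time for each vcpu is then whatever remains of max_delta after its
(possibly adjusted) usage.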

https://jira.sw.ru/browse/PSBM-26714

Signed-off-by: Vladimir Davydov 

Acked-by: Kirill Tkhai 
=

Author: Vladimir Davydov
Email: vdavy...@parallels.com
Subject: sched: fix output of vestat:idle
Date: Tue, 22 Jul 2014 12:25:24 +0400

/proc/vz/vestat must report virtualized idle time, but since commit
diff-sched-rework-_proc_stat-output it shows total time CTs have been
idling on all physical cpus. This is because in cpu_cgroup_get_stat we
use task_group->cpustat instead of vcpustat. Fix it.

https://jira.sw.ru/browse/PSBM-28403
https://bugzilla.openvz.org/show_bug.cgi?id=3035

Signed-off-by: Vladimir Davydov 

Acked-by: Kirill Tkhai 
=

Author: Vladimir Davydov
Email: vdavy...@parallels.com
Subject: sched: make allowance for vcpu rate in /proc/stat
Date: Wed, 13 Aug 2014 15:44:36 +

[Devel] [PATCH RHEL8 COMMIT] x86: make ARCH_[SET|GET]_CPUID friends with /proc/vz/cpuid_override

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit fd8c92144c52332eb3488642a68f7a426b4dd4af
Author: Andrey Ryabinin 
Date:   Tue Nov 3 16:32:58 2020 +0300

x86: make ARCH_[SET|GET]_CPUID friends with /proc/vz/cpuid_override

We are using cpuid faults to emulate cpuid in containers. This
conflicts with arch_prctl(ARCH_SET_CPUID, 0), which allows enabling
cpuid faulting so that the cpuid instruction causes SIGSEGV.

Add TIF_CPUID_OVERRIDE thread info flag which is added on all
!ve0 tasks. And check this flag along with TIF_NOCPUID to
decide whether we need to enable/disable cpuid faults or not.

https://jira.sw.ru/browse/PSBM-121823

Signed-off-by: Andrey Ryabinin 
Reviewed-by: Kirill Tkhai 
---
 arch/x86/include/asm/thread_info.h |  4 +++-
 arch/x86/kernel/cpuid_fault.c  |  3 ++-
 arch/x86/kernel/process.c  | 13 +
 arch/x86/kernel/traps.c|  3 +++
 kernel/ve/ve.c |  1 +
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index c0da378eed8b..6ffb64d25383 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,6 +92,7 @@ struct thread_info {
 #define TIF_NOCPUID		15	/* CPUID is not accessible in userland */
 #define TIF_NOTSC  16  /* TSC is not accessible in userland */
 #define TIF_IA32   17  /* IA32 compatibility process */
+#define TIF_CPUID_OVERRIDE 18  /* CPUID emulation enabled */
 #define TIF_NOHZ   19  /* in adaptive nohz mode */
 #define TIF_MEMDIE 20  /* is terminating due to OOM killer */
 #define TIF_POLLING_NRFLAG 21  /* idle is polling for TIF_NEED_RESCHED 
*/
@@ -122,6 +123,7 @@ struct thread_info {
 #define _TIF_NOCPUID   (1 << TIF_NOCPUID)
 #define _TIF_NOTSC (1 << TIF_NOTSC)
 #define _TIF_IA32  (1 << TIF_IA32)
+#define _TIF_CPUID_OVERRIDE	(1 << TIF_CPUID_OVERRIDE)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
 #define _TIF_POLLING_NRFLAG(1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
@@ -153,7 +155,7 @@ struct thread_info {
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW_BASE   \
(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \
-_TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
+_TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_CPUID_OVERRIDE)
 
 /*
  * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
index 339e2638c3b8..1e8ffacc4412 100644
--- a/arch/x86/kernel/cpuid_fault.c
+++ b/arch/x86/kernel/cpuid_fault.c
@@ -6,7 +6,8 @@
 #include 
 #include 
 #include 
-#include 
+#include 
+#include 
 
 struct cpuid_override_entry {
unsigned int op;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e5c5b1d724ab..788b9b8f8f9c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -209,7 +209,8 @@ static void set_cpuid_faulting(bool on)
 static void disable_cpuid(void)
 {
preempt_disable();
-   if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+   if (!test_and_set_thread_flag(TIF_NOCPUID) ||
+   test_thread_flag(TIF_CPUID_OVERRIDE)) {
/*
 * Must flip the CPU state synchronously with
 * TIF_NOCPUID in the current running context.
@@ -222,7 +223,8 @@ static void disable_cpuid(void)
 static void enable_cpuid(void)
 {
preempt_disable();
-   if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+   if (test_and_clear_thread_flag(TIF_NOCPUID) &&
+   !test_thread_flag(TIF_CPUID_OVERRIDE)) {
/*
 * Must flip the CPU state synchronously with
 * TIF_NOCPUID in the current running context.
@@ -505,6 +507,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct 
task_struct *next_p)
 {
struct thread_struct *prev, *next;
unsigned long tifp, tifn;
+   bool prev_cpuid, next_cpuid;
 
prev = &prev_p->thread;
next = &next_p->thread;
@@ -529,8 +532,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct 
task_struct *next_p)
if ((tifp ^ tifn) & _TIF_NOTSC)
cr4_toggle_bits_irqsoff(X86_CR4_TSD);
 
-   if ((tifp ^ tifn) & _TIF_NOCPUID)
-   set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+   prev_cpuid = (tifp & _TIF_NOCPUID) || (tifp & _TIF_CPUID_OVERRIDE);
+   next_cpuid = (tifn & _TIF_NOCPUID) || (tifn & _TIF_CPUID_OVERRIDE);
+   if (prev_cpuid != next_cpuid)
+   set_cpuid_faulting(next_cpuid);
 
if (likely(!((tifp | tifn) & _TIF_SPEC_FOR

[Devel] [PATCH RHEL8 COMMIT] arch/x86: introduce cpuid override

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit c4eda1a1f99dd7a00df91dd1975874df4b64266a
Author: Vladimir Davydov 
Date:   Tue Nov 3 16:32:58 2020 +0300

arch/x86: introduce cpuid override

Port diff-arch-x86-introduce-cpuid-override

Recent Intel CPUs dropped CPUID masking, which is required for flex
migration, in favor of CPUID faulting. So we need to support it in
the kernel.

This patch adds user writable file /proc/vz/cpuid_override, which
contains CPUID override table. Each table entry must have the following
format:

  op[ count]: eax ebx ecx edx

where @op and optional @count define a CPUID function, whose output one
would like to override (@op and @count are loaded to EAX and ECX
registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx -
the desired CPUID output for the specified function. All values must be
in HEX, 0x prefix is optional.

Notes:

 - the file is only present on hosts that support CPUID faulting;
 - CPUID faulting is always enabled if it is supported;
 - CPUID output is overridden on all present CPUs;
 - the maximal number of entries one can override equals 16;
 - each write(2) to the file removes all existing entries before adding
   new ones, so the whole table must be written in one write(2); in
   particular writing an empty line to the file removes all existing
   rules.

Example:

Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP
(CPUID.80000001H:EDX:27). Then we should execute the following sequence:

 - get the current cpuid value:

   # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
   0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbfebfbff
   0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x2c100800

 - clear the feature bits we want to mask out and write the result to
   /proc/vz/cpuid_override:

   # cat >/proc/vz/cpuid_override 

[Devel] [PATCH RHEL8 COMMIT] x86: Show vcpu cpuflags in cpuinfo

2020-11-03 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
-->
commit e651dcf4b551ca9e85737914b5059681b71a2e77
Author: Kirill Tkhai 
Date:   Tue Nov 3 16:32:59 2020 +0300

x86: Show vcpu cpuflags in cpuinfo

Show cpu_i flags as flags of vcpu_i.

Extracted from "Initial patch". Merged several reworks.

TODO: Maybe replace/rework on_each_cpu() with smp_call_function_single().
Then we won't need split c_start() in previous patch (as the call
function will be called right before specific cpu is being prepared
to show). This should be rather easy.
[aryabinin: Don't see what it buys us, so I didn't try to implement it]

Signed-off-by: Kirill Tkhai 

https://jira.sw.ru/browse/PSBM-121823

[aryabinin:vz8 rebase]
Signed-off-by: Andrey Ryabinin 
Reviewed-by: Kirill Tkhai 
---
 arch/x86/kernel/cpu/proc.c | 63 +++---
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index d6b17a60acf6..4fe1577d5e6f 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -4,6 +4,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "cpu.h"
 
@@ -58,10 +60,54 @@ extern void __do_cpuid_fault(unsigned int op, unsigned int 
count,
 unsigned int *eax, unsigned int *ebx,
 unsigned int *ecx, unsigned int *edx);
 
+struct cpu_flags {
+   u32 val[NCAPINTS];
+};
+
+static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
+
+static void init_cpu_flags(void *dummy)
+{
+   int cpu = smp_processor_id();
+   struct cpu_flags *flags = &per_cpu(cpu_flags, cpu);
+   struct cpuinfo_x86 *c = &cpu_data(cpu);
+   unsigned int eax, ebx, ecx, edx;
+
+   memcpy(flags->val, c->x86_capability, NCAPINTS * sizeof(u32));
+
+   /*
+* Clear feature bits masked using cpuid masking/faulting.
+*/
+
+   if (c->cpuid_level >= 0x0001) {
+   __do_cpuid_fault(0x0001, 0, &eax, &ebx, &ecx, &edx);
+   flags->val[4] &= ecx;
+   flags->val[0] &= edx;
+   }
+
+   if (c->cpuid_level >= 0x0007) {
+   __do_cpuid_fault(0x0007, 0, &eax, &ebx, &ecx, &edx);
+   flags->val[9] &= ebx;
+   }
+
+   if ((c->extended_cpuid_level & 0x) == 0x8000 &&
+   c->extended_cpuid_level >= 0x8001) {
+   __do_cpuid_fault(0x8001, 0, &eax, &ebx, &ecx, &edx);
+   flags->val[6] &= ecx;
+   flags->val[1] &= edx;
+   }
+
+   if (c->cpuid_level >= 0x000d) {
+   __do_cpuid_fault(0x000d, 1, &eax, &ebx, &ecx, &edx);
+   flags->val[10] &= eax;
+   }
+}
+
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
struct cpuinfo_x86 *c = v;
unsigned int cpu;
+   int is_super = ve_is_super(get_exec_env());
int i;
 
cpu = c->cpu_index;
@@ -103,7 +149,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
seq_puts(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
-   if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+   if (x86_cap_flags[i] != NULL &&
+   ((is_super && cpu_has(c, i)) ||
+(!is_super && test_bit(i, (unsigned long *)
+   &per_cpu(cpu_flags, cpu)))))
seq_printf(m, " %s", x86_cap_flags[i]);
 
seq_puts(m, "\nbugs\t\t:");
@@ -145,18 +194,24 @@ static int show_cpuinfo(struct seq_file *m, void *v)
return 0;
 }
 
-static void *c_start(struct seq_file *m, loff_t *pos)
+static void *__c_start(struct seq_file *m, loff_t *pos)
 {
*pos = cpumask_next(*pos - 1, cpu_online_mask);
-   if ((*pos) < nr_cpu_ids)
+   if (bitmap_weight(cpumask_bits(cpu_online_mask), *pos) < num_online_vcpus())
return &cpu_data(*pos);
return NULL;
 }
 
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+   on_each_cpu(init_cpu_flags, NULL, 1);
+   return __c_start(m, pos);
+}
+
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
(*pos)++;
-   return c_start(m, pos);
+   return __c_start(m, pos);
 }
 
 static void c_stop(struct seq_file *m, void *v)


[Devel] [PATCH vz8 v3 2/2] x86: don't enable cpuid faults if /proc/vz/cpuid_override unused

2020-11-03 Thread Andrey Ryabinin
We don't need to enable cpuid faults if /proc/vz/cpuid_override
was never used. If a task was attached to a ve before a write to
'cpuid_override', it will not get cpuid faults now. This shouldn't
be a problem, since proper use of 'cpuid_override' requires
stopping all containers.

https://jira.sw.ru/browse/PSBM-121823
Signed-off-by: Andrey Ryabinin 
Reviewed-by: Kirill Tkhai 
---

Changes since v1:
 - git add include/linux/cpuid_override.h 
Changes since v2:
 - add review tag

 arch/x86/kernel/cpuid_fault.c  | 21 ++---
 include/linux/cpuid_override.h | 30 ++
 kernel/ve/ve.c |  5 -
 3 files changed, 36 insertions(+), 20 deletions(-)
 create mode 100644 include/linux/cpuid_override.h

diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
index 1e8ffacc4412..cb6c2216fa8a 100644
--- a/arch/x86/kernel/cpuid_fault.c
+++ b/arch/x86/kernel/cpuid_fault.c
@@ -1,3 +1,4 @@
+#include 
 #include 
 #include 
 #include 
@@ -9,25 +10,7 @@
 #include 
 #include 
 
-struct cpuid_override_entry {
-   unsigned int op;
-   unsigned int count;
-   bool has_count;
-   unsigned int eax;
-   unsigned int ebx;
-   unsigned int ecx;
-   unsigned int edx;
-};
-
-#define MAX_CPUID_OVERRIDE_ENTRIES 16
-
-struct cpuid_override_table {
-   struct rcu_head rcu_head;
-   int size;
-   struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
-};
-
-static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
+struct cpuid_override_table __rcu *cpuid_override __read_mostly;
 static DEFINE_SPINLOCK(cpuid_override_lock);
 
 static void cpuid_override_update(struct cpuid_override_table *new_table)
diff --git a/include/linux/cpuid_override.h b/include/linux/cpuid_override.h
new file mode 100644
index ..ea0fa7af3d3c
--- /dev/null
+++ b/include/linux/cpuid_override.h
@@ -0,0 +1,30 @@
+#ifndef __CPUID_OVERRIDE_H
+#define __CPUID_OVERRIDE_H
+
+#include 
+
+struct cpuid_override_entry {
+   unsigned int op;
+   unsigned int count;
+   bool has_count;
+   unsigned int eax;
+   unsigned int ebx;
+   unsigned int ecx;
+   unsigned int edx;
+};
+
+#define MAX_CPUID_OVERRIDE_ENTRIES 16
+
+struct cpuid_override_table {
+   struct rcu_head rcu_head;
+   int size;
+   struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
+};
+
+extern struct cpuid_override_table __rcu *cpuid_override;
+
+static inline bool cpuid_override_on(void)
+{
+   return rcu_access_pointer(cpuid_override);
+}
+#endif
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index aad8ce69ca1f..0d4d0ab70369 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -9,6 +9,7 @@
  * 've.c' helper file performing VE sub-system initialization
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -801,6 +802,7 @@ static void ve_attach(struct cgroup_taskset *tset)
 {
struct cgroup_subsys_state *css;
struct task_struct *task;
+   extern struct cpuid_override_table __rcu *cpuid_override;
 
cgroup_taskset_for_each(task, css, tset) {
struct ve_struct *ve = css_to_ve(css);
@@ -816,7 +818,8 @@ static void ve_attach(struct cgroup_taskset *tset)
/* Leave parent exec domain */
task->parent_exec_id--;
 
-   set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
+   if (cpuid_override_on())
+   set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
task->task_ve = ve;
}
 }
-- 
2.26.2



Re: [Devel] [PATCH vz8 v2 1/2] x86, cpuinfo: Fix race on parallel /proc/cpuinfo read

2020-11-03 Thread Andrey Ryabinin



On 11/3/20 2:28 PM, Kirill Tkhai wrote:
> On 02.11.2020 20:13, Andrey Ryabinin wrote:
>> If several threads read /proc/cpuinfo some can see in 'flags'
>> values from c->x86_capability, before __do_cpuid_fault() called
>> and masks applied. Fix this by forming 'flags' on stack first
>> and copy them in per_cpu(cpu_flags, cpu) as a last step.
>>
>> https://jira.sw.ru/browse/PSBM-121823
>> Signed-off-by: Andrey Ryabinin 
>> ---
>> Changes since v1:
>>  - none
>>  
>>  arch/x86/kernel/cpu/proc.c | 17 +
>>  1 file changed, 9 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
>> index 4fe1577d5e6f..4cc2951e34fb 100644
>> --- a/arch/x86/kernel/cpu/proc.c
>> +++ b/arch/x86/kernel/cpu/proc.c
>> @@ -69,11 +69,11 @@ static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
>>  static void init_cpu_flags(void *dummy)
>>  {
>>  int cpu = smp_processor_id();
>> -struct cpu_flags *flags = &per_cpu(cpu_flags, cpu);
>> +struct cpu_flags flags;
>>  struct cpuinfo_x86 *c = &cpu_data(cpu);
>>  unsigned int eax, ebx, ecx, edx;
>>  
>> -memcpy(flags->val, c->x86_capability, NCAPINTS * sizeof(u32));
>> +memcpy(&flags, c->x86_capability, sizeof(flags));
>>  
>>  /*
>>   * Clear feature bits masked using cpuid masking/faulting.
>> @@ -81,26 +81,27 @@ static void init_cpu_flags(void *dummy)
>>  
>>  if (c->cpuid_level >= 0x0001) {
>>  __do_cpuid_fault(0x0001, 0, &eax, &ebx, &ecx, &edx);
>> -flags->val[4] &= ecx;
>> -flags->val[0] &= edx;
>> +flags.val[4] &= ecx;
>> +flags.val[0] &= edx;
>>  }
>>  
>>  if (c->cpuid_level >= 0x0007) {
>>  __do_cpuid_fault(0x0007, 0, &eax, &ebx, &ecx, &edx);
>> -flags->val[9] &= ebx;
>> +flags.val[9] &= ebx;
>>  }
>>  
>>  if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000 &&
>>  c->extended_cpuid_level >= 0x80000001) {
>>  __do_cpuid_fault(0x80000001, 0, &eax, &ebx, &ecx, &edx);
>> -flags->val[6] &= ecx;
>> -flags->val[1] &= edx;
>> +flags.val[6] &= ecx;
>> +flags.val[1] &= edx;
>>  }
>>  
>>  if (c->cpuid_level >= 0x000d) {
>>  __do_cpuid_fault(0x000d, 1, &eax, &ebx, &ecx, &edx);
>> -flags->val[10] &= eax;
>> +flags.val[10] &= eax;
>>  }
>> +memcpy(&per_cpu(cpu_flags, cpu), &flags, sizeof(flags));
> 
> This is still racy, since memcpy() is not atomic. Maybe we should add some 
> lock on top of this?
> 

This race shouldn't be a problem, since the flags are not supposed to change
during the ve lifetime, so we are overwriting the same values. But I don't mind
adding spinlock protection.


[Devel] [PATCH vz8 v3 1/2] x86, cpuinfo: Fix race on parallel /proc/cpuinfo read

2020-11-03 Thread Andrey Ryabinin
If several threads read /proc/cpuinfo, some can see in 'flags' values taken
from c->x86_capability before __do_cpuid_fault() is called and the masks are
applied. Fix this by forming 'flags' on the stack first and copying them into
per_cpu(cpu_flags, cpu) as the last step.

https://jira.sw.ru/browse/PSBM-121823
Signed-off-by: Andrey Ryabinin 
---

Changes since v1:
 - none

Changes since v2:
 - add spinlock, use temporary ve_flags in show_cpuinfo()
 
 arch/x86/kernel/cpu/proc.c | 31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 4fe1577d5e6f..08fd7ff9a55b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -65,15 +65,16 @@ struct cpu_flags {
 };
 
 static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
+static DEFINE_SPINLOCK(cpu_flags_lock);
 
 static void init_cpu_flags(void *dummy)
 {
int cpu = smp_processor_id();
-   struct cpu_flags *flags = &per_cpu(cpu_flags, cpu);
+   struct cpu_flags flags;
struct cpuinfo_x86 *c = &cpu_data(cpu);
unsigned int eax, ebx, ecx, edx;
 
-   memcpy(flags->val, c->x86_capability, NCAPINTS * sizeof(u32));
+   memcpy(&flags, c->x86_capability, sizeof(flags));
 
/*
 * Clear feature bits masked using cpuid masking/faulting.
@@ -81,26 +82,30 @@ static void init_cpu_flags(void *dummy)
 
if (c->cpuid_level >= 0x0001) {
__do_cpuid_fault(0x0001, 0, &eax, &ebx, &ecx, &edx);
-   flags->val[4] &= ecx;
-   flags->val[0] &= edx;
+   flags.val[4] &= ecx;
+   flags.val[0] &= edx;
}
 
if (c->cpuid_level >= 0x0007) {
__do_cpuid_fault(0x0007, 0, &eax, &ebx, &ecx, &edx);
-   flags->val[9] &= ebx;
+   flags.val[9] &= ebx;
}
 
if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000 &&
c->extended_cpuid_level >= 0x80000001) {
__do_cpuid_fault(0x80000001, 0, &eax, &ebx, &ecx, &edx);
-   flags->val[6] &= ecx;
-   flags->val[1] &= edx;
+   flags.val[6] &= ecx;
+   flags.val[1] &= edx;
}
 
if (c->cpuid_level >= 0x000d) {
__do_cpuid_fault(0x000d, 1, &eax, &ebx, &ecx, &edx);
-   flags->val[10] &= eax;
+   flags.val[10] &= eax;
}
+
+   spin_lock(&cpu_flags_lock);
+   memcpy(&per_cpu(cpu_flags, cpu), &flags, sizeof(flags));
+   spin_unlock(&cpu_flags_lock);
 }
 
 static int show_cpuinfo(struct seq_file *m, void *v)
@@ -108,6 +113,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
struct cpuinfo_x86 *c = v;
unsigned int cpu;
int is_super = ve_is_super(get_exec_env());
+   struct cpu_flags ve_flags;
int i;
 
cpu = c->cpu_index;
@@ -147,12 +153,19 @@ static int show_cpuinfo(struct seq_file *m, void *v)
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
 
+   if (!is_super) {
+   spin_lock_irq(&cpu_flags_lock);
+   memcpy(&ve_flags, &per_cpu(cpu_flags, cpu), sizeof(ve_flags));
+   spin_unlock_irq(&cpu_flags_lock);
+   }
+
+
seq_puts(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
if (x86_cap_flags[i] != NULL &&
((is_super && cpu_has(c, i)) ||
 (!is_super && test_bit(i, (unsigned long *)
-   &per_cpu(cpu_flags, cpu)))))
+   &ve_flags))))
seq_printf(m, " %s", x86_cap_flags[i]);
 
seq_puts(m, "\nbugs\t\t:");
-- 
2.26.2



[Devel] [PATCH vz8] x86_64, vclock_gettime: Use standard division instead of __iter_div_u64_rem()

2020-11-03 Thread Andrey Ryabinin
timespec_sub_ns() historically uses __iter_div_u64_rem() for division,
probably because it's supposed to be faster:

/*
 * Iterative div/mod for use when dividend is not expected to be much
 * bigger than divisor.
 */
u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)

However, in our case ve_start_time may make the dividend much bigger than the
divisor, so let's use the standard "/" instead of the iterative one. With a
zero ve_start_time I wasn't able to see a measurable difference, but with a
big ve_start_time the difference is rather significant:

 # time ./clock_iter_div
 real    1m30.224s
 user    1m30.343s
 sys     0m0.008s

 # time taskset ./clock_div
 real    0m2.757s
 user    0m1.730s
 sys     0m0.066s

The 32-bit vdso doesn't like 64-bit division and doesn't link.
I think it needs __udivsi3(). So just fall back to __iter_div_u64_rem()
on 32-bit.
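
The clock_iter_div/clock_div programs are not attached; presumably they are
variations of a loop like the sketch below (the names, the iteration count and
the overall structure are assumptions), run with a non-zero ve_start_time so
that every call goes through timespec_sub_ns() in the vDSO:

  #include <stdio.h>
  #include <time.h>

  int main(void)
  {
      struct timespec ts;
      unsigned long i, acc = 0;

      /* iteration count is arbitrary; the clock_gettime() call dominates */
      for (i = 0; i < 100000000UL; i++) {
          clock_gettime(CLOCK_REALTIME, &ts);
          acc += ts.tv_nsec;      /* keep the result observable */
      }
      printf("%lu\n", acc);
      return 0;
  }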

https://jira.sw.ru/browse/PSBM-121856
Signed-off-by: Andrey Ryabinin 
---
 arch/x86/entry/vdso/vclock_gettime.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index be1de6c4cafa..224dbe80da66 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -229,13 +229,27 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
return mode;
 }
 
+static inline u64 divu64(u64 dividend, u32 divisor, u64 *remainder)
+{
+   /* 32-bit wants __udivsi3() and fails to link, so fallback to iter */
+#ifndef BUILD_VDSO32
+   u64 res;
+
+   res = dividend/divisor;
+   *remainder = dividend % divisor;
+   return res;
+#else
+   return __iter_div_u64_rem(dividend, divisor, remainder);
+#endif
+}
+
 static inline void timespec_sub_ns(struct timespec *ts, u64 ns)
 {
if ((s64)ns <= 0) {
-   ts->tv_sec += __iter_div_u64_rem(-ns, NSEC_PER_SEC, &ns);
+   ts->tv_sec += divu64(-ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
} else {
-   ts->tv_sec -= __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+   ts->tv_sec -= divu64(ns, NSEC_PER_SEC, &ns);
if (ns) {
ts->tv_sec--;
ns = NSEC_PER_SEC - ns;
-- 
2.26.2



Re: [Devel] [PATCH vz8 v3 1/2] x86, cpuinfo: Fix race on parallel /proc/cpuinfo read

2020-11-03 Thread Kirill Tkhai
On 03.11.2020 17:36, Andrey Ryabinin wrote:
> If several threads read /proc/cpuinfo some can see in 'flags'
> values from c->x86_capability, before __do_cpuid_fault() called
> and masks applied. Fix this by forming 'flags' on stack first
> and copy them in per_cpu(cpu_flags, cpu) as a last step.
> 
> https://jira.sw.ru/browse/PSBM-121823
> Signed-off-by: Andrey Ryabinin 

Reviewed-by: Kirill Tkhai 

> ---
> 
> Changes since v1:
>  - none
> 
> Changes since v2:
>  - add spinlock, use temporary ve_flags in show_cpuinfo()
>  
>  arch/x86/kernel/cpu/proc.c | 31 ++-
>  1 file changed, 22 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
> index 4fe1577d5e6f..08fd7ff9a55b 100644
> --- a/arch/x86/kernel/cpu/proc.c
> +++ b/arch/x86/kernel/cpu/proc.c
> @@ -65,15 +65,16 @@ struct cpu_flags {
>  };
>  
>  static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
> +static DEFINE_SPINLOCK(cpu_flags_lock);
>  
>  static void init_cpu_flags(void *dummy)
>  {
>   int cpu = smp_processor_id();
> - struct cpu_flags *flags = &per_cpu(cpu_flags, cpu);
> + struct cpu_flags flags;
>   struct cpuinfo_x86 *c = &cpu_data(cpu);
>   unsigned int eax, ebx, ecx, edx;
>  
> - memcpy(flags->val, c->x86_capability, NCAPINTS * sizeof(u32));
> + memcpy(&flags, c->x86_capability, sizeof(flags));
>  
>   /*
>* Clear feature bits masked using cpuid masking/faulting.
> @@ -81,26 +82,30 @@ static void init_cpu_flags(void *dummy)
>  
>   if (c->cpuid_level >= 0x0001) {
>   __do_cpuid_fault(0x0001, 0, &eax, &ebx, &ecx, &edx);
> - flags->val[4] &= ecx;
> - flags->val[0] &= edx;
> + flags.val[4] &= ecx;
> + flags.val[0] &= edx;
>   }
>  
>   if (c->cpuid_level >= 0x0007) {
>   __do_cpuid_fault(0x0007, 0, &eax, &ebx, &ecx, &edx);
> - flags->val[9] &= ebx;
> + flags.val[9] &= ebx;
>   }
>  
>   if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000 &&
>   c->extended_cpuid_level >= 0x80000001) {
>   __do_cpuid_fault(0x80000001, 0, &eax, &ebx, &ecx, &edx);
> - flags->val[6] &= ecx;
> - flags->val[1] &= edx;
> + flags.val[6] &= ecx;
> + flags.val[1] &= edx;
>   }
>  
>   if (c->cpuid_level >= 0x000d) {
>   __do_cpuid_fault(0x000d, 1, &eax, &ebx, &ecx, &edx);
> - flags->val[10] &= eax;
> + flags.val[10] &= eax;
>   }
> +
> + spin_lock(&cpu_flags_lock);
> + memcpy(&per_cpu(cpu_flags, cpu), &flags, sizeof(flags));
> + spin_unlock(&cpu_flags_lock);
>  }
>  
>  static int show_cpuinfo(struct seq_file *m, void *v)
> @@ -108,6 +113,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
>   struct cpuinfo_x86 *c = v;
>   unsigned int cpu;
>   int is_super = ve_is_super(get_exec_env());
> + struct cpu_flags ve_flags;
>   int i;
>  
>   cpu = c->cpu_index;
> @@ -147,12 +153,19 @@ static int show_cpuinfo(struct seq_file *m, void *v)
>   show_cpuinfo_core(m, c, cpu);
>   show_cpuinfo_misc(m, c);
>  
> + if (!is_super) {
> + spin_lock_irq(&cpu_flags_lock);
> + memcpy(&ve_flags, &per_cpu(cpu_flags, cpu), sizeof(ve_flags));
> + spin_unlock_irq(&cpu_flags_lock);
> + }
> +
> +
>   seq_puts(m, "flags\t\t:");
>   for (i = 0; i < 32*NCAPINTS; i++)
>   if (x86_cap_flags[i] != NULL &&
>   ((is_super && cpu_has(c, i)) ||
>(!is_super && test_bit(i, (unsigned long *)
> - &per_cpu(cpu_flags, cpu)))))
> + &ve_flags))))
>   seq_printf(m, " %s", x86_cap_flags[i]);
>  
>   seq_puts(m, "\nbugs\t\t:");
> 
