[Devel] [PATCH vz7, vz8 1/1] kvm: fix AMD IBRS/IBPB/STIBP/SSBD reporting #PSBM-120787
We should report these bits in 8008 EBX on AMD only, i.e. when AMD specific feature bits are enabled. Signed-off-by: Denis V. Lunev CC: Vasily Averin CC: Konstantin Khorenko --- arch/x86/kvm/cpuid.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index fe8b92723990..05898112a306 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -641,13 +641,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, * arch/x86/kernel/cpu/bugs.c is kind enough to * record that in cpufeatures so use them. */ - if (boot_cpu_has(X86_FEATURE_IBPB)) + if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) entry->ebx |= F(AMD_IBPB); - if (boot_cpu_has(X86_FEATURE_IBRS)) + if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) entry->ebx |= F(AMD_IBRS); - if (boot_cpu_has(X86_FEATURE_STIBP)) + if (boot_cpu_has(X86_FEATURE_AMD_STIBP)) entry->ebx |= F(AMD_STIBP); - if (boot_cpu_has(X86_FEATURE_SSBD)) + if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) entry->ebx |= F(AMD_SSBD); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) entry->ebx |= F(AMD_SSB_NO); -- 2.17.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH vz8 2/4] ia32: add 32-bit vdso virtualization.
Similarly to the 64-bit vdso, make 32-bit vdso mapping per-ve. This will allow per container modification of the linux version xin .note section of vdso and monotonic time. https://jira.sw.ru/browse/PSBM-121668 Signed-off-by: Andrey Ryabinin --- arch/x86/entry/vdso/vma.c| 4 ++-- arch/x86/kernel/process_64.c | 2 +- include/linux/ve.h | 1 + kernel/ve/ve.c | 35 +-- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index c48deffc1473..538c6730f436 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -56,7 +56,7 @@ static void vdso_fix_landing(const struct vdso_image *image, struct vm_area_struct *new_vma) { #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - if (in_ia32_syscall() && image == &vdso_image_32) { + if (in_ia32_syscall() && image == get_exec_env()->vdso_32) { struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; unsigned long old_land_addr = vdso_land + @@ -281,7 +281,7 @@ static int load_vdso32(void) if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; - return map_vdso(&vdso_image_32, 0); + return map_vdso(get_exec_env()->vdso_32, 0); } #endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a010d4b9d126..22215141 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -686,7 +686,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: - return prctl_map_vdso(&vdso_image_32, arg2); + return prctl_map_vdso(get_exec_env()->vdso_32, arg2); # endif case ARCH_MAP_VDSO_64: return prctl_map_vdso(get_exec_env()->vdso_64, arg2); diff --git a/include/linux/ve.h b/include/linux/ve.h index 0e85a4032c3a..5b1962ff4c66 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -95,6 +95,7 @@ struct ve_struct { struct cn_private *cn; 
#endif struct vdso_image *vdso_64; + struct vdso_image *vdso_32; }; #define VE_MEMINFO_DEFAULT 1 /* default behaviour */ diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 186deb3f88f4..03b8d126a0ed 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -58,6 +58,7 @@ struct ve_struct ve0 = { .netns_max_nr = INT_MAX, .meminfo_val= VE_MEMINFO_SYSTEM, .vdso_64= (struct vdso_image*)&vdso_image_64, + .vdso_32= (struct vdso_image*)&vdso_image_32, }; EXPORT_SYMBOL(ve0); @@ -540,13 +541,12 @@ static __u64 ve_setup_iptables_mask(__u64 init_mask) } #endif -static int copy_vdso(struct ve_struct *ve) +static int copy_vdso(struct vdso_image **vdso_dst, const struct vdso_image *vdso_src) { - const struct vdso_image *vdso_src = &vdso_image_64; struct vdso_image *vdso; void *vdso_data; - if (ve->vdso_64) + if (*vdso_dst) return 0; vdso = kmemdup(vdso_src, sizeof(*vdso), GFP_KERNEL); @@ -563,10 +563,22 @@ static int copy_vdso(struct ve_struct *ve) vdso->data = vdso_data; - ve->vdso_64 = vdso; + *vdso_dst = vdso; return 0; } +static void ve_free_vdso(struct ve_struct *ve) +{ + if (ve->vdso_64 && ve->vdso_64 != &vdso_image_64) { + kfree(ve->vdso_64->data); + kfree(ve->vdso_64); + } + if (ve->vdso_32 && ve->vdso_32 != &vdso_image_32) { + kfree(ve->vdso_32->data); + kfree(ve->vdso_32); + } +} + static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_css) { struct ve_struct *ve = &ve0; @@ -592,7 +604,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ if (err) goto err_log; - if (copy_vdso(ve)) + if (copy_vdso(&ve->vdso_64, &vdso_image_64)) + goto err_vdso; + + if (copy_vdso(&ve->vdso_32, &vdso_image_32)) goto err_vdso; ve->features = VE_FEATURES_DEF; @@ -619,6 +634,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ return &ve->css; err_vdso: + ve_free_vdso(ve); ve_log_destroy(ve); err_log: free_percpu(ve->sched_lat_ve.cur); @@ -658,15 +674,6 @@ static void ve_offline(struct cgroup_subsys_state 
*css) ve->ve_name = NULL; } -static void ve_free_vdso(struct ve_struct *ve) -{ - if (ve->vdso_64 == &vdso_image_64) - return; - - kfree(ve->vdso_64->data); - kfree(ve->vdso_64); -} - static void ve_destroy(struct cgroup_subsys_state *css) {
[Devel] [PATCH vz8 3/4] ve: patch linux_version_code in vdso
On the write to ve.os_release file patch the linux_version_code in the .note section of vdso. https://jira.sw.ru/browse/PSBM-121668 Signed-off-by: Andrey Ryabinin --- arch/x86/entry/vdso/vdso-note.S | 2 ++ arch/x86/entry/vdso/vdso2c.c | 1 + arch/x86/entry/vdso/vdso32/note.S | 2 ++ arch/x86/include/asm/vdso.h | 1 + kernel/ve/ve.c| 7 +++ 5 files changed, 13 insertions(+) diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S index 79a071e4357e..c0e6e65f9fec 100644 --- a/arch/x86/entry/vdso/vdso-note.S +++ b/arch/x86/entry/vdso/vdso-note.S @@ -7,6 +7,8 @@ #include #include + .globl linux_version_code ELFNOTE_START(Linux, 0, "a") +linux_version_code: .long LINUX_VERSION_CODE ELFNOTE_END diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 4674f58581a1..7fab0bd96ac1 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -109,6 +109,7 @@ struct vdso_sym required_syms[] = { {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, {"int80_landing_pad", true}, + {"linux_version_code", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S index 9fd51f206314..096b62f14863 100644 --- a/arch/x86/entry/vdso/vdso32/note.S +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -10,7 +10,9 @@ /* Ideally this would use UTS_NAME, but using a quoted string here doesn't work. Remember to change this when changing the kernel's name. 
*/ + .globl linux_version_code ELFNOTE_START(Linux, 0, "a") +linux_version_code: .long LINUX_VERSION_CODE ELFNOTE_END diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 27566e57e87d..92c7ac06828e 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -27,6 +27,7 @@ struct vdso_image { long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; long sym_int80_landing_pad; + long sym_linux_version_code; }; #ifdef CONFIG_X86_64 diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 03b8d126a0ed..98c2e7e3d2c6 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -954,6 +954,7 @@ static ssize_t ve_os_release_write(struct kernfs_open_file *of, char *buf, { struct cgroup_subsys_state *css = of_css(of); struct ve_struct *ve = css_to_ve(css); + int n1, n2, n3, new_version; char *release; int ret = 0; @@ -964,6 +965,12 @@ static ssize_t ve_os_release_write(struct kernfs_open_file *of, char *buf, goto up_opsem; } + if (sscanf(buf, "%d.%d.%d", &n1, &n2, &n3) == 3) { + new_version = ((n1 << 16) + (n2 << 8)) + n3; + *((int *)(ve->vdso_64->data + ve->vdso_64->sym_linux_version_code)) = new_version; + *((int *)(ve->vdso_32->data + ve->vdso_32->sym_linux_version_code)) = new_version; + } + down_write(&uts_sem); release = ve->ve_ns->uts_ns->name.release; strncpy(release, buf, __NEW_UTS_LEN); -- 2.26.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH vz8 1/4] ve, x86_64: add per-ve vdso mapping.
Make vdso mapping per-ve. This will allow per container modification of the linux version in .note section of vdso and monotonic time. https://jira.sw.ru/browse/PSBM-121668 Signed-off-by: Andrey Ryabinin --- arch/x86/entry/vdso/vma.c| 3 ++- arch/x86/kernel/process_64.c | 2 +- include/linux/ve.h | 2 ++ kernel/ve/ve.c | 43 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index eb3d85f87884..c48deffc1473 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -291,7 +291,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_64); + + return map_vdso_randomized(get_exec_env()->vdso_64); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c1c8d66cbe70..a010d4b9d126 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -689,7 +689,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: - return prctl_map_vdso(&vdso_image_64, arg2); + return prctl_map_vdso(get_exec_env()->vdso_64, arg2); #endif default: diff --git a/include/linux/ve.h b/include/linux/ve.h index ec7dc522ac1f..0e85a4032c3a 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -15,6 +15,7 @@ #include #include #include +#include struct nsproxy; struct veip_struct; @@ -93,6 +94,7 @@ struct ve_struct { #ifdef CONFIG_CONNECTOR struct cn_private *cn; #endif + struct vdso_image *vdso_64; }; #define VE_MEMINFO_DEFAULT 1 /* default behaviour */ diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index cc26d3b2fa9b..186deb3f88f4 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -57,6 +57,7 @@ struct ve_struct ve0 = { .netns_avail_nr = ATOMIC_INIT(INT_MAX), .netns_max_nr = INT_MAX, .meminfo_val= VE_MEMINFO_SYSTEM, + .vdso_64= (struct 
vdso_image*)&vdso_image_64, }; EXPORT_SYMBOL(ve0); @@ -539,6 +540,33 @@ static __u64 ve_setup_iptables_mask(__u64 init_mask) } #endif +static int copy_vdso(struct ve_struct *ve) +{ + const struct vdso_image *vdso_src = &vdso_image_64; + struct vdso_image *vdso; + void *vdso_data; + + if (ve->vdso_64) + return 0; + + vdso = kmemdup(vdso_src, sizeof(*vdso), GFP_KERNEL); + if (!vdso) + return -ENOMEM; + + vdso_data = kmalloc(vdso_src->size, GFP_KERNEL); + if (!vdso_data) { + kfree(vdso); + return -ENOMEM; + } + + memcpy(vdso_data, vdso_src->data, vdso_src->size); + + vdso->data = vdso_data; + + ve->vdso_64 = vdso; + return 0; +} + static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_css) { struct ve_struct *ve = &ve0; @@ -564,6 +592,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ if (err) goto err_log; + if (copy_vdso(ve)) + goto err_vdso; + ve->features = VE_FEATURES_DEF; ve->_randomize_va_space = ve0._randomize_va_space; @@ -587,6 +618,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ return &ve->css; +err_vdso: + ve_log_destroy(ve); err_log: free_percpu(ve->sched_lat_ve.cur); err_lat: @@ -625,12 +658,22 @@ static void ve_offline(struct cgroup_subsys_state *css) ve->ve_name = NULL; } +static void ve_free_vdso(struct ve_struct *ve) +{ + if (ve->vdso_64 == &vdso_image_64) + return; + + kfree(ve->vdso_64->data); + kfree(ve->vdso_64); +} + static void ve_destroy(struct cgroup_subsys_state *css) { struct ve_struct *ve = css_to_ve(css); kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set); ve_log_destroy(ve); + ve_free_vdso(ve); #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ve->binfmt_misc); #endif -- 2.26.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH vz8 4/4] ve: add per-ve CLOCK_MONOTONIC time via __vclock_gettime()
Make possible to read virtualized container's CLOCK_MONOTONIC time via __vclock_getttime(). Record containers start time in per-ve vdso and substruct it from the host's time on clock read. https://jira.sw.ru/browse/PSBM-121668 Signed-off-by: Andrey Ryabinin --- arch/x86/entry/vdso/vclock_gettime.c | 27 +++ arch/x86/entry/vdso/vdso2c.c | 1 + arch/x86/include/asm/vdso.h | 1 + kernel/ve/ve.c | 14 ++ 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index e48ca3afa091..be1de6c4cafa 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -24,6 +24,8 @@ #define gtod (&VVAR(vsyscall_gtod_data)) +u64 ve_start_time; + extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); @@ -227,6 +229,21 @@ notrace static int __always_inline do_realtime(struct timespec *ts) return mode; } +static inline void timespec_sub_ns(struct timespec *ts, u64 ns) +{ + if ((s64)ns <= 0) { + ts->tv_sec += __iter_div_u64_rem(-ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + } else { + ts->tv_sec -= __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + if (ns) { + ts->tv_sec--; + ns = NSEC_PER_SEC - ns; + } + ts->tv_nsec = ns; + } +} + notrace static int __always_inline do_monotonic(struct timespec *ts) { unsigned long seq; @@ -242,9 +259,7 @@ notrace static int __always_inline do_monotonic(struct timespec *ts) ns >>= gtod->shift; } while (unlikely(gtod_read_retry(gtod, seq))); - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - + timespec_sub_ns(ts, ve_start_time - ns); return mode; } @@ -260,12 +275,16 @@ notrace static void do_realtime_coarse(struct timespec *ts) notrace static void do_monotonic_coarse(struct timespec *ts) { + u64 ns; unsigned long seq; + do { seq = gtod_read_begin(gtod); ts->tv_sec = 
gtod->monotonic_time_coarse_sec; - ts->tv_nsec = gtod->monotonic_time_coarse_nsec; + ns = gtod->monotonic_time_coarse_nsec; } while (unlikely(gtod_read_retry(gtod, seq))); + + timespec_sub_ns(ts, ve_start_time - ns); } notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 7fab0bd96ac1..c76141e9ca16 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -110,6 +110,7 @@ struct vdso_sym required_syms[] = { {"__kernel_rt_sigreturn", true}, {"int80_landing_pad", true}, {"linux_version_code", true}, + {"ve_start_time", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 92c7ac06828e..9c265f79a126 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -28,6 +28,7 @@ struct vdso_image { long sym___kernel_vsyscall; long sym_int80_landing_pad; long sym_linux_version_code; + long sym_ve_start_time; }; #ifdef CONFIG_X86_64 diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 98c2e7e3d2c6..ac3dda55e9ae 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -374,6 +374,17 @@ static int ve_start_kthreadd(struct ve_struct *ve) return err; } +static void ve_set_vdso_time(struct ve_struct *ve, u64 time) +{ + u64 *vdso_start_time; + + vdso_start_time = ve->vdso_64->data + ve->vdso_64->sym_ve_start_time; + *vdso_start_time = time; + + vdso_start_time = ve->vdso_32->data + ve->vdso_32->sym_ve_start_time; + *vdso_start_time = time; +} + /* under ve->op_sem write-lock */ static int ve_start_container(struct ve_struct *ve) { @@ -408,6 +419,8 @@ static int ve_start_container(struct ve_struct *ve) if (ve->start_time == 0) { ve->start_time = tsk->start_time; ve->real_start_time = tsk->real_start_time; + + ve_set_vdso_time(ve, ve->start_time); } /* The value is wrong, but it is never compared to process * start times */ @@ -1028,6 +1041,7 @@ static ssize_t 
ve_ts_write(struct kernfs_open_file *of, char *buf, case VE_CF_CLOCK_MONOTONIC: now = ktime_get_ns(); target = &ve->start_time; + ve_set_vdso_time(ve, now - delta_ns); break; case VE_CF_CLOCK_BOOTBASED: now = ktime_get_boot_ns(); -- 2.26.2 __
Re: [Devel] [PATCH rh8] mm/swap: activate swapped in pages on fault
On 10/19/20 7:32 PM, Konstantin Khorenko wrote: > From: Andrey Ryabinin > > Move swapped in anon pages directly to active list. This should > help us to prevent anon thrashing. Recently swapped in pages > has more chances to stay in memory. > > https://pmc.acronis.com/browse/VSTOR-20859 > Signed-off-by: Andrey Ryabinin > [VvS RHEL7.8 rebase] context changes > > (cherry picked from vz7 commit 134cd9b20a914080539e6310f76fe3f7b32bc710) > Signed-off-by: Konstantin Khorenko Reviewed-by: Andrey Ryabinin ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh8] ve: Virtualize /proc/swaps to watch from inside CT
On 10/19/20 5:27 PM, Konstantin Khorenko wrote: > From: Kirill Tkhai > > Customize /proc/swaps when showing from !ve_is_super. > Extracted from "Initial patch". > > Signed-off-by: Kirill Tkhai > > (cherry picked from vz7 commit 88c087f1fdb4b0f7934804269df36035ab6b83eb) > Signed-off-by: Konstantin Khorenko Reviewed-by: Andrey Ryabinin ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh8 5/6] ve/proc/loadavg: Virtualize /proc/loadavg in Containers
The patch is based on following vz7 commits: ecdce58b214c ("sched: Export per task_group statistics_work") a58fb58bff1c ("Use ve init task's css instead of opening cgroup via vfs") 5f2a49a05629 ("sched/ve: Use cfs_rq::h_nr_running to count loadavg") vz8 rebase notes: 1) cpu cgroup vz specific file "proc.loadavg" has been dropped 2) "nr_running" field in /proc/loadavg inside a CT includes running realtime tasks (although they are not allowed to be run inside a CT) and tasks in D state (like on the Host) Signed-off-by: Konstantin Khorenko --- fs/proc/loadavg.c | 10 ++ include/linux/ve.h | 8 kernel/sched/core.c | 40 kernel/ve/ve.c | 16 4 files changed, 74 insertions(+) diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 40467c3ade86..b884a1a59a3d 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -9,10 +9,20 @@ #include #include #include +#include static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; + struct ve_struct *ve; + + ve = get_exec_env(); + if (!ve_is_super(ve)) { + int ret; + ret = ve_show_loadavg(ve, m); + if (ret != -ENOSYS) + return ret; + } get_avenrun(avnrun, FIXED_1/200, 0); diff --git a/include/linux/ve.h b/include/linux/ve.h index ec7dc522ac1f..0341bb915923 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -176,4 +176,12 @@ static inline void monotonic_ve_to_abs(clockid_t which_clock, #endif /* CONFIG_VE */ +struct seq_file; + +#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED) +int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p); +#else +static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; } +#endif + #endif /* _LINUX_VE_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6100bf3f625..0116742de578 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -41,6 +41,8 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT #endif +#include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */ + /* * Number 
of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. @@ -7134,6 +7136,44 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css, + struct seq_file *p) +{ + struct cgroup *cgrp = css->cgroup; + struct task_group *tg = css_tg(css); + unsigned long avnrun[3]; + int nr_running = 0; + int i; + + avnrun[0] = tg->avenrun[0] + FIXED_1/200; + avnrun[1] = tg->avenrun[1] + FIXED_1/200; + avnrun[2] = tg->avenrun[2] + FIXED_1/200; + + for_each_possible_cpu(i) { +#ifdef CONFIG_FAIR_GROUP_SCHED + nr_running += tg->cfs_rq[i]->h_nr_running; + /* +* We do not export nr_unint to parent task groups +* like we do for h_nr_running, as it gives additional +* overhead for activate/deactivate operations. So, we +* don't account child cgroup unint tasks here. +*/ + nr_running += tg->cfs_rq[i]->nr_unint; +#endif +#ifdef CONFIG_RT_GROUP_SCHED + nr_running += tg->rt_rq[i]->rt_nr_running; +#endif + } + + seq_printf(p, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), + nr_running, cgroup_task_count(cgrp), + idr_get_cursor(&task_active_pid_ns(current)->idr)); + return 0; +} + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 43e37b27e887..193fdb95daab 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -1147,3 +1147,19 @@ int vz_security_protocol_check(struct net *net, int protocol) } } EXPORT_SYMBOL_GPL(vz_security_protocol_check); + +#ifdef CONFIG_CGROUP_SCHED +int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css, + struct seq_file *p); + +int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) +{ + struct cgroup_subsys_state *css; + int err; + + css = ve_get_init_css(ve, cpu_cgrp_id); + err = cpu_cgroup_proc_loadavg(css, 
p); + css_put(css); + return err; +} +#endif /* CONFIG_CGROUP_SCHED */ -- 2.28.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh8 3/6] ve/sched/loadavg: Calculate avenrun for Containers root cpu cgroups
This patch is a part of vz7 commit (only avenrun part) 34a1dc1e4e3d ("sched: Account task_group::cpustat,taskstats,avenrun") Extracted from "Initial patch". Signed-off-by: Kirill Tkhai +++ ve/sched: Do not use kstat_glb_lock to update kstat_glob::nr_unint_avg kstat_glob::nr_unint_avg can't be updated in parallel on two or more cpus, so on modifications we have to protect against readers only. So, avoid using global kstat_glb_lock here, to minimize its sharing with another counters it protects. Signed-off-by: Kirill Tkhai (cherry picked from commit 715f311fdb4ab0b7922f9e53617c5821ae36bfaf) Signed-off-by: Konstantin Khorenko +++ sched/ve: Use cfs_rq::h_nr_running to count loadavg cfs_rq::nr_running contains number of child entities of one level below: tasks and cfs_rq, but it does not contain tasks from deeper levels. Use cfs_rq::h_nr_running instead as it contains number of tasks among all child hierarchy. https://jira.sw.ru/browse/PSBM-81572 Signed-off-by: Kirill Tkhai Reviewed-by: Andrey Ryabinin mFixes: 028c54e613a3 ("sched: Account task_group::avenrun") (cherry picked from vz7 commit 5f2a49a05629bd709ad6bfce83bfacc58a4db3d9) Signed-off-by: Konstantin Khorenko +++ sched/ve: Iterate only VE root cpu cgroups to count loadavg Counting loadavg we are interested in VE root cpu cgroup only, as it's analogy of node's loadavg. So, this patch makes iterate only such types of cpu cgroup, when we calc loadavg. Since this code called from interrupt, this may give positive performance resuts. https://jira.sw.ru/browse/PSBM-81572 Signed-off-by: Kirill Tkhai Reviewed-by: Andrey Ryabinin (cherry picked from vz7 commit 4140a241e5ec2230105f5c4513400a6b5ecea92f) Signed-off-by: Konstantin Khorenko +++ sched: Export calc_load_ve() This will be used in next patch. 
Signed-off-by: Kirill Tkhai = Patchset description: Make calc_load_ve() be executed out of jiffies_lock https://jira.sw.ru/browse/PSBM-84967 Kirill Tkhai (3): sched: Make calc_global_load() return true when it's need to update ve statistic sched: Export calc_load_ve() sched: Call calc_load_ve() out of jiffies_lock (cherry picked from vz7 commit 738b92fb2cdd6577925a6b7019925f320cd379df) Signed-off-by: Konstantin Khorenko +++ sched: Call calc_load_ve() out of jiffies_lock jiffies_lock is a big global seqlock, which is used in many places. In combination with another actions like smp call functions and readers of this seqlock, system may hang for a long time. There is already a pair of hard lockups because of long iteration in calc_load_ve() with jiffies_lock held, which made readers of this seqlock to spin long time. This patch makes calc_load_ve() to use separate lock, and this relaxes jiffies_lock. I think, this should be enough to resolve the problem, since both the crashes I saw contains readers of the seqlock on parallel cpus, and we won't have to relax further (say, moving calc_load_ve() to softirq). Note, that the principal change of this patch makes is jiffies_lock readers on parallel cpus won't wait till calc_load_ve() finishes, so instead of (n_readers + 1) cpus waiting till this function completes, there will be only 1 cpu doing that. https://jira.sw.ru/browse/PSBM-84967 Signed-off-by: Kirill Tkhai = Patchset description: Make calc_load_ve() be executed out of jiffies_lock https://jira.sw.ru/browse/PSBM-84967 Kirill Tkhai (3): sched: Make calc_global_load() return true when it's need to update ve statistic sched: Export calc_load_ve() sched: Call calc_load_ve() out of jiffies_lock +++ sched: really don't call calc_load_ve() under jiffies_lock Previously we've done all preparation work for calc_load_ve() not being executed under jiffies_lock, and thus not called from calc_global_load(), but forgot to drop the call in calc_global_load(). 
So now we still call expensive calc_load_ve() under the jiffies_lock and get NMI. Fix that. mFixes:19bc294a5691d ("sched: Call calc_load_ve() out of jiffies_lock") https://jira.sw.ru/browse/PSBM-102573 Signed-off-by: Konstantin Khorenko Signed-off-by: Valeriy Vdovin (cherry picked from vz7 commit 0610b98e5b6537d2ecd99522c3cbd1aa939565e7) Signed-off-by: Konstantin Khorenko --- include/linux/sched/loadavg.h | 8 ++ kernel/sched/loadavg.c| 50 +++ kernel/sched/sched.h | 1 + kernel/time/tick-common.c | 9 ++- kernel/time/tick-sched.c | 6 - kernel/time/timekeeping.c | 5 +++- 6 files changed, 76 insertions(+), 3 deletions(-) diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 34061919f880..1da5768389b7 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -16,6 +16,8 @@ */ extern unsigned long avenrun[];/* Load averages */ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); +extern vo
[Devel] [PATCH rh8 6/6] vzstat: Add kstat_glob.nr_unint_avg real accounting
This should be a part of commit 127bd48f3385 ("vzstat: Add vzstat module and kstat interfaces") but depends on task_group::avenrun accounting and thus goes separately. Signed-off-by: Konstantin Khorenko --- kernel/sched/loadavg.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c62f34033112..c76b1c842ad8 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -105,7 +105,7 @@ extern spinlock_t load_ve_lock; void calc_load_ve(void) { - unsigned long nr_active; + unsigned long nr_unint, nr_active; struct task_group *tg; int i; @@ -137,6 +137,14 @@ void calc_load_ve(void) tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active); tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active); } + + nr_unint = nr_uninterruptible() * FIXED_1; + + write_seqcount_begin(&kstat_glob.nr_unint_avg_seq); + calc_load(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); + calc_load(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); + calc_load(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); + write_seqcount_end(&kstat_glob.nr_unint_avg_seq); spin_unlock(&load_ve_lock); } #endif /* CONFIG_VE */ -- 2.28.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh8 4/6] ve/sysinfo/loadavg: Virtualize loadavg values in sysinfo()
Fixes: 688c65f8eaf1 ("ve: Virtualize sysinfo") TODO: move appropriate hunk to this commit from the commit above Signed-off-by: Konstantin Khorenko --- kernel/sys.c | 4 1 file changed, 4 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 2646c8041258..e7e07ea8d7ef 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2575,11 +2575,7 @@ static int do_sysinfo(struct sysinfo *info) info->procs = nr_threads_ve(ve); -#if 0 -FIXME after -715f311fdb4a ("sched: Account task_group::cpustat,taskstats,avenrun") is ported get_avenrun_ve(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); -#endif } /* -- 2.28.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh8 2/6] sched: Make calc_global_load() return true when it's need to update ve statistic
From: Kirill Tkhai This will be used in next patches to call calc_load_ve() out of jiffies lock. Signed-off-by: Kirill Tkhai = Patchset description: Make calc_load_ve() be executed out of jiffies_lock https://jira.sw.ru/browse/PSBM-84967 Kirill Tkhai (3): sched: Make calc_global_load() return true when it's need to update ve statistic sched: Export calc_load_ve() sched: Call calc_load_ve() out of jiffies_lock (cherry picked from vz commit b26208e2f8bae0bc539bef9f37d5fc650e47e092) Signed-off-by: Konstantin Khorenko --- include/linux/sched/loadavg.h | 4 +++- kernel/sched/loadavg.c| 5 +++-- kernel/time/timekeeping.c | 4 ++-- kernel/time/timekeeping.h | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 4859bea47a7b..34061919f880 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -2,6 +2,8 @@ #ifndef _LINUX_SCHED_LOADAVG_H #define _LINUX_SCHED_LOADAVG_H +#include + /* * These are the constant used to fake the fixed-point load-average * counting. Some notes: @@ -43,6 +45,6 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp, #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -extern void calc_global_load(unsigned long ticks); +extern bool calc_global_load(unsigned long ticks); #endif /* _LINUX_SCHED_LOADAVG_H */ diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index de22da666ac7..a7b373053dc4 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -347,14 +347,14 @@ static inline void calc_global_nohz(void) { } * * Called from the global timer code. */ -void calc_global_load(unsigned long ticks) +bool calc_global_load(unsigned long ticks) { unsigned long sample_window; long active, delta; sample_window = READ_ONCE(calc_load_update); if (time_before(jiffies, sample_window + 10)) - return; + return false; /* * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. 
@@ -377,6 +377,7 @@ void calc_global_load(unsigned long ticks) * catch up in bulk. */ calc_global_nohz(); + return true; } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4118179d8c75..bce92a9952f4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2192,10 +2192,10 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ -void do_timer(unsigned long ticks) +bool do_timer(unsigned long ticks) { jiffies_64 += ticks; - calc_global_load(ticks); + return calc_global_load(ticks); } /** diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..7b6cdb0563f4 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -15,7 +15,7 @@ extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); -extern void do_timer(unsigned long ticks); +extern bool do_timer(unsigned long ticks); extern void update_wall_time(void); extern seqlock_t jiffies_lock; -- 2.28.0 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh8 1/6] ve/sched: Link VE root cpu cgroups in a separate list
From: Kirill Tkhai The idea is to link small number of VE root cpu cgroups to a separate list. This allows to avoid unnecessary calculations of loadavg for VE children cpu cgroups in next patches, and it should positively improve the performance of calc_load_ve(). https://jira.sw.ru/browse/PSBM-81572 Signed-off-by: Kirill Tkhai Reviewed-by: Andrey Ryabinin (cherry picked from commit vz7 c9af0076fff0ac796dcbec8ef17424ae08a9f54d) Signed-off-by: Konstantin Khorenko +++ ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list Container's cpu cgroup is linked to "ve_root_list" on CT start. But if someone holds CT's cpu cgroup while CT is being stopped, next CT start tries to create same cpu cgroup (fails, already exists) and links this cpu cgroup to the "ve_root_list", thus corrupting it. As a consequence calc_load_ve() goes in an endless loop. Let's check if task_group has been already linked to the list and skip redundant linking. Locking scheme change: - drop rcu for list ve_root_list, we hold spinlocks anyway - use "load_ve_lock" spinlock for both list add/del/iterate, "task_group_lock" is unrelated here How to reproduce: # vzctl start 200 # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks # vzctl stop 200 // At this moment VE cgroup got destroyed, but cpu cgroup is still alive // and linked to "ve_root_list" list # vzctl start 200 // double add of same tg (same cpu cgroup) to "ve_root_list" list => // list corruption => endless loop in next calc_load_ve() call https://jira.sw.ru/browse/PSBM-88251 Signed-off-by: Konstantin Khorenko Acked-by: Kirill Tkhai Reviewed-by: Andrey Ryabinin v2 changes: - change locking scheme: drop rcu, use "load_ve_lock" everywhere - drop tg->linked field, check if linked using list_empty() [VvS RHEL77b rebase] (cherry picked from vz7 commit cba368b94c0ad159f676539f554e9cc9d53aedaa) Signed-off-by: Konstantin Khorenko --- include/linux/sched.h | 8 kernel/cgroup/cgroup.c | 1 + kernel/sched/core.c| 31 +++ kernel/sched/sched.h | 4 4 
files changed, 44 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4326aa24e9dc..cabed6a47a70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2015,4 +2015,12 @@ static inline void rseq_syscall(struct pt_regs *regs) #endif +#ifdef CONFIG_VE +struct cgroup_subsys_state; +extern void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css); +void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css); +#else /* CONFIG_VE */ +void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) { } +#endif /* CONFIG_VE */ + #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 08137d43f3ab..4ee3eb24b0d1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1894,6 +1894,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve) cgrp = link->cgrp; set_bit(CGRP_VE_ROOT, &cgrp->flags); } + link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); unlock: rcu_read_unlock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a57956d64d6..a6100bf3f625 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6581,6 +6581,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); +#ifdef CONFIG_VE + INIT_LIST_HEAD(&tg->ve_root_list); +#endif if (parent) sched_online_group(tg, parent); return 0; @@ -6590,6 +6593,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); + unlink_ve_root_cpu_cgroup(css); sched_offline_group(tg); } @@ -6677,6 +6681,33 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, return (u64) scale_load_down(tg->shares); } +#ifdef CONFIG_VE +LIST_HEAD(ve_root_list); +DEFINE_SPINLOCK(load_ve_lock); + +void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + unsigned long flags; + + spin_lock_irqsave(&load_ve_lock, flags); + BUG_ON(!(css->flags & CSS_ONLINE)); + 
if (list_empty(&tg->ve_root_list)) + list_add(&tg->ve_root_list, &ve_root_list); + spin_unlock_irqrestore(&load_ve_lock, flags); +} + +void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + unsigned long flags; + + spin_lock_irqsave(&load_ve_lock, flags); + list_del_init(&tg->ve_root_list); + spin_unlock_irqrestore(&load_ve_lock, flags); +} +#endif /* CONFIG_VE */ + #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b2f0c26b2c50..93bf1d78c27d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,6 +404,10
[Devel] [PATCH rh8 0/6] ve/sched/loadavg: loadavg virtualization
Current patchset is a rework of following vz7 patches: 5655edce75a2 vzstat: Add kstat_glob.nr_unint_avg real accounting 7ca32010adaa ve/proc/loadavg: Virtualize /proc/loadavg in Containers feba442cc064 sched: Call calc_load_ve() out of jiffies_lock 3c158be41cd2 sched: Export calc_load_ve() a113575a6c6e sched: Make calc_global_load() return true when it's need to update ve statistic 6fb0a9d805a1 sched/ve: Iterate only VE root cpu cgroups to count loadavg 71e893d4a552 sched/ve: Use cfs_rq::h_nr_running to count loadavg 028c54e613a3 sched: Account task_group::avenrun -> rename to ve/sched/loadavg: Calculate avenrun for Containers root cpu cgroups 72108f28ffca ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list 8d5159d1f0d7 sched/ve: Link VE root cpu cgroups in separate list loadavg values are virtualized in /proc/loadavg file and in sysinfo() output. cpu cgroup::proc.loadavg file has been dropped (presents in vz7, but seems nobody uses it) This patchset obsoletes previously sent patches: sched: Account task_group::avenrun vzstat: Add kstat_glob.nr_unint_avg real accounting Kirill Tkhai (2): ve/sched: Link VE root cpu cgroups in separate list sched: Make calc_global_load() return true when it's need to update ve statistic Konstantin Khorenko (4): ve/sched/loadavg: Calculate avenrun for Containers root cpu cgroups ve/sysinfo/loadavg: Virtualize loadavg values in sysinfo() ve/proc/loadavg: Virtualize /proc/loadavg in Containers vzstat: Add kstat_glob.nr_unint_avg real accounting fs/proc/loadavg.c | 10 + include/linux/sched.h | 8 include/linux/sched/loadavg.h | 12 +- include/linux/ve.h| 8 kernel/cgroup/cgroup.c| 1 + kernel/sched/core.c | 71 +++ kernel/sched/loadavg.c| 63 ++- kernel/sched/sched.h | 5 +++ kernel/sys.c | 4 -- kernel/time/tick-common.c | 9 - kernel/time/tick-sched.c | 6 ++- kernel/time/timekeeping.c | 9 +++-- kernel/time/timekeeping.h | 2 +- kernel/ve/ve.c| 16 14 files changed, 211 insertions(+), 13 deletions(-) -- 2.28.0 ___ Devel mailing list 
Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel