On 30.10.2020 14:45, Andrey Ryabinin wrote: > From: Vladimir Davydov <vdavy...@virtuozzo.com> > > Port diff-arch-x86-introduce-cpuid-override > > Recent Intel CPUs rejected CPUID masking, which is required for flex > migration, in favor of CPUID faulting. So we need to support it in > kernel. > > This patch adds user writable file /proc/vz/cpuid_override, which > contains CPUID override table. Each table entry must have the following > format: > > op[ count]: eax ebx ecx edx > > where @op and optional @count define a CPUID function, whose output one > would like to override (@op and @count are loaded to EAX and ECX > registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx - > the desired CPUID output for the specified function. All values must be > in HEX, 0x prefix is optional. > > Notes: > > - the file is only present on hosts that support CPUID faulting; > - CPUID faulting is always enabled if it is supported; > - CPUID output is overridden on all present CPUs; > - the maximal number of entries one can override equals 16; > - each write(2) to the file removes all existing entries before adding > new ones, so the whole table must be written in one write(2); in > particular writing an empty line to the file removes all existing > rules. > > Example: > > Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP > (CPUID.80000001H:EDX:27). 
Then we should execute the following sequence: > > - get the current cpuid value: > > # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2 > 0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff > edx=0xbfebfbff > 0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 > edx=0x2c100800 > > - clear the feature bits we want to mask out and write the result to > /proc/vz/cpuid_override: > > # cat >/proc/vz/cpuid_override <<EOF > 0x00000001: 0x000306e4 0x00200800 0x7fbee3ff 0xbbebfbff > 0x80000001: 0x00000000 0x00000000 0x00000001 0x24100800 > EOF > > - check that cpuid output was overridden: > > # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2 > 0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff > edx=0xbbebfbff > 0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 > edx=0x24100800 > > https://jira.sw.ru/browse/PSBM-28682 > > Signed-off-by: Vladimir Davydov <vdavy...@parallels.com> > > Acked-by: Cyrill Gorcunov <gorcu...@parallels.com> > ============================================================================= > > https://jira.sw.ru/browse/PSBM-33638 > > Signed-off-by: Vladimir Davydov <vdavy...@virtuozzo.com> > Rebase: > Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com> > > https://jira.sw.ru/browse/PSBM-121823 > [aryabinin: vz8 rebase] > Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
For the series: Reviewed-by: Kirill Tkhai <ktk...@virtuozzo.com> > --- > arch/x86/include/asm/msr-index.h | 1 + > arch/x86/include/asm/traps.h | 2 + > arch/x86/kernel/Makefile | 1 + > arch/x86/kernel/cpu/proc.c | 4 + > arch/x86/kernel/cpuid_fault.c | 258 +++++++++++++++++++++++++++++++ > arch/x86/kernel/traps.c | 24 +++ > 6 files changed, 290 insertions(+) > create mode 100644 arch/x86/kernel/cpuid_fault.c > > diff --git a/arch/x86/include/asm/msr-index.h > b/arch/x86/include/asm/msr-index.h > index 6a21c227775c..9668ec6a064d 100644 > --- a/arch/x86/include/asm/msr-index.h > +++ b/arch/x86/include/asm/msr-index.h > @@ -114,6 +114,7 @@ > > #define MSR_IA32_BBL_CR_CTL 0x00000119 > #define MSR_IA32_BBL_CR_CTL3 0x0000011e > +#define MSR_MISC_FEATURES_ENABLES 0x00000140 > > #define MSR_IA32_TSX_CTRL 0x00000122 > #define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ > diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h > index 0ae298ea01a1..0282c81719e7 100644 > --- a/arch/x86/include/asm/traps.h > +++ b/arch/x86/include/asm/traps.h > @@ -124,6 +124,8 @@ void __noreturn handle_stack_overflow(const char *message, > unsigned long fault_address); > #endif > > +void do_cpuid_fault(struct pt_regs *); > + > /* Interrupts/Exceptions */ > enum { > X86_TRAP_DE = 0, /* 0, Divide-by-zero */ > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile > index 431d8c6e641d..b9451b653b04 100644 > --- a/arch/x86/kernel/Makefile > +++ b/arch/x86/kernel/Makefile > @@ -63,6 +63,7 @@ obj-y += pci-iommu_table.o > obj-y += resource.o > obj-y += irqflags.o > obj-y += spec_ctrl.o > +obj-y += cpuid_fault.o > > obj-y += process.o > obj-y += fpu/ > diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c > index 2c8522a39ed5..d6b17a60acf6 100644 > --- a/arch/x86/kernel/cpu/proc.c > +++ b/arch/x86/kernel/cpu/proc.c > @@ -54,6 +54,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct > cpuinfo_x86 *c) > } > #endif > > +extern void 
__do_cpuid_fault(unsigned int op, unsigned int count, > + unsigned int *eax, unsigned int *ebx, > + unsigned int *ecx, unsigned int *edx); > + > static int show_cpuinfo(struct seq_file *m, void *v) > { > struct cpuinfo_x86 *c = v; > diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c > new file mode 100644 > index 000000000000..339e2638c3b8 > --- /dev/null > +++ b/arch/x86/kernel/cpuid_fault.c > @@ -0,0 +1,258 @@ > +#include <linux/gfp.h> > +#include <linux/slab.h> > +#include <linux/spinlock.h> > +#include <linux/rcupdate.h> > +#include <linux/module.h> > +#include <linux/proc_fs.h> > +#include <linux/seq_file.h> > +#include <linux/ve.h> > +#include <asm/uaccess.h> > + > +struct cpuid_override_entry { > + unsigned int op; > + unsigned int count; > + bool has_count; > + unsigned int eax; > + unsigned int ebx; > + unsigned int ecx; > + unsigned int edx; > +}; > + > +#define MAX_CPUID_OVERRIDE_ENTRIES 16 > + > +struct cpuid_override_table { > + struct rcu_head rcu_head; > + int size; > + struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES]; > +}; > + > +static struct cpuid_override_table __rcu *cpuid_override __read_mostly; > +static DEFINE_SPINLOCK(cpuid_override_lock); > + > +static void cpuid_override_update(struct cpuid_override_table *new_table) > +{ > + struct cpuid_override_table *old_table; > + > + spin_lock(&cpuid_override_lock); > + old_table = rcu_access_pointer(cpuid_override); > + rcu_assign_pointer(cpuid_override, new_table); > + spin_unlock(&cpuid_override_lock); > + > + if (old_table) > + kfree_rcu(old_table, rcu_head); > +} > + > +static bool cpuid_override_match(unsigned int op, unsigned int count, > + unsigned int *eax, unsigned int *ebx, > + unsigned int *ecx, unsigned int *edx) > +{ > + bool ret = false; > + struct cpuid_override_table *t; > + struct cpuid_override_entry *e; > + int i; > + > + rcu_read_lock(); > + t = rcu_dereference(cpuid_override); > + if (!t) > + goto out; > + > + for (i = 0; i < t->size; i++) 
{ > + e = &t->entries[i]; > + if (e->op != op) > + continue; > + if (e->has_count && e->count != count) > + continue; > + *eax = e->eax; > + *ebx = e->ebx; > + *ecx = e->ecx; > + *edx = e->edx; > + ret = true; > + break; > + } > +out: > + rcu_read_unlock(); > + return ret; > +} > + > +void __do_cpuid_fault(unsigned int op, unsigned int count, > + unsigned int *eax, unsigned int *ebx, > + unsigned int *ecx, unsigned int *edx) > +{ > + /* check if op is overridden */ > + if (cpuid_override_match(op, count, eax, ebx, ecx, edx)) > + return; > + > + /* fallback to real cpuid */ > + cpuid_count(op, count, eax, ebx, ecx, edx); > +} > + > +void do_cpuid_fault(struct pt_regs *regs) > +{ > + unsigned int eax, ebx, ecx, edx; > + > + __do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx); > + > + regs->ax = eax; > + regs->bx = ebx; > + regs->cx = ecx; > + regs->dx = edx; > +} > + > +/* > + * CPUID override entry format: > + * > + * op[ count]: eax ebx ecx edx > + * > + * All values are in HEX. 
> + */ > +static int cpuid_override_entry_parse(const char *s, char **endp, > + struct cpuid_override_entry *e) > +{ > + int taken; > + char *end; > + > + if (sscanf(s, "%x %x: %x %x %x %x%n", > + &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx, > + &taken) == 6) > + e->has_count = true; > + else if (sscanf(s, "%x: %x %x %x %x%n", > + &e->op, &e->eax, &e->ebx, &e->ecx, &e->edx, > + &taken) == 5) > + e->has_count = false; > + else > + return -EINVAL; > + > + end = (char *)s + taken; > + if (*end) { > + if (*end != '\n') > + return -EINVAL; > + ++end; > + } > + *endp = end; > + return 0; > +} > + > +static ssize_t cpuid_override_write(struct file *file, const char __user > *buf, > + size_t count, loff_t *ppos) > +{ > + struct cpuid_override_table *t = NULL; > + void *page = NULL; > + char *s; > + int err; > + > + err = -E2BIG; > + if (count >= PAGE_SIZE) > + goto out; > + > + err = -ENOMEM; > + t = kmalloc(sizeof(*t), GFP_KERNEL); > + if (!t) > + goto out; > + > + page = (void *)__get_free_page(GFP_KERNEL); > + if (!page) > + goto out; > + > + err = copy_from_user(page, buf, count); > + if (err) > + goto out; > + > + s = page; > + s[count] = '\0'; > + t->size = 0; > + while (*(s = skip_spaces(s))) { > + err = -E2BIG; > + if (t->size == MAX_CPUID_OVERRIDE_ENTRIES) > + goto out; > + err = -EINVAL; > + if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++])) > + goto out; > + } > + if (!t->size) { > + kfree(t); > + t = NULL; > + } > + err = 0; > +out: > + free_page((unsigned long)page); > + > + if (!err) > + cpuid_override_update(t); > + else > + kfree(t); > + > + return err ?: count; > +} > + > +static void *__cpuid_override_seq_start(loff_t pos) > +{ > + struct cpuid_override_table *t = rcu_dereference(cpuid_override); > + return t && pos < t->size ? 
&t->entries[pos] : NULL; > +} > + > +static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos) > +{ > + rcu_read_lock(); > + return __cpuid_override_seq_start(*ppos); > +} > + > +static void *cpuid_override_seq_next(struct seq_file *seq, > + void *v, loff_t *ppos) > +{ > + ++*ppos; > + return __cpuid_override_seq_start(*ppos); > +} > + > +static void cpuid_override_seq_stop(struct seq_file *s, void *v) > +{ > + rcu_read_unlock(); > +} > + > +static int cpuid_override_seq_show(struct seq_file *s, void *v) > +{ > + struct cpuid_override_entry *e = v; > + > + seq_printf(s, "0x%08x", e->op); > + if (e->has_count) > + seq_printf(s, " 0x%08x", e->count); > + seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n", > + e->eax, e->ebx, e->ecx, e->edx); > + return 0; > +} > + > +static struct seq_operations cpuid_override_seq_ops = { > + .start = cpuid_override_seq_start, > + .next = cpuid_override_seq_next, > + .stop = cpuid_override_seq_stop, > + .show = cpuid_override_seq_show, > +}; > + > +static int cpuid_override_seq_open(struct inode *inode, struct file *file) > +{ > + return seq_open(file, &cpuid_override_seq_ops); > +} > + > +static struct file_operations proc_cpuid_override_ops = { > + .owner = THIS_MODULE, > + .open = cpuid_override_seq_open, > + .read = seq_read, > + .llseek = seq_lseek, > + .release = seq_release, > + .write = cpuid_override_write, > +}; > + > +static int __init cpuid_fault_init(void) > +{ > + struct proc_dir_entry *proc; > + > + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) > + return 0; > + > + proc = proc_create("cpuid_override", 0644, proc_vz_dir, > + &proc_cpuid_override_ops); > + if (!proc) > + return -ENOMEM; > + > + return 0; > +} > +module_init(cpuid_fault_init); > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c > index 4b96d9a574ff..c43e3b80e50f 100644 > --- a/arch/x86/kernel/traps.c > +++ b/arch/x86/kernel/traps.c > @@ -518,6 +518,27 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long > error_code) > 
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); > } > > +static int check_cpuid_fault(struct pt_regs *regs, long error_code) > +{ > + unsigned long addr; > + unsigned short opcode; > + > + if (error_code != 0) > + return 0; > + > + addr = convert_ip_to_linear(current, regs); > + if (get_user(opcode, (unsigned short __user *)addr)) > + return 0; > + > + if (opcode != 0xa20f) > + return 0; > + > + do_cpuid_fault(regs); > + > + regs->ip += 2; > + return 1; > +} > + > dotraplinkage void > do_general_protection(struct pt_regs *regs, long error_code) > { > @@ -551,6 +572,9 @@ do_general_protection(struct pt_regs *regs, long > error_code) > return; > } > > + if (check_cpuid_fault(regs, error_code)) > + return; > + > tsk->thread.error_code = error_code; > tsk->thread.trap_nr = X86_TRAP_GP; > > _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel