In pcs7 we support so-called cpuid masking via the procfs "/proc/vz/cpuid_override" interface. Thus any attempt to execute the cpuid() instruction inside a container causes a kernel fault, and the fault handler provides the results taken from the entry above.
This works well inside a container, but we do the checkpoint via criu on ve0, where masking does not happen. Thus on restore we may hit a situation where cpu features saved in the CRIU image are not supported inside the container. There was a conversation about how to make it more convenient: - modify the kernel to cause a cpuid fault even on the node - modify criu code so it could take cpu feature options from the command line - run criu inside the container when checkpointing solely for cpu feature testing, then jump out and proceed with a natural suspend All the above looks too complicated from my POV: what we need is simply to teach criu to read the aforementioned "/proc/vz/cpuid_override" procfs entry and use this information when we fetch cpu features. https://jira.sw.ru/browse/PSBM-47748 Signed-off-by: Cyrill Gorcunov <gorcu...@virtuozzo.com> CC: Dmitry Mishin <d...@virtuozzo.com> CC: Andrey Vagin <ava...@virtuozzo.com> CC: Pavel Emelianov <xe...@virtuozzo.com> CC: Vladimir Davydov <vdavy...@virtuozzo.com> CC: Konstantin Khorenko <khore...@virtuozzo.com> CC: Igor Sukhih <i...@virtuozzo.com> --- criu/arch/x86/cpu.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 161 insertions(+), 11 deletions(-) diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index 0b90c0c..44f3e18 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -52,6 +52,153 @@ bool cpu_has_feature(unsigned int feature) return test_cpu_cap(&rt_cpu_info, feature); } +/* + * VZ specific cpuid VE masking: the kernel provides + * the following entry /proc/vz/cpuid_override which + * carries text representation of cpuid masking which + * works via cpuid faulting inside kernel in the + * following format: + * + * op count eax ebx ecx edx + * 0x%08x 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x + * + * the @count is optional. 
+ */ + +typedef struct { + unsigned int op; + unsigned int count; + bool has_count; + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +} vz_cpuid_override_entry_t; + +static vz_cpuid_override_entry_t *vz_cpuid_override_entries; +static unsigned int nr_vz_cpuid_override_entries; + +static int vz_cpu_parse_cpuid_override(void) +{ + static const char path[] = "/proc/vz/cpuid_override"; + int ret = -1; + char s[256]; + FILE *f; + + pr_debug("Parsing %s\n", path); + + f = fopen(path, "r"); + if (!f) { + pr_info("Can't access %s, ignoring\n", path); + return 0; + } + + while (fgets(s, sizeof(s), f)) { + vz_cpuid_override_entry_t e = { 0 }; /* the count-less form never writes e.count, yet lookup compares it against 0 */ + + if (sscanf(s, "%x %x: %x %x %x %x", + &e.op, &e.count, &e.eax, + &e.ebx, &e.ecx, &e.edx) == 6) + e.has_count = true; + else if (sscanf(s, "%x: %x %x %x %x", + &e.op, &e.eax, &e.ebx, + &e.ecx, &e.edx) == 5) + e.has_count = false; + else { + pr_warn("Unexpected format in %s (%s)\n", path, s); + break; /* stop parsing; entries read so far are kept and 0 is returned */ + } + + if (xrealloc_safe(&vz_cpuid_override_entries, + (nr_vz_cpuid_override_entries + 1) * sizeof(e))) { + goto out; + } + + pr_debug("Got cpuid override: %x %x: %x %x %x %x\n", + e.op, e.count, e.eax, e.ebx, e.ecx, e.edx); + + vz_cpuid_override_entries[nr_vz_cpuid_override_entries++] = e; + } + + ret = 0; +out: + fclose(f); + return ret; +} + +static vz_cpuid_override_entry_t *vz_cpuid_override_lookup(unsigned int op, + bool has_count, + unsigned int count) +{ + size_t i; + + for (i = 0; i < nr_vz_cpuid_override_entries; i++) { + if (vz_cpuid_override_entries[i].op != op || + vz_cpuid_override_entries[i].has_count != has_count || + count != vz_cpuid_override_entries[i].count) + continue; + return &vz_cpuid_override_entries[i]; + } + + return NULL; +} + +static inline void vz_cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + vz_cpuid_override_entry_t *e; + + e = vz_cpuid_override_lookup(op, false, 0); + if (e) { + *eax = e->eax; + *ebx = e->ebx; + *ecx = 
e->ecx; + *edx = e->edx; + } else + cpuid(op, eax, ebx, ecx, edx); +} + +static inline void vz_cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + vz_cpuid_override_entry_t *e; + + e = vz_cpuid_override_lookup(op, true, count); + if (e) { + *eax = e->eax; + *ebx = e->ebx; + *ecx = e->ecx; + *edx = e->edx; + } else + cpuid_count(op, count, eax, ebx, ecx, edx); +} + +static inline unsigned int vz_cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + vz_cpuid(op, &eax, &ebx, &ecx, &edx); + return eax; +} + +static inline unsigned int vz_cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + vz_cpuid(op, &eax, &ebx, &ecx, &edx); + return ecx; +} + +static inline unsigned int vz_cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + vz_cpuid(op, &eax, &ebx, &ecx, &edx); + return edx; +} + static int cpu_init_cpuid(struct cpuinfo_x86 *c) { /* @@ -62,7 +209,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c) */ /* Get vendor name */ - cpuid(0x00000000, + vz_cpuid(0x00000000, (unsigned int *)&c->cpuid_level, (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], @@ -84,7 +231,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { u32 eax, ebx, ecx, edx; - cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + vz_cpuid(0x00000001, &eax, &ebx, &ecx, &edx); c->x86_family = (eax >> 8) & 0xf; c->x86_model = (eax >> 4) & 0xf; c->x86_mask = eax & 0xf; @@ -102,7 +249,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000007) { u32 eax, ebx, ecx, edx; - cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + vz_cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[9] = ebx; c->x86_capability[11] = ecx; } @@ -111,17 +258,17 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x0000000d) { u32 eax, ebx, ecx, edx; - cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, 
&edx); + vz_cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); c->x86_capability[10] = eax; } /* AMD-defined flags: level 0x80000001 */ - c->extended_cpuid_level = cpuid_eax(0x80000000); + c->extended_cpuid_level = vz_cpuid_eax(0x80000000); if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000) { if (c->extended_cpuid_level >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); + c->x86_capability[1] = vz_cpuid_edx(0x80000001); + c->x86_capability[6] = vz_cpuid_ecx(0x80000001); } } @@ -135,9 +282,9 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c) unsigned int *v; char *p, *q; v = (unsigned int *)c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + vz_cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + vz_cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + vz_cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; /* @@ -188,7 +335,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c) u32 level; /* On C+ stepping K8 rep microcode works well for copy/memset */ - level = cpuid_eax(1); + level = vz_cpuid_eax(1); if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } @@ -200,6 +347,9 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c) int cpu_init(void) { + if (vz_cpu_parse_cpuid_override()) + return -1; + if (cpu_init_cpuid(&rt_cpu_info)) return -1; -- 2.5.5 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel