In pcs7 we support so-called cpuid masking via the
procfs "/proc/vz/cpuid_override" interface. Thus
any attempt to execute the cpuid instruction inside a container
causes a kernel fault, and the kernel returns the results
configured in the entry above.

This works well inside a container, but we do the checkpoint
via criu on ve0, where masking does not happen. Thus
on restore we may hit a situation where the cpu features
saved in the CRIU image are not supported inside the container.

There was a discussion about how to make this more convenient:

 - modify the kernel to cause a cpuid fault even on the node
 - modify the criu code so it can take cpu feature options
   from the command line
 - run criu inside the container when checkpointing, solely for
   cpu feature testing, then jump out and proceed with a normal
   suspend

All of the above looks too complicated from my POV: what we
need is simply to teach criu to read the aforementioned
"/proc/vz/cpuid_override" procfs entry and use this information
when we fetch cpu features.

https://jira.sw.ru/browse/PSBM-47748

Signed-off-by: Cyrill Gorcunov <gorcu...@virtuozzo.com>
CC: Dmitry Mishin <d...@virtuozzo.com>
CC: Andrey Vagin <ava...@virtuozzo.com>
CC: Pavel Emelianov <xe...@virtuozzo.com>
CC: Vladimir Davydov <vdavy...@virtuozzo.com>
CC: Konstantin Khorenko <khore...@virtuozzo.com>
CC: Igor Sukhih <i...@virtuozzo.com>
---
 criu/arch/x86/cpu.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 161 insertions(+), 11 deletions(-)

diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c
index 0b90c0c..44f3e18 100644
--- a/criu/arch/x86/cpu.c
+++ b/criu/arch/x86/cpu.c
@@ -52,6 +52,153 @@ bool cpu_has_feature(unsigned int feature)
        return test_cpu_cap(&rt_cpu_info, feature);
 }
 
+/*
+ * VZ specific cpuid VE masking: the kernel provides
+ * the following entry /proc/vz/cpuid_override which
+ * carries a text representation of the cpuid masking,
+ * which works via cpuid faulting inside the kernel, in
+ * the following format:
+ *
+ *     op     count   eax    ebx    ecx    edx
+ *     0x%08x 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x
+ *
+ * the @count is optional.
+ */
+
+typedef struct {
+       unsigned int    op;             /* cpuid op this entry overrides */
+       unsigned int    count;          /* cpuid count; meaningful only if @has_count */
+       bool            has_count;      /* the parsed line carried the optional count */
+       unsigned int    eax;            /* eax..edx: values handed out for @op */
+       unsigned int    ebx;
+       unsigned int    ecx;
+       unsigned int    edx;
+} vz_cpuid_override_entry_t;
+
+static vz_cpuid_override_entry_t *vz_cpuid_override_entries; /* parsed entries */
+static unsigned int nr_vz_cpuid_override_entries; /* number of parsed entries */
+
+/*
+ * Load /proc/vz/cpuid_override into vz_cpuid_override_entries. A missing
+ * file or a malformed line is not fatal; -1 only if the table can't grow.
+ */
+static int vz_cpu_parse_cpuid_override(void)
+{
+       static const char path[] = "/proc/vz/cpuid_override";
+       int ret = -1;
+       char s[256];
+       FILE *f;
+
+       pr_debug("Parsing %s\n", path);
+       f = fopen(path, "r");
+       if (!f) {
+               pr_info("Can't access %s, ignoring\n", path);
+               return 0;
+       }
+
+       while (fgets(s, sizeof(s), f)) {
+               vz_cpuid_override_entry_t e = { 0 }; /* count is 0 if absent */
+
+               if (sscanf(s, "%x %x: %x %x %x %x", &e.op, &e.count,
+                          &e.eax, &e.ebx, &e.ecx, &e.edx) == 6)
+                       e.has_count = true;
+               else if (sscanf(s, "%x: %x %x %x %x", &e.op, &e.eax,
+                               &e.ebx, &e.ecx, &e.edx) == 5)
+                       e.has_count = false;
+               else {
+                       pr_warn("Unexpected format in %s (%s)\n", path, s);
+                       break;
+               }
+
+               if (xrealloc_safe(&vz_cpuid_override_entries,
+                                 (nr_vz_cpuid_override_entries + 1) * sizeof(e)))
+                       goto out;
+
+               pr_debug("Got cpuid override: %x %x: %x %x %x %x\n",
+                          e.op, e.count, e.eax, e.ebx, e.ecx, e.edx);
+
+               vz_cpuid_override_entries[nr_vz_cpuid_override_entries++] = e;
+       }
+
+       ret = 0;
+out:
+       fclose(f);
+       return ret;
+}
+
+/* Look up the override for @op; @count is compared only if @has_count. */
+static vz_cpuid_override_entry_t *vz_cpuid_override_lookup(unsigned int op,
+                                                          bool has_count,
+                                                          unsigned int count)
+{
+       vz_cpuid_override_entry_t *e = vz_cpuid_override_entries;
+       size_t i;
+
+       for (i = 0; i < nr_vz_cpuid_override_entries; i++, e++) {
+               /* .count is meaningful only for entries parsed with a count */
+               if (e->op == op && e->has_count == has_count &&
+                   (!has_count || e->count == count))
+                       return e;
+       }
+       return NULL;
+}
+
+static inline void vz_cpuid(unsigned int op,
+                           unsigned int *eax, unsigned int *ebx,
+                           unsigned int *ecx, unsigned int *edx)
+{
+       vz_cpuid_override_entry_t *ovr = vz_cpuid_override_lookup(op, false, 0);
+
+       if (!ovr) {     /* no count-less override for @op: plain cpuid() */
+               cpuid(op, eax, ebx, ecx, edx);
+               return;
+       }
+       *eax = ovr->eax;
+       *ebx = ovr->ebx;
+       *ecx = ovr->ecx;
+       *edx = ovr->edx;
+}
+
+static inline void vz_cpuid_count(unsigned int op, int count,
+                                 unsigned int *eax, unsigned int *ebx,
+                                 unsigned int *ecx, unsigned int *edx)
+{
+       vz_cpuid_override_entry_t *ovr = vz_cpuid_override_lookup(op, true, count);
+
+       if (!ovr) {     /* no override for @op/@count: plain cpuid_count() */
+               cpuid_count(op, count, eax, ebx, ecx, edx);
+               return;
+       }
+       *eax = ovr->eax;
+       *ebx = ovr->ebx;
+       *ecx = ovr->ecx;
+       *edx = ovr->edx;
+}
+
+static inline unsigned int vz_cpuid_eax(unsigned int op)
+{
+       unsigned int regs[4];   /* eax, ebx, ecx, edx */
+
+       vz_cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
+       return regs[0];
+}
+
+static inline unsigned int vz_cpuid_ecx(unsigned int op)
+{
+       unsigned int regs[4];   /* eax, ebx, ecx, edx */
+
+       vz_cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
+       return regs[2];
+}
+
+static inline unsigned int vz_cpuid_edx(unsigned int op)
+{
+       unsigned int regs[4];   /* eax, ebx, ecx, edx */
+
+       vz_cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
+       return regs[3];
+}
+
 static int cpu_init_cpuid(struct cpuinfo_x86 *c)
 {
        /*
@@ -62,7 +209,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c)
         */
 
        /* Get vendor name */
-       cpuid(0x00000000,
+       vz_cpuid(0x00000000,
              (unsigned int *)&c->cpuid_level,
              (unsigned int *)&c->x86_vendor_id[0],
              (unsigned int *)&c->x86_vendor_id[8],
@@ -84,7 +231,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c)
        if (c->cpuid_level >= 0x00000001) {
                u32 eax, ebx, ecx, edx;
 
-               cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+               vz_cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
                c->x86_family = (eax >> 8) & 0xf;
                c->x86_model = (eax >> 4) & 0xf;
                c->x86_mask = eax & 0xf;
@@ -102,7 +249,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c)
        if (c->cpuid_level >= 0x00000007) {
                u32 eax, ebx, ecx, edx;
 
-               cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+               vz_cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
                c->x86_capability[9] = ebx;
                c->x86_capability[11] = ecx;
        }
@@ -111,17 +258,17 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c)
        if (c->cpuid_level >= 0x0000000d) {
                u32 eax, ebx, ecx, edx;
 
-               cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+               vz_cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
                c->x86_capability[10] = eax;
        }
 
        /* AMD-defined flags: level 0x80000001 */
-       c->extended_cpuid_level = cpuid_eax(0x80000000);
+       c->extended_cpuid_level = vz_cpuid_eax(0x80000000);
 
        if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000) {
                if (c->extended_cpuid_level >= 0x80000001) {
-                       c->x86_capability[1] = cpuid_edx(0x80000001);
-                       c->x86_capability[6] = cpuid_ecx(0x80000001);
+                       c->x86_capability[1] = vz_cpuid_edx(0x80000001);
+                       c->x86_capability[6] = vz_cpuid_ecx(0x80000001);
                }
        }
 
@@ -135,9 +282,9 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c)
                unsigned int *v;
                char *p, *q;
                v = (unsigned int *)c->x86_model_id;
-               cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
-               cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
-               cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+               vz_cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+               vz_cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+               vz_cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
                c->x86_model_id[48] = 0;
 
                /*
@@ -188,7 +335,7 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c)
                        u32 level;
 
                        /* On C+ stepping K8 rep microcode works well for 
copy/memset */
-                       level = cpuid_eax(1);
+                       level = vz_cpuid_eax(1);
                        if ((level >= 0x0f48 && level < 0x0f50) || level >= 
0x0f58)
                                set_cpu_cap(c, X86_FEATURE_REP_GOOD);
                }
@@ -200,6 +347,9 @@ static int cpu_init_cpuid(struct cpuinfo_x86 *c)
 
 int cpu_init(void)
 {
+       if (vz_cpu_parse_cpuid_override())
+               return -1;
+
        if (cpu_init_cpuid(&rt_cpu_info))
                return -1;
 
-- 
2.5.5

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to