This completes AVX512F support in the insn emulator.

Note that in the test harness a little bit of trickery is needed to get
around the not fully consistent naming of the AVX512VL gather and
scatter built-ins: To suppress macro expansion of the "di" and "si"
tokens, BS() constructs them by token concatenation, unlike BG().
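
For illustration only (not part of the patch), a minimal self-contained
sketch of the technique, with all names made up for the example: the
operands of ## are exempt from macro expansion, so constructing "di"
from "d" and "i" keeps a clashing macro definition from being applied,
whereas a plain forwarding macro expands its arguments first.

    #include <stdio.h>

    #define di 123                 /* hypothetical clashing definition */

    #define STR_(x)   #x
    /* BG()-like forwarding layer: the argument is macro-expanded before
     * being substituted, so a literal "di" token gets replaced. */
    #define STR(x)    STR_(x)
    /* BS()-like layer: operands of ## are exempt from expansion, so the
     * "di" token is constructed intact and passed on. */
    #define PASTE(it) STR_(it##i)

    int main(void)
    {
        puts(STR(di));  /* prints "123" - the token was expanded */
        puts(PASTE(d)); /* prints "di"  - built by concatenation */
        return 0;
    }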

Signed-off-by: Jan Beulich <jbeul...@suse.com>
---
TBD: I couldn't really decide whether to duplicate code or merge scatter
     into gather emulation.
---
v7: Re-base.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -270,6 +270,8 @@ static const struct test avx512f_all[] =
     INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
     INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
     INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pscatterd,    66, 0f38, a0,    vl,     dq, el),
+    INSN(pscatterq,    66, 0f38, a1,    vl,     dq, el),
     INSN(pshufd,       66,   0f, 70,    vl,      d, vl),
     INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
     INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
@@ -305,6 +307,8 @@ static const struct test avx512f_all[] =
     INSN(rsqrt14,      66, 0f38, 4f,    el,     sd, el),
     INSN(scalef,       66, 0f38, 2c,    vl,     sd, vl),
     INSN(scalef,       66, 0f38, 2d,    el,     sd, el),
+    INSN(scatterd,     66, 0f38, a2,    vl,     sd, el),
+    INSN(scatterq,     66, 0f38, a3,    vl,     sd, el),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
--- a/tools/tests/x86_emulator/simd-sg.c
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -48,10 +48,14 @@ typedef long long __attribute__((vector_
 #  endif
 #  define BG_(dt, it, reg, mem, idx, msk, scl) \
     __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
+#  define BS_(dt, it, mem, idx, reg, msk, scl) \
+    __builtin_ia32_scatter##it##dt(mem, to_mask(msk), idx, reg, scl)
 # else
 #  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
 #  define BG_(dt, it, reg, mem, idx, msk, scl) \
     __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
+#  define BS_(dt, it, mem, idx, reg, msk, scl) \
+    __builtin_ia32_scatter##it##dt(mem, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), idx, reg, scl)
 # endif
 /*
  * Instead of replicating the main IDX_SIZE conditional below three times, use
@@ -59,6 +63,7 @@ typedef long long __attribute__((vector_
  * respective relevant macro argument tokens.
  */
 # define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
+# define BS(dt, it, mem, idx, reg, msk, scl) BS_(dt, it##i, mem, idx, reg, msk, scl)
 # if VEC_MAX < 64
 /*
  * The sub-512-bit built-ins have an extra "3" infix, presumably because the
@@ -82,22 +87,30 @@ typedef long long __attribute__((vector_
 # if IDX_SIZE == 4
 #  if INT_SIZE == 4
 #   define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16si, s, mem, idx, reg, msk, scl)
 #  elif INT_SIZE == 8
 #   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
+#   define scatter(mem, idx, reg, msk, scl) BS(v8di, s, mem, idx, (vdi_t)(reg), msk, scl)
 #  elif FLOAT_SIZE == 4
 #   define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, s, mem, idx, reg, msk, scl)
 #  elif FLOAT_SIZE == 8
 #   define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v8df, s, mem, idx, reg, msk, scl)
 #  endif
 # elif IDX_SIZE == 8
 #  if INT_SIZE == 4
 #   define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16si, d, mem, (idi_t)(idx), reg, msk, scl)
 #  elif INT_SIZE == 8
 #   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
+#   define scatter(mem, idx, reg, msk, scl) BS(v8di, d, mem, (idi_t)(idx), (vdi_t)(reg), msk, scl)
 #  elif FLOAT_SIZE == 4
 #   define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, d, mem, (idi_t)(idx), reg, msk, scl)
 #  elif FLOAT_SIZE == 8
 #   define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v8df, d, mem, (idi_t)(idx), reg, msk, scl)
 #  endif
 # endif
 #elif defined(__AVX2__)
@@ -195,6 +208,8 @@ const typeof((vec_t){}[0]) array[] = {
     GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
 };
 
+typeof((vec_t){}[0]) out[VEC_MAX * 2];
+
 int sg_test(void)
 {
     unsigned int i;
@@ -275,5 +290,41 @@ int sg_test(void)
 # endif
 #endif
 
+#ifdef scatter
+
+    for ( i = 0; i < sizeof(out) / sizeof(*out); ++i )
+        out[i] = 0;
+
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        x[i] = i + 1;
+
+    touch(x);
+
+    scatter(out, (idx_t){}, x, (vec_t){ 1 } != 0, 1);
+    if ( out[0] != 1 )
+        return __LINE__;
+    for ( i = 1; i < ITEM_COUNT; ++i )
+        if ( out[i] )
+            return __LINE__;
+
+    scatter(out, (idx_t){}, x, full, 1);
+    if ( out[0] != ITEM_COUNT )
+        return __LINE__;
+    for ( i = 1; i < ITEM_COUNT; ++i )
+        if ( out[i] )
+            return __LINE__;
+
+    scatter(out, idx, x, full, ELEM_SIZE);
+    for ( i = 1; i <= ITEM_COUNT; ++i )
+        if ( out[i] != i )
+            return __LINE__;
+
+    scatter(out, inv, x, full, ELEM_SIZE);
+    for ( i = 1; i <= ITEM_COUNT; ++i )
+        if ( out[i] != ITEM_COUNT + 1 - i )
+            return __LINE__;
+
+#endif
+
     return 0;
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -508,6 +508,7 @@ static const struct ext0f38_table {
     [0x9d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x9e] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x9f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xa0 ... 0xa3] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
     [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0xa9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xaa] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -9319,6 +9320,102 @@ x86_emulate(
             avx512_vlen_check(true);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa0): /* vpscatterd{d,q} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa1): /* vpscatterq{d,q} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa2): /* vscatterdp{s,d} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa3): /* vscatterqp{s,d} [xyz]mm,mem{k} */
+    {
+        typeof(evex) *pevex;
+        union {
+            int32_t dw[16];
+            int64_t qw[8];
+        } index;
+        bool done = false;
+
+        ASSERT(ea.type == OP_MEM);
+        fail_if(!ops->write);
+        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
+                               evex.reg != 0xf ||
+                               modrm_reg == state->sib_index),
+                              EXC_UD);
+        avx512_vlen_check(false);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        /* Read source and index registers. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa{32,64} */
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pevex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
+        pevex->w = b & 1;
+        /* Switch to sib_index as source. */
+        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+        put_stub(stub);
+
+        /* Clear untouched parts of the mask value. */
+        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
+        op_bytes = 4 << evex.w;
+        op_mask &= (1 << n) - 1;
+
+        for ( i = 0; op_mask; ++i )
+        {
+            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+            if ( !(op_mask & (1 << i)) )
+                continue;
+
+            rc = ops->write(ea.mem.seg,
+                            truncate_ea(ea.mem.off + (idx << state->sib_scale)),
+                            (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+            if ( rc != X86EMUL_OKAY )
+            {
+                /* See comment in gather emulation. */
+                if ( rc != X86EMUL_EXCEPTION && done )
+                    rc = X86EMUL_RETRY;
+                break;
+            }
+
+            op_mask &= ~(1 << i);
+            done = true;
+
+#ifdef __XEN__
+            if ( op_mask && local_events_need_delivery() )
+            {
+                rc = X86EMUL_RETRY;
+                break;
+            }
+#endif
+        }
+
+        /* Write mask register. See comment in gather emulation. */
+        opc = get_stub(stub);
+        opc[0] = 0xc5;
+        opc[1] = 0xf8;
+        opc[2] = 0x90;
+        /* Use (%rax) as source. */
+        opc[3] = evex.opmsk << 3;
+        opc[4] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+        put_stub(stub);
+
+        state->simd_size = simd_none;
+        break;
+    }
+
     case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */
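
Purely as an aside (not part of the patch), the continuation scheme the
scatter loop above uses can be reduced to the following stand-alone
sketch; all names here are made up. Completed elements get dropped from
the mask as they are written, so a later failure can be converted to
RETRY and the instruction restarted without redoing earlier stores:

    typedef enum { OKAY, EXCEPTION, RETRY } rc_t;

    /* write_one() stands in for ops->write(); events_pending() for
     * local_events_need_delivery(). Both are assumptions for the
     * sketch, not real hooks with these signatures. */
    static rc_t scatter_elems(unsigned int nelem, unsigned int *mask,
                              rc_t (*write_one)(unsigned int elem),
                              int (*events_pending)(void))
    {
        int done = 0;
        unsigned int i;

        for ( i = 0; i < nelem && *mask; ++i )
        {
            rc_t rc;

            if ( !(*mask & (1u << i)) )
                continue;

            rc = write_one(i);
            if ( rc != OKAY )
                /* Once any element was committed, anything that is not
                 * an architectural exception must become RETRY. */
                return rc != EXCEPTION && done ? RETRY : rc;

            *mask &= ~(1u << i); /* element done - drop it from the mask */
            done = 1;

            if ( *mask && events_pending() )
                return RETRY;    /* let events be delivered; restart later */
        }

        return OKAY;
    }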
