Re: [PATCH v1 3/7] contrib/gitdm: add Paul to individual contributors
Yes, I'm happy for p...@codesourcery.com to be linked to my current email for attribution purposes. Paul On 26 September 2022 14:46:05 BST, "Alex Bennée" wrote: >Do you want to map old commits to your canonical email now as well? > >Signed-off-by: Alex Bennée >Cc: Paul Brook >--- > contrib/gitdm/group-map-individuals | 1 + > 1 file changed, 1 insertion(+) > >diff --git a/contrib/gitdm/group-map-individuals >b/contrib/gitdm/group-map-individuals >index d5b05041bc..0ec003048c 100644 >--- a/contrib/gitdm/group-map-individuals >+++ b/contrib/gitdm/group-map-individuals >@@ -35,3 +35,4 @@ liq...@gmail.com > chetan4wind...@gmail.com > akihiko.od...@gmail.com > si...@simonsafar.com >+p...@nowt.org >-- >2.34.1 > >
[PATCH v2 19/42] i386: Rewrite blendv helpers
Rewrite the blendv helpers so that they can easily be extended to support the AVX encodings, which make all 4 arguments explicit. No functional changes to the existing helpers Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 119 +- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 3202c00572..9f388b02b9 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2141,73 +2141,74 @@ void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } } -#define XMM0 (env->xmm_regs[0]) +#if SHIFT >= 1 + +#define BLEND_V128(elem, num, F, b) do {\ +d->elem(b + 0) = F(v->elem(b + 0), s->elem(b + 0), m->elem(b + 0)); \ +d->elem(b + 1) = F(v->elem(b + 1), s->elem(b + 1), m->elem(b + 1)); \ +if (num > 2) { \ +d->elem(b + 2) = F(v->elem(b + 2), s->elem(b + 2), m->elem(b + 2)); \ +d->elem(b + 3) = F(v->elem(b + 3), s->elem(b + 3), m->elem(b + 3)); \ +} \ +if (num > 4) { \ +d->elem(b + 4) = F(v->elem(b + 4), s->elem(b + 4), m->elem(b + 4)); \ +d->elem(b + 5) = F(v->elem(b + 5), s->elem(b + 5), m->elem(b + 5)); \ +d->elem(b + 6) = F(v->elem(b + 6), s->elem(b + 6), m->elem(b + 6)); \ +d->elem(b + 7) = F(v->elem(b + 7), s->elem(b + 7), m->elem(b + 7)); \ +} \ +if (num > 8) { \ +d->elem(b + 8) = F(v->elem(b + 8), s->elem(b + 8), m->elem(b + 8)); \ +d->elem(b + 9) = F(v->elem(b + 9), s->elem(b + 9), m->elem(b + 9)); \ +d->elem(b + 10) = F(v->elem(b + 10), s->elem(b + 10), m->elem(b + 10));\ +d->elem(b + 11) = F(v->elem(b + 11), s->elem(b + 11), m->elem(b + 11));\ +d->elem(b + 12) = F(v->elem(b + 12), s->elem(b + 12), m->elem(b + 12));\ +d->elem(b + 13) = F(v->elem(b + 13), s->elem(b + 13), m->elem(b + 13));\ +d->elem(b + 14) = F(v->elem(b + 14), s->elem(b + 14), m->elem(b + 14));\ +d->elem(b + 15) = F(v->elem(b + 15), s->elem(b + 15), m->elem(b + 15));\ +} \ +} while (0) -#if SHIFT == 1 #define SSE_HELPER_V(name, elem, num, F)\ -void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +void glue(name, 
SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ -d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0)); \ -d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1)); \ -if (num > 2) { \ -d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2)); \ -d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3)); \ -if (num > 4) { \ -d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4)); \ -d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5)); \ -d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6)); \ -d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7)); \ -if (num > 8) { \ -d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8)); \ -d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9)); \ -d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10)); \ -d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11)); \ -d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12)); \ -d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13)); \ -d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14)); \ -d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15)); \ -} \ -} \ -} \ -} +Reg *v = d; \ +Reg *m = >xmm_regs[0];
[PATCH v2 31/42] i386: Implement AVX variable shifts
These use the W bit to encode the operand width, but otherwise fairly straightforward. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 17 + target/i386/ops_sse_header.h | 6 ++ target/i386/tcg/translate.c | 17 + 3 files changed, 40 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 9b92b9790a..8f2bd48394 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3195,6 +3195,23 @@ void glue(helper_vpermilps_imm, SUFFIX)(CPUX86State *env, #endif } +#if SHIFT == 1 +#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0) +#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0) +#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 64 ? c : 31)) +#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63)) +#define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0) +#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0) +#endif + +SSE_HELPER_L(helper_vpsrlvd, FPSRLVD) +SSE_HELPER_L(helper_vpsravd, FPSRAVD) +SSE_HELPER_L(helper_vpsllvd, FPSLLVD) + +SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ) +SSE_HELPER_Q(helper_vpsravq, FPSRAVQ) +SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ) + #if SHIFT == 2 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index c52169a030..20db6c4240 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -421,6 +421,12 @@ DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_4(glue(vpermilpd_imm, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_4(glue(vpermilps_imm, SUFFIX), void, env, Reg, Reg, i32) +DEF_HELPER_4(glue(vpsrlvd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsravd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, 
Reg) #if SHIFT == 2 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_1(vzeroall, void, env) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 358c3ecb0b..4990470083 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3293,6 +3293,9 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX), #define gen_helper_phminposuw_ymm NULL [0x41] = UNARY_OP(phminposuw, SSE41, 0), +[0x45] = BINARY_OP(vpsrlvd, AVX, SSE_OPF_AVX2), +[0x46] = BINARY_OP(vpsravd, AVX, SSE_OPF_AVX2), +[0x47] = BINARY_OP(vpsllvd, AVX, SSE_OPF_AVX2), /* vpbroadcastd */ [0x58] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), /* vpbroadcastq */ @@ -3357,6 +3360,15 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] = { #undef BLENDV_OP #undef SPECIAL_OP +#define SSE_OP(name) \ +{gen_helper_ ## name ##_xmm, gen_helper_ ## name ##_ymm} +static const SSEFunc_0_eppp sse_op_table8[3][2] = { +SSE_OP(vpsrlvq), +SSE_OP(vpsravq), +SSE_OP(vpsllvq), +}; +#undef SSE_OP + /* VEX prefix not allowed */ #define CHECK_NO_VEX(s) do { \ if (s->prefix & PREFIX_VEX) \ @@ -4439,6 +4451,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, tcg_temp_free_ptr(mask); } else { SSEFunc_0_eppp fn = op6.fn[b1].op2; +if (REX_W(s)) { +if (b >= 0x45 && b <= 0x47) { +fn = sse_op_table8[b - 0x45][b1 - 1]; +} +} fn(cpu_env, s->ptr0, s->ptr2, s->ptr1); } } -- 2.36.0
[PATCH v2 12/42] i386: Misc integer AVX helper prep
More perparatory work for AVX support in various integer vector helpers No functional changes to existing helpers. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 133 +- 1 file changed, 104 insertions(+), 29 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index bb9cbf9ead..d0424140d9 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -557,19 +557,25 @@ SSE_HELPER_W(helper_pavgw, FAVG) void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { -d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0); -#if SHIFT == 1 -d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2); +Reg *v = d; +d->Q(0) = (uint64_t)s->L(0) * (uint64_t)v->L(0); +#if SHIFT >= 1 +d->Q(1) = (uint64_t)s->L(2) * (uint64_t)v->L(2); +#if SHIFT == 2 +d->Q(2) = (uint64_t)s->L(4) * (uint64_t)v->L(4); +d->Q(3) = (uint64_t)s->L(6) * (uint64_t)v->L(6); +#endif #endif } void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { +Reg *v = d; int i; for (i = 0; i < (2 << SHIFT); i++) { -d->L(i) = (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) + -(int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1); +d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) + +(int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1); } } @@ -583,31 +589,55 @@ static inline int abs1(int a) } } #endif + void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { +Reg *v = d; unsigned int val; val = 0; -val += abs1(d->B(0) - s->B(0)); -val += abs1(d->B(1) - s->B(1)); -val += abs1(d->B(2) - s->B(2)); -val += abs1(d->B(3) - s->B(3)); -val += abs1(d->B(4) - s->B(4)); -val += abs1(d->B(5) - s->B(5)); -val += abs1(d->B(6) - s->B(6)); -val += abs1(d->B(7) - s->B(7)); +val += abs1(v->B(0) - s->B(0)); +val += abs1(v->B(1) - s->B(1)); +val += abs1(v->B(2) - s->B(2)); +val += abs1(v->B(3) - s->B(3)); +val += abs1(v->B(4) - s->B(4)); +val += abs1(v->B(5) - s->B(5)); +val += abs1(v->B(6) - s->B(6)); +val += abs1(v->B(7) - s->B(7)); d->Q(0) = val; -#if SHIFT == 1 +#if SHIFT >= 1 val = 0; -val += 
abs1(d->B(8) - s->B(8)); -val += abs1(d->B(9) - s->B(9)); -val += abs1(d->B(10) - s->B(10)); -val += abs1(d->B(11) - s->B(11)); -val += abs1(d->B(12) - s->B(12)); -val += abs1(d->B(13) - s->B(13)); -val += abs1(d->B(14) - s->B(14)); -val += abs1(d->B(15) - s->B(15)); +val += abs1(v->B(8) - s->B(8)); +val += abs1(v->B(9) - s->B(9)); +val += abs1(v->B(10) - s->B(10)); +val += abs1(v->B(11) - s->B(11)); +val += abs1(v->B(12) - s->B(12)); +val += abs1(v->B(13) - s->B(13)); +val += abs1(v->B(14) - s->B(14)); +val += abs1(v->B(15) - s->B(15)); d->Q(1) = val; +#if SHIFT == 2 +val = 0; +val += abs1(v->B(16) - s->B(16)); +val += abs1(v->B(17) - s->B(17)); +val += abs1(v->B(18) - s->B(18)); +val += abs1(v->B(19) - s->B(19)); +val += abs1(v->B(20) - s->B(20)); +val += abs1(v->B(21) - s->B(21)); +val += abs1(v->B(22) - s->B(22)); +val += abs1(v->B(23) - s->B(23)); +d->Q(2) = val; +val = 0; +val += abs1(v->B(24) - s->B(24)); +val += abs1(v->B(25) - s->B(25)); +val += abs1(v->B(26) - s->B(26)); +val += abs1(v->B(27) - s->B(27)); +val += abs1(v->B(28) - s->B(28)); +val += abs1(v->B(29) - s->B(29)); +val += abs1(v->B(30) - s->B(30)); +val += abs1(v->B(31) - s->B(31)); +d->Q(3) = val; +#endif #endif } @@ -627,8 +657,12 @@ void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) { d->L(0) = val; d->L(1) = 0; -#if SHIFT == 1 +#if SHIFT >= 1 d->Q(1) = 0; +#if SHIFT == 2 +d->Q(2) = 0; +d->Q(3) = 0; +#endif #endif } @@ -636,8 +670,12 @@ void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val) { d->Q(0) = val; -#if SHIFT == 1 +#if SHIFT >= 1 d->Q(1) = 0; +#if SHIFT == 2 +d->Q(2) = 0; +d->Q(3) = 0; +#endif #endif } #endif @@ -1251,7 +1289,7 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s) val |= (s->B(5) >> 2) & 0x20; val |= (s->B(6) >> 1) & 0x40; val |= (s->B(7)) & 0x80; -#if SHIFT == 1 +#if SHIFT >= 1 val |= (s->B(8) << 1) & 0x0100; val |= (s->B(9) << 2) & 0x0200; val |= (s->B(10) << 3) & 0x0400; @@ 
-1260,6 +1
[PATCH v2 20/42] i386: AVX pclmulqdq
Make the pclmulqdq helper AVX ready Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 31 --- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 9f388b02b9..b7100fdce1 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2885,14 +2885,14 @@ target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) #endif -void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, -uint32_t ctrl) +#if SHIFT == 1 +static void clmulq(uint64_t *dest_l, uint64_t *dest_h, + uint64_t a, uint64_t b) { -uint64_t ah, al, b, resh, resl; +uint64_t al, ah, resh, resl; ah = 0; -al = d->Q((ctrl & 1) != 0); -b = s->Q((ctrl & 16) != 0); +al = a; resh = resl = 0; while (b) { @@ -2905,8 +2905,25 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, b >>= 1; } -d->Q(0) = resl; -d->Q(1) = resh; +*dest_l = resl; +*dest_h = resh; +} +#endif + +void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +uint32_t ctrl) +{ +Reg *v = d; +uint64_t a, b; + +a = v->Q((ctrl & 1) != 0); +b = s->Q((ctrl & 16) != 0); +clmulq(>Q(0), >Q(1), a, b); +#if SHIFT == 2 +a = v->Q(((ctrl & 1) != 0) + 2); +b = s->Q(((ctrl & 16) != 0) + 2); +clmulq(>Q(2), >Q(3), a, b); +#endif } void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -- 2.36.0
[PATCH v2 32/42] i386: Implement VTEST
Noting special here Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 28 target/i386/ops_sse_header.h | 2 ++ target/i386/tcg/translate.c | 2 ++ 3 files changed, 32 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 8f2bd48394..edf14a25d7 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3212,6 +3212,34 @@ SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ) SSE_HELPER_Q(helper_vpsravq, FPSRAVQ) SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ) +void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ +uint32_t zf = (s->L(0) & d->L(0)) | (s->L(1) & d->L(1)); +uint32_t cf = (s->L(0) & ~d->L(0)) | (s->L(1) & ~d->L(1)); + +zf |= (s->L(2) & d->L(2)) | (s->L(3) & d->L(3)); +cf |= (s->L(2) & ~d->L(2)) | (s->L(3) & ~d->L(3)); +#if SHIFT == 2 +zf |= (s->L(4) & d->L(4)) | (s->L(5) & d->L(5)); +cf |= (s->L(4) & ~d->L(4)) | (s->L(5) & ~d->L(5)); +zf |= (s->L(6) & d->L(6)) | (s->L(7) & d->L(7)); +cf |= (s->L(6) & ~d->L(6)) | (s->L(7) & ~d->L(7)); +#endif +CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C); +} + +void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ +uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); +uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); + +#if SHIFT == 2 +zf |= (s->Q(2) & d->Q(2)) | (s->Q(3) & d->Q(3)); +cf |= (s->Q(2) & ~d->Q(2)) | (s->Q(3) & ~d->Q(3)); +#endif +CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 
0 : CC_C); +} + #if SHIFT == 2 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 20db6c4240..8b93b8e6d6 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -427,6 +427,8 @@ DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg) #if SHIFT == 2 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_1(vzeroall, void, env) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 4990470083..2fbb7bfcad 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3253,6 +3253,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3), [0x0c] = BINARY_OP(vpermilps, AVX, 0), [0x0d] = BINARY_OP(vpermilpd, AVX, 0), +[0x0e] = CMP_OP(vtestps, AVX), +[0x0f] = CMP_OP(vtestpd, AVX), [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), [0x14] = BLENDV_OP(blendvps, SSE41, 0), [0x15] = BLENDV_OP(blendvpd, SSE41, 0), -- 2.36.0
[PATCH v2 14/42] i386: Add size suffix to vector FP helpers
For AVX we're going to need both 128 bit (xmm) and 256 bit (ymm) variants of floating point helpers. Add the register type suffix to the existing *PS and *PD helpers (SS and SD variants are only valid on 128 bit vectors) No functional changes. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 48 ++-- target/i386/ops_sse_header.h | 48 ++-- target/i386/tcg/translate.c | 37 +-- 3 files changed, 67 insertions(+), 66 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index c645d2ddbf..fc8fd57aa5 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -699,7 +699,7 @@ void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) SHUFFLE4(W, s, s, 0); } #else -void helper_shufps(Reg *d, Reg *s, int order) +void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order) { Reg *v = d; uint32_t r0, r1, r2, r3; @@ -710,7 +710,7 @@ void helper_shufps(Reg *d, Reg *s, int order) #endif } -void helper_shufpd(Reg *d, Reg *s, int order) +void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order) { Reg *v = d; uint64_t r0, r1; @@ -767,7 +767,7 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) /* XXX: not accurate */ #define SSE_HELPER_S(name, F) \ -void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)\ +void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\ { \ d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1)); \ @@ -780,7 +780,7 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ } \ \ -void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)\ +void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\ { \ d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1)); \ @@ -816,7 +816,7 @@ SSE_HELPER_S(sqrt, FPU_SQRT) /* float to float conversions */ -void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtps2pd, SUFFIX)(CPUX86State 
*env, Reg *d, Reg *s) { float32 s0, s1; @@ -826,7 +826,7 @@ void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s) d->ZMM_D(1) = float32_to_float64(s1, >sse_status); } -void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), >sse_status); d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), >sse_status); @@ -844,7 +844,7 @@ void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s) } /* integer to float */ -void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_S(0) = int32_to_float32(s->ZMM_L(0), >sse_status); d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), >sse_status); @@ -852,7 +852,7 @@ void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s) d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), >sse_status); } -void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int32_t l0, l1; @@ -929,7 +929,7 @@ WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN) -void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { d->ZMM_L(0) = x86_float32_to_int32(s->ZMM_S(0), >sse_status); d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), >sse_status); @@ -937,7 +937,7 @@ void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), >sse_status); } -void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { d->ZMM_L(0) = x86_float64_to_int32(s->ZMM_D(0), >sse_status); d->ZMM_L(1) = x86_float64_to_int32(s->ZMM_D(1), >sse_status); @@ -979,7 +979,7 @@ int64_t 
helper_cvtsd2sq(CPUX86State *env, ZMMReg *s) #endif /* float to integer truncated */ -void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +vo
[PATCH v2 27/42] i386: Translate 256 bit AVX instructions
All the work for the helper functions is already done, we just need to build them, and a few macro tweaks to poulate the lookup tables. For sse_op_table6 and sse_op_table7 we use #defines to fill in the entries where and opcode only supports one vector size, rather than complicating the main table. Several of the open-coded mov type instruction need special handling, but most of the rest falls out from the infrastructure we already added. Also clear the top half of the register after 128 bit VEX register writes. In the current code this correlates with VEX.L == 0, but there are exceptios later. Signed-off-by: Paul Brook --- target/i386/helper.h | 2 + target/i386/tcg/fpu_helper.c | 3 + target/i386/tcg/translate.c | 370 +-- 3 files changed, 319 insertions(+), 56 deletions(-) diff --git a/target/i386/helper.h b/target/i386/helper.h index ac3b4d1ee3..3da5df98b9 100644 --- a/target/i386/helper.h +++ b/target/i386/helper.h @@ -218,6 +218,8 @@ DEF_HELPER_3(movq, void, env, ptr, ptr) #include "ops_sse_header.h" #define SHIFT 1 #include "ops_sse_header.h" +#define SHIFT 2 +#include "ops_sse_header.h" DEF_HELPER_3(rclb, tl, env, tl, tl) DEF_HELPER_3(rclw, tl, env, tl, tl) diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c index b391b69635..74cf86c986 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -3053,3 +3053,6 @@ void helper_movq(CPUX86State *env, void *d, void *s) #define SHIFT 1 #include "ops_sse.h" + +#define SHIFT 2 +#include "ops_sse.h" diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 278ed8ed1c..bcd6d47fd0 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2742,6 +2742,29 @@ static inline void gen_ldo_env_A0(DisasContext *s, int offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1))); } +static inline void gen_ldo_env_A0_ymmh(DisasContext *s, int offset) +{ +int mem_index = s->mem_index; +tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, 
mem_index, MO_LEUQ); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2))); +tcg_gen_addi_tl(s->tmp0, s->A0, 8); +tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3))); +} + +/* Load 256-bit ymm register value */ +static inline void gen_ldy_env_A0(DisasContext *s, int offset) +{ +int mem_index = s->mem_index; +gen_ldo_env_A0(s, offset); +tcg_gen_addi_tl(s->tmp0, s->A0, 16); +tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2))); +tcg_gen_addi_tl(s->tmp0, s->A0, 24); +tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3))); +} + static inline void gen_sto_env_A0(DisasContext *s, int offset) { int mem_index = s->mem_index; @@ -2752,6 +2775,29 @@ static inline void gen_sto_env_A0(DisasContext *s, int offset) tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); } +static inline void gen_sto_env_A0_ymmh(DisasContext *s, int offset) +{ +int mem_index = s->mem_index; +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2))); +tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ); +tcg_gen_addi_tl(s->tmp0, s->A0, 8); +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3))); +tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +} + +/* Store 256-bit ymm register value */ +static inline void gen_sty_env_A0(DisasContext *s, int offset) +{ +int mem_index = s->mem_index; +gen_sto_env_A0(s, offset); +tcg_gen_addi_tl(s->tmp0, s->A0, 16); +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2))); +tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +tcg_gen_addi_tl(s->tmp0, s->A0, 24); +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3))); +tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +} 
+ static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset) { tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(0))); @@ -2760,6 +2806,14 @@ static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(1))); } +static inline void gen_op_movo_ymmh(DisasContext *s, int d_offset, int s_offset) +{ +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(2))); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(2)
[PATCH v2 15/42] i386: Floating point arithmetic helper AVX prep
Prepare the "easy" floating point vector helpers for AVX No functional changes to existing helpers. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 144 ++ 1 file changed, 119 insertions(+), 25 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index fc8fd57aa5..d308a1ec40 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -762,40 +762,66 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) } #endif -#if SHIFT == 1 +#if SHIFT >= 1 /* FPU ops */ /* XXX: not accurate */ -#define SSE_HELPER_S(name, F) \ -void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\ +#define SSE_HELPER_P(name, F) \ +void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ +Reg *d, Reg *s) \ { \ -d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ -d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1)); \ -d->ZMM_S(2) = F(32, d->ZMM_S(2), s->ZMM_S(2)); \ -d->ZMM_S(3) = F(32, d->ZMM_S(3), s->ZMM_S(3)); \ +Reg *v = d; \ +d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \ +d->ZMM_S(1) = F(32, v->ZMM_S(1), s->ZMM_S(1)); \ +d->ZMM_S(2) = F(32, v->ZMM_S(2), s->ZMM_S(2)); \ +d->ZMM_S(3) = F(32, v->ZMM_S(3), s->ZMM_S(3)); \ +YMM_ONLY( \ +d->ZMM_S(4) = F(32, v->ZMM_S(4), s->ZMM_S(4)); \ +d->ZMM_S(5) = F(32, v->ZMM_S(5), s->ZMM_S(5)); \ +d->ZMM_S(6) = F(32, v->ZMM_S(6), s->ZMM_S(6)); \ +d->ZMM_S(7) = F(32, v->ZMM_S(7), s->ZMM_S(7)); \ +) \ } \ \ -void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\ +void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ +Reg *d, Reg *s) \ { \ -d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ -} \ +Reg *v = d; \ +d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \ +d->ZMM_D(1) = F(64, v->ZMM_D(1), s->ZMM_D(1)); \ +YMM_ONLY( \ +d->ZMM_D(2) = F(64, v->ZMM_D(2), s->ZMM_D(2)); \ +d->ZMM_D(3) = F(64, v->ZMM_D(3), s->ZMM_D(3)); \ +) \ +} + +#if SHIFT == 1 + +#define SSE_HELPER_S(name, F) \ +SSE_HELPER_P(name, F) \ \ -void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\ +void 
helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\ { \ -d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ -d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1)); \ +Reg *v = d; \ +d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \ } \ \ -void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\ +void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\ { \ -d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ +Reg *v = d; \ +d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \ } +#else + +#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F) + +#endif + #define FPU_ADD(size, a, b) float ## size ## _add(a, b, >sse_st
[PATCH v2 26/42] i386: Utility function for 128 bit AVX
VEX encoded instructions that write to a (128 bit) xmm register clear the rest (upper half) of the corresonding (256 bit) ymm register. When legacy SSE encodings are used the rest of the ymm register is left unchanged. Add a utility fuction so that we don't have to keep duplicating this logic. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 12 1 file changed, 12 insertions(+) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index d148a2319d..278ed8ed1c 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2780,6 +2780,18 @@ static inline void gen_op_movq_env_0(DisasContext *s, int d_offset) #define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg]) +/* + * Clear the top half of the ymm register after a VEX.128 instruction + * This could be optimized by tracking this in env->hflags + */ +static void gen_clear_ymmh(DisasContext *s, int reg) +{ +if (s->prefix & PREFIX_VEX) { +gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(2))); +gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(3))); +} +} + typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val); -- 2.36.0
[PATCH v2 25/42] i386: VEX.V encodings (3 operand)
Enable translation of VEX encoded AVX instructions. The big change is the addition of an additional register operand in the VEX.V field. This is usually (but not always!) used to explicitly encode the first source operand. The changes to ops_sse.h and ops_sse_header.h are purely mechanical, with previous changes ensuring that the relevant helper functions are ready to handle the non destructive source operand. We now have a greater variety of operand patterns for the vector helper functions. The SSE_OPF_* flags we added to the opcode lookup tables are used to select between these. This includes e.g. pshufX and cmpX instructions which were previously overridden by opcode. One gotcha is the "scalar" vector instructions. The SSE encodings write a single element to the destination and leave the remainder of the register unchanged. The VEX encodings copy the remainder of the destination from the first source operand. If the operation only has a single source value, then the VEX.V field encodes an additional operand, which is copied to the remainder of the destination. 
Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 214 +-- target/i386/ops_sse_header.h | 149 ++--- target/i386/tcg/translate.c | 399 +-- 3 files changed, 463 insertions(+), 299 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index e48dfc2fc5..ad3312d353 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -97,9 +97,8 @@ #define FPSLL(x, c) ((x) << shift) #endif -void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 15) { d->Q(0) = 0; @@ -114,9 +113,8 @@ void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 15) { d->Q(0) = 0; @@ -131,9 +129,8 @@ void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 15) { shift = 15; @@ -143,9 +140,8 @@ void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRAW); } -void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 31) { d->Q(0) = 0; @@ -160,9 +156,8 @@ void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 31) { d->Q(0) = 0; @@ -177,9 +172,8 @@ void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrad, 
SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 31) { shift = 31; @@ -189,9 +183,8 @@ void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) SHIFT_HELPER_BODY(2 << SHIFT, L, FPSRAL); } -void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 63) { d->Q(0) = 0; @@ -206,9 +199,8 @@ void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } } -void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift; if (c->Q(0) > 63) { d->Q(0) = 0; @@ -224,9 +216,8 @@ void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } #if SHIFT >= 1 -void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift, i; shift = c->L(0); @@ -249,9 +240,8 @@ void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) #endif } -void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) +void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { -Reg *s = d; int shift, i; shift = c->L(0); @@ -321,9 +311,8 @@ void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } #define SSE_HELPER_B(name, F)
[PATCH v2 36/42] i386: Implement VINSERT128/VEXTRACT128
128-bit vinsert/vextract instructions. The integer and floating point variants have the same semantics. This is where we encounter an instruction encoded with VEX.L == 1 and a 128 bit (xmm) destination operand. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 78 + 1 file changed, 78 insertions(+) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 5a11d3c083..4072fa28d3 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2814,6 +2814,24 @@ static inline void gen_op_movo_ymmh(DisasContext *s, int d_offset, int s_offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(3))); } +static inline void gen_op_movo_ymm_l2h(DisasContext *s, + int d_offset, int s_offset) +{ +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(0))); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(2))); +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(1))); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(3))); +} + +static inline void gen_op_movo_ymm_h2l(DisasContext *s, + int d_offset, int s_offset) +{ +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(2))); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(0))); +tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(3))); +tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(1))); +} + static inline void gen_op_movq(DisasContext *s, int d_offset, int s_offset) { tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset); @@ -3353,9 +3371,13 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] = { [0x15] = SPECIAL_OP(SSE41), /* pextrw */ [0x16] = SPECIAL_OP(SSE41), /* pextrd/pextrq */ [0x17] = SPECIAL_OP(SSE41), /* extractps */ +[0x18] = SPECIAL_OP(AVX), /* vinsertf128 */ +[0x19] = SPECIAL_OP(AVX), /* vextractf128 */ [0x20] = SPECIAL_OP(SSE41), /* pinsrb */ [0x21] = SPECIAL_OP(SSE41), /* insertps */ 
[0x22] = SPECIAL_OP(SSE41), /* pinsrd/pinsrq */ +[0x38] = SPECIAL_OP(AVX), /* vinserti128 */ +[0x39] = SPECIAL_OP(AVX), /* vextracti128 */ [0x40] = BINARY_OP(dpps, SSE41, 0), #define gen_helper_dppd_ymm NULL [0x41] = BINARY_OP(dppd, SSE41, 0), @@ -5145,6 +5167,62 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } gen_clear_ymmh(s, reg); break; +case 0x38: /* vinserti128 */ +CHECK_AVX2_256(s); +/* fall through */ +case 0x18: /* vinsertf128 */ +CHECK_AVX(s); +if ((s->prefix & PREFIX_VEX) == 0 || s->vex_l == 0) { +goto illegal_op; +} +if (mod == 3) { +if (val & 1) { +gen_op_movo_ymm_l2h(s, ZMM_OFFSET(reg), +ZMM_OFFSET(rm)); +} else { +gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(rm)); +} +} else { +if (val & 1) { +gen_ldo_env_A0_ymmh(s, ZMM_OFFSET(reg)); +} else { +gen_ldo_env_A0(s, ZMM_OFFSET(reg)); +} +} +if (reg != reg_v) { +if (val & 1) { +gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(reg_v)); +} else { +gen_op_movo_ymmh(s, ZMM_OFFSET(reg), + ZMM_OFFSET(reg_v)); +} +} +break; +case 0x39: /* vextracti128 */ +CHECK_AVX2_256(s); +/* fall through */ +case 0x19: /* vextractf128 */ +CHECK_AVX_V0(s); +if ((s->prefix & PREFIX_VEX) == 0 || s->vex_l == 0) { +goto illegal_op; +} +if (mod == 3) { +op1_offset = ZMM_OFFSET(rm); +if (val & 1) { +gen_op_movo_ymm_h2l(s, ZMM_OFFSET(rm), +ZMM_OFFSET(reg)); +} else { +gen_op_movo(s, ZMM_OFFSET(rm), ZMM_OFFSET(reg)); +} +gen_clear_ymmh(s, rm); +} else{ +if (val & 1
[PATCH v2 21/42] i386: AVX+AES helpers
Make the AES vector helpers AVX ready No functional changes to existing helpers Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 63 ++-- target/i386/ops_sse_header.h | 55 ++- 2 files changed, 85 insertions(+), 33 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index b7100fdce1..48cec40074 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2929,64 +2929,92 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int i; -Reg st = *d; +Reg st = *d; // v Reg rk = *s; for (i = 0 ; i < 4 ; i++) { -d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^ -AES_Td1[st.B(AES_ishifts[4*i+1])] ^ -AES_Td2[st.B(AES_ishifts[4*i+2])] ^ -AES_Td3[st.B(AES_ishifts[4*i+3])]); +d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * i + 0])] ^ +AES_Td1[st.B(AES_ishifts[4 * i + 1])] ^ +AES_Td2[st.B(AES_ishifts[4 * i + 2])] ^ +AES_Td3[st.B(AES_ishifts[4 * i + 3])]); } +#if SHIFT == 2 +for (i = 0 ; i < 4 ; i++) { +d->L(i + 4) = rk.L(i + 4) ^ bswap32( +AES_Td0[st.B(AES_ishifts[4 * i + 0] + 16)] ^ +AES_Td1[st.B(AES_ishifts[4 * i + 1] + 16)] ^ +AES_Td2[st.B(AES_ishifts[4 * i + 2] + 16)] ^ +AES_Td3[st.B(AES_ishifts[4 * i + 3] + 16)]); +} +#endif } void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int i; -Reg st = *d; +Reg st = *d; // v Reg rk = *s; for (i = 0; i < 16; i++) { d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]); } +#if SHIFT == 2 +for (i = 0; i < 16; i++) { +d->B(i + 16) = rk.B(i + 16) ^ (AES_isbox[st.B(AES_ishifts[i] + 16)]); +} +#endif } void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int i; -Reg st = *d; +Reg st = *d; // v Reg rk = *s; for (i = 0 ; i < 4 ; i++) { -d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^ -AES_Te1[st.B(AES_shifts[4*i+1])] ^ -AES_Te2[st.B(AES_shifts[4*i+2])] ^ -AES_Te3[st.B(AES_shifts[4*i+3])]); +d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * i + 
0])] ^ +AES_Te1[st.B(AES_shifts[4 * i + 1])] ^ +AES_Te2[st.B(AES_shifts[4 * i + 2])] ^ +AES_Te3[st.B(AES_shifts[4 * i + 3])]); } +#if SHIFT == 2 +for (i = 0 ; i < 4 ; i++) { +d->L(i + 4) = rk.L(i + 4) ^ bswap32( +AES_Te0[st.B(AES_shifts[4 * i + 0] + 16)] ^ +AES_Te1[st.B(AES_shifts[4 * i + 1] + 16)] ^ +AES_Te2[st.B(AES_shifts[4 * i + 2] + 16)] ^ +AES_Te3[st.B(AES_shifts[4 * i + 3] + 16)]); +} +#endif } void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int i; -Reg st = *d; +Reg st = *d; // v Reg rk = *s; for (i = 0; i < 16; i++) { d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]); } - +#if SHIFT == 2 +for (i = 0; i < 16; i++) { +d->B(i + 16) = rk.B(i + 16) ^ (AES_sbox[st.B(AES_shifts[i] + 16)]); +} +#endif } +#if SHIFT == 1 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int i; Reg tmp = *s; for (i = 0 ; i < 4 ; i++) { -d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^ - AES_imc[tmp.B(4*i+1)][1] ^ - AES_imc[tmp.B(4*i+2)][2] ^ - AES_imc[tmp.B(4*i+3)][3]); +d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^ + AES_imc[tmp.B(4 * i + 1)][1] ^ + AES_imc[tmp.B(4 * i + 2)][2] ^ + AES_imc[tmp.B(4 * i + 3)][3]); } } @@ -3004,6 +3032,7 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl; } #endif +#endif #undef SSE_HELPER_S diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index b8b0666f61..203afbb5a1 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -47,7 +47,7 @@ DEF_HELPER_3(glue(pslld, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(psrlq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(psllq, SUFFIX), void, env, Reg, Reg) -#if SHIFT == 1 +#if SHIFT >= 1 DEF_HELPER_3(glue(psrldq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue
[PATCH v2 35/42] i386: Implement VPERM
A set of shuffle operations that operate on complete 256 bit registers. The integer and floating point variants have identical semantics. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 73 target/i386/ops_sse_header.h | 3 ++ target/i386/tcg/translate.c | 9 + 3 files changed, 85 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 14a2d1bf78..04d2006cd8 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3407,6 +3407,79 @@ void helper_vzeroupper_hi8(CPUX86State *env) } } #endif + +void helper_vpermdq_ymm(CPUX86State *env, +Reg *d, Reg *v, Reg *s, uint32_t order) +{ +uint64_t r0, r1, r2, r3; + +switch (order & 3) { +case 0: +r0 = v->Q(0); +r1 = v->Q(1); +break; +case 1: +r0 = v->Q(2); +r1 = v->Q(3); +break; +case 2: +r0 = s->Q(0); +r1 = s->Q(1); +break; +case 3: +r0 = s->Q(2); +r1 = s->Q(3); +break; +} +switch ((order >> 4) & 3) { +case 0: +r2 = v->Q(0); +r3 = v->Q(1); +break; +case 1: +r2 = v->Q(2); +r3 = v->Q(3); +break; +case 2: +r2 = s->Q(0); +r3 = s->Q(1); +break; +case 3: +r2 = s->Q(2); +r3 = s->Q(3); +break; +} +d->Q(0) = r0; +d->Q(1) = r1; +d->Q(2) = r2; +d->Q(3) = r3; +} + +void helper_vpermq_ymm(CPUX86State *env, Reg *d, Reg *s, uint32_t order) +{ +uint64_t r0, r1, r2, r3; +r0 = s->Q(order & 3); +r1 = s->Q((order >> 2) & 3); +r2 = s->Q((order >> 4) & 3); +r3 = s->Q((order >> 6) & 3); +d->Q(0) = r0; +d->Q(1) = r1; +d->Q(2) = r2; +d->Q(3) = r3; +} + +void helper_vpermd_ymm(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ +uint32_t r[8]; +int i; + +for (i = 0; i < 8; i++) { +r[i] = s->L(v->L(i) & 7); +} +for (i = 0; i < 8; i++) { +d->L(i) = r[i]; +} +} #endif #endif diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index e5d8ea9bb7..099e6e8ffc 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -457,6 +457,9 @@ DEF_HELPER_1(vzeroupper, void, env) DEF_HELPER_1(vzeroall_hi8, void, env) DEF_HELPER_1(vzeroupper_hi8, void, env) #endif +DEF_HELPER_5(vpermdq_ymm, void, env, 
Reg, Reg, Reg, i32) +DEF_HELPER_4(vpermq_ymm, void, env, Reg, Reg, i32) +DEF_HELPER_4(vpermd_ymm, void, env, Reg, Reg, Reg) #endif #endif diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index fe1ab58d07..5a11d3c083 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3258,6 +3258,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), [0x14] = BLENDV_OP(blendvps, SSE41, 0), [0x15] = BLENDV_OP(blendvpd, SSE41, 0), +#define gen_helper_vpermd_xmm NULL +[0x16] = BINARY_OP(vpermd, AVX, SSE_OPF_AVX2), /* vpermps */ [0x17] = CMP_OP(ptest, SSE41), /* TODO:Some vbroadcast variants require AVX2 */ [0x18] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR), /* vbroadcastss */ @@ -3287,6 +3289,7 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x33] = UNARY_OP(pmovzxwd, SSE41, SSE_OPF_MMX), [0x34] = UNARY_OP(pmovzxwq, SSE41, SSE_OPF_MMX), [0x35] = UNARY_OP(pmovzxdq, SSE41, SSE_OPF_MMX), +[0x36] = BINARY_OP(vpermd, AVX, SSE_OPF_AVX2), /* vpermd */ [0x37] = BINARY_OP(pcmpgtq, SSE41, SSE_OPF_MMX), [0x38] = BINARY_OP(pminsb, SSE41, SSE_OPF_MMX), [0x39] = BINARY_OP(pminsd, SSE41, SSE_OPF_MMX), @@ -3329,8 +3332,13 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { /* prefix [66] 0f 3a */ static const struct SSEOpHelper_table7 sse_op_table7[256] = { +#define gen_helper_vpermq_xmm NULL +[0x00] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), +[0x01] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), /* vpermpd */ [0x04] = UNARY_OP(vpermilps_imm, AVX, 0), [0x05] = UNARY_OP(vpermilpd_imm, AVX, 0), +#define gen_helper_vpermdq_xmm NULL +[0x06] = BINARY_OP(vpermdq, AVX, 0), /* vperm2f128 */ [0x08] = UNARY_OP(roundps, SSE41, 0), [0x09] = UNARY_OP(roundpd, SSE41, 0), #define gen_helper_roundss_ymm NULL @@ -3353,6 +3361,7 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] = { [0x41] = BINARY_OP(dppd, SSE41, 0), [0x42] = BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX), [0x44] = 
BINARY_OP(pclmulqdq, PCLMULQDQ, 0), +[0x46] = BINARY_OP(vpermdq, AVX, SSE_OPF_AVX2), /* vperm2i128 */ #define gen_helper_pcmpestrm_ymm NULL [0x60] = CMP_OP(pcmpestrm, SSE42), #define gen_helper_pcmpestri_ymm NULL -- 2.36.0
[PATCH v2 39/42] i386: Enable AVX cpuid bits when using TCG
Include AVX and AVX2 in the guest cpuid features supported by TCG Signed-off-by: Paul Brook --- target/i386/cpu.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 99343be926..bd35233d5b 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -625,12 +625,12 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \ CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \ - CPUID_EXT_RDRAND) + CPUID_EXT_RDRAND | CPUID_EXT_AVX) /* missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, - CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_AVX, + CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_F16C */ #ifdef TARGET_X86_64 @@ -653,9 +653,9 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \ CPUID_7_0_EBX_PCOMMIT | CPUID_7_0_EBX_CLFLUSHOPT |\ CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_MPX | CPUID_7_0_EBX_FSGSBASE | \ - CPUID_7_0_EBX_ERMS) + CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_AVX2) /* missing: - CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2, + CPUID_7_0_EBX_HLE CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM, CPUID_7_0_EBX_RDSEED */ #define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | \ -- 2.36.0
[PATCH v2 42/42] i386: Add sha512-avx test
Include sha512 built with avx[2] in the tcg tests. Signed-off-by: Paul Brook --- tests/tcg/i386/Makefile.target | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target index eb06f7eb89..a0335fff6d 100644 --- a/tests/tcg/i386/Makefile.target +++ b/tests/tcg/i386/Makefile.target @@ -79,7 +79,14 @@ sha512-sse: sha512.c run-sha512-sse: QEMU_OPTS+=-cpu max run-plugin-sha512-sse-with-%: QEMU_OPTS+=-cpu max -TESTS+=sha512-sse +sha512-avx: CFLAGS=-mavx2 -mavx -O3 +sha512-avx: sha512.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS) + +run-sha512-avx: QEMU_OPTS+=-cpu max +run-plugin-sha512-avx-with-%: QEMU_OPTS+=-cpu max + +TESTS+=sha512-sse sha512-avx test-avx.h: test-avx.py x86.csv $(PYTHON) $(I386_SRC)/test-avx.py $(I386_SRC)/x86.csv $@ -- 2.36.0
[PATCH v2 16/42] i386: Dot product AVX helper prep
Make the dpps and dppd helpers AVX-ready I can't see any obvious reason why dppd shouldn't work on 256 bit ymm registers, but both AMD and Intel agree that it's xmm only. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 54 --- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index d308a1ec40..4137e6e1fa 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2366,8 +2366,10 @@ SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) -void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) +void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, + uint32_t mask) { +Reg *v = d; float32 prod, iresult, iresult2; /* @@ -2375,23 +2377,23 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) * to correctly round the intermediate results */ if (mask & (1 << 4)) { -iresult = float32_mul(d->ZMM_S(0), s->ZMM_S(0), >sse_status); +iresult = float32_mul(v->ZMM_S(0), s->ZMM_S(0), >sse_status); } else { iresult = float32_zero; } if (mask & (1 << 5)) { -prod = float32_mul(d->ZMM_S(1), s->ZMM_S(1), >sse_status); +prod = float32_mul(v->ZMM_S(1), s->ZMM_S(1), >sse_status); } else { prod = float32_zero; } iresult = float32_add(iresult, prod, >sse_status); if (mask & (1 << 6)) { -iresult2 = float32_mul(d->ZMM_S(2), s->ZMM_S(2), >sse_status); +iresult2 = float32_mul(v->ZMM_S(2), s->ZMM_S(2), >sse_status); } else { iresult2 = float32_zero; } if (mask & (1 << 7)) { -prod = float32_mul(d->ZMM_S(3), s->ZMM_S(3), >sse_status); +prod = float32_mul(v->ZMM_S(3), s->ZMM_S(3), >sse_status); } else { prod = float32_zero; } @@ -2402,26 +2404,62 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero; d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero; d->ZMM_S(3) = (mask & (1 << 3)) ? 
iresult : float32_zero; +#if SHIFT == 2 +if (mask & (1 << 4)) { +iresult = float32_mul(v->ZMM_S(4), s->ZMM_S(4), >sse_status); +} else { +iresult = float32_zero; +} +if (mask & (1 << 5)) { +prod = float32_mul(v->ZMM_S(5), s->ZMM_S(5), >sse_status); +} else { +prod = float32_zero; +} +iresult = float32_add(iresult, prod, >sse_status); +if (mask & (1 << 6)) { +iresult2 = float32_mul(v->ZMM_S(6), s->ZMM_S(6), >sse_status); +} else { +iresult2 = float32_zero; +} +if (mask & (1 << 7)) { +prod = float32_mul(v->ZMM_S(7), s->ZMM_S(7), >sse_status); +} else { +prod = float32_zero; +} +iresult2 = float32_add(iresult2, prod, >sse_status); +iresult = float32_add(iresult, iresult2, >sse_status); + +d->ZMM_S(4) = (mask & (1 << 0)) ? iresult : float32_zero; +d->ZMM_S(5) = (mask & (1 << 1)) ? iresult : float32_zero; +d->ZMM_S(6) = (mask & (1 << 2)) ? iresult : float32_zero; +d->ZMM_S(7) = (mask & (1 << 3)) ? iresult : float32_zero; +#endif } -void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) +#if SHIFT == 1 +/* Oddly, there is no ymm version of dppd */ +void glue(helper_dppd, SUFFIX)(CPUX86State *env, + Reg *d, Reg *s, uint32_t mask) { +Reg *v = d; float64 iresult; if (mask & (1 << 4)) { -iresult = float64_mul(d->ZMM_D(0), s->ZMM_D(0), >sse_status); +iresult = float64_mul(v->ZMM_D(0), s->ZMM_D(0), >sse_status); } else { iresult = float64_zero; } + if (mask & (1 << 5)) { iresult = float64_add(iresult, - float64_mul(d->ZMM_D(1), s->ZMM_D(1), + float64_mul(v->ZMM_D(1), s->ZMM_D(1), >sse_status), >sse_status); } d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero; d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero; } +#endif void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t offset) -- 2.36.0
[PATCH v2 34/42] i386: Implement VGATHER
These are scatter load instructions that need introduce a new "Vector SIB" encoding. Also a bit of hair to handle different index sizes and scaling factors, but overall the combinatorial explosion doesn't end up too bad. The other thing of note is probably that these also modify the mask operand. Thankfully the operands may not overlap, and we do not have to make the whole thing appear atomic. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 65 +++ target/i386/ops_sse_header.h | 16 target/i386/tcg/translate.c | 74 3 files changed, 155 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index ffcba3d02c..14a2d1bf78 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3288,6 +3288,71 @@ void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) #endif } +#define VGATHER_HELPER(scale) \ +void glue(helper_vpgatherdd ## scale, SUFFIX)(CPUX86State *env, \ +Reg *d, Reg *v, Reg *s, target_ulong a0)\ +{ \ +int i; \ +for (i = 0; i < (2 << SHIFT); i++) {\ +if (v->L(i) >> 31) {\ +target_ulong addr = a0 \ ++ ((target_ulong)(int32_t)s->L(i) << scale);\ +d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); \ +} \ +v->L(i) = 0;\ +} \ +} \ +void glue(helper_vpgatherdq ## scale, SUFFIX)(CPUX86State *env, \ +Reg *d, Reg *v, Reg *s, target_ulong a0)\ +{ \ +int i; \ +for (i = 0; i < (1 << SHIFT); i++) {\ +if (v->Q(i) >> 63) {\ +target_ulong addr = a0 \ ++ ((target_ulong)(int32_t)s->L(i) << scale);\ +d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC()); \ +} \ +v->Q(i) = 0;\ +} \ +} \ +void glue(helper_vpgatherqd ## scale, SUFFIX)(CPUX86State *env, \ +Reg *d, Reg *v, Reg *s, target_ulong a0)\ +{ \ +int i; \ +for (i = 0; i < (1 << SHIFT); i++) {\ +if (v->L(i) >> 31) {\ +target_ulong addr = a0 \ ++ ((target_ulong)(int64_t)s->Q(i) << scale);\ +d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); \ +} \ +v->L(i) = 0;\ +} \ +d->Q(SHIFT) = 0;\ +v->Q(SHIFT) = 0;\ +YMM_ONLY( \ +d->Q(3) = 0;\ +v->Q(3) = 0;\ +) \ +} \ +void glue(helper_vpgatherqq ## scale, 
SUFFIX)(CPUX86State *env, \ +Reg *d, Reg *v, Reg *s, target_ulong a0)\ +{ \ +int i; \ +for (i = 0; i < (1 << SHIFT); i++) {\ +if (v->Q(i) >> 63) {\ +target_ulong addr = a0 \ ++ ((target_ulong)(int64_t)s->Q(i) << scale);\ +d
[PATCH v2 23/42] i386: AVX comparison helpers
AVX includes an additional, more extensive set of comparison predicates, some of which our softfloat implementation does not expose directly. Rewrite the helpers in terms of floatN_compare Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 149 --- target/i386/ops_sse_header.h | 47 --- target/i386/tcg/translate.c | 49 +--- 3 files changed, 177 insertions(+), 68 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 48cec40074..e48dfc2fc5 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -1394,57 +1394,112 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #endif } -/* XXX: unordered */ -#define SSE_HELPER_CMP(name, F) \ -void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\ -{ \ -d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ -d->ZMM_L(1) = F(32, d->ZMM_S(1), s->ZMM_S(1)); \ -d->ZMM_L(2) = F(32, d->ZMM_S(2), s->ZMM_S(2)); \ -d->ZMM_L(3) = F(32, d->ZMM_S(3), s->ZMM_S(3)); \ -} \ -\ -void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\ -{ \ -d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ -} \ -\ -void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\ +#define SSE_HELPER_CMP_P(name, F, C)\ +void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *s)\ { \ -d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ -d->ZMM_Q(1) = F(64, d->ZMM_D(1), s->ZMM_D(1)); \ +Reg *v = d; \ +d->ZMM_L(0) = F(32, C, v->ZMM_S(0), s->ZMM_S(0)); \ +d->ZMM_L(1) = F(32, C, v->ZMM_S(1), s->ZMM_S(1)); \ +d->ZMM_L(2) = F(32, C, v->ZMM_S(2), s->ZMM_S(2)); \ +d->ZMM_L(3) = F(32, C, v->ZMM_S(3), s->ZMM_S(3)); \ +YMM_ONLY( \ +d->ZMM_L(4) = F(32, C, v->ZMM_S(4), s->ZMM_S(4)); \ +d->ZMM_L(5) = F(32, C, v->ZMM_S(5), s->ZMM_S(5)); \ +d->ZMM_L(6) = F(32, C, v->ZMM_S(6), s->ZMM_S(6)); \ +d->ZMM_L(7) = F(32, C, v->ZMM_S(7), s->ZMM_S(7)); \ +) \ } \ \ -void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\ +void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ 
+ Reg *d, Reg *s)\ { \ -d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ -} - -#define FPU_CMPEQ(size, a, b) \ -(float ## size ## _eq_quiet(a, b, >sse_status) ? -1 : 0) -#define FPU_CMPLT(size, a, b) \ -(float ## size ## _lt(a, b, >sse_status) ? -1 : 0) -#define FPU_CMPLE(size, a, b) \ -(float ## size ## _le(a, b, >sse_status) ? -1 : 0) -#define FPU_CMPUNORD(size, a, b)\ -(float ## size ## _unordered_quiet(a, b, >sse_status) ? -1 : 0) -#define FPU_CMPNEQ(size, a, b) \ -(float ## size ## _eq_quiet(a, b, >sse_status) ? 0 : -1) -#define FPU_CMPNLT(size, a, b) \ -(float ## size ## _lt(a, b, >sse_status) ? 0 : -1) -#define FPU_CMPNLE(size, a, b) \ -(float ## size ## _le(a, b, >sse_status) ? 0 : -1) -#define FPU_CMPORD(size, a, b) \ -(float ## size ## _unordered_quiet(a, b, >sse_status) ? 0 : -1) - -SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) -SSE_HELPER_CMP(cmplt, FPU_CMPLT) -SSE_HELPER_
[PATCH v2 24/42] i386: Move 3DNOW decoder
Handle 3DNOW instructions early to avoid complicating the AVX logic. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 30 +- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 64f026c0af..6c40df61d4 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3297,6 +3297,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, is_xmm = 1; } } +if (sse_op.flags & SSE_OPF_3DNOW) { +if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { +goto illegal_op; +} +} /* simple MMX/SSE operation */ if (s->flags & HF_TS_MASK) { gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); @@ -4761,21 +4766,20 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, rm = (modrm & 7); op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); } +if (sse_op.flags & SSE_OPF_3DNOW) { +/* 3DNow! data insns */ +val = x86_ldub_code(env, s); +SSEFunc_0_epp op_3dnow = sse_op_table5[val]; +if (!op_3dnow) { +goto unknown_op; +} +tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); +tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); +op_3dnow(cpu_env, s->ptr0, s->ptr1); +return; +} } switch(b) { -case 0x0f: /* 3DNow! data insns */ -val = x86_ldub_code(env, s); -sse_fn_epp = sse_op_table5[val]; -if (!sse_fn_epp) { -goto unknown_op; -} -if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { -goto illegal_op; -} -tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); -tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); -sse_fn_epp(cpu_env, s->ptr0, s->ptr1); -break; case 0x70: /* pshufx insn */ case 0xc6: /* pshufx insn */ val = x86_ldub_code(env, s); -- 2.36.0
[PATCH v2 22/42] i386: Update ops_sse_header.h ready for 256 bit AVX
Update ops_sse_helper.h ready for 256 bit AVX helpers Signed-off-by: Paul Brook --- target/i386/ops_sse_header.h | 67 +--- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 203afbb5a1..63b63eb532 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -105,7 +105,7 @@ SSE_HELPER_L(pcmpeql, FCMPEQ) SSE_HELPER_W(pmullw, FMULLW) #if SHIFT == 0 -DEF_HELPER_3(glue(pmulhrw, SUFFIX), FMULHRW) +DEF_HELPER_3(glue(pmulhrw, SUFFIX), void, env, Reg, Reg) #endif SSE_HELPER_W(pmulhuw, FMULHUW) SSE_HELPER_W(pmulhw, FMULHW) @@ -137,23 +137,39 @@ DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int) /* FPU ops */ /* XXX: not accurate */ -DEF_HELPER_3(glue(shufps, SUFFIX), void, Reg, Reg, int) -DEF_HELPER_3(glue(shufpd, SUFFIX), void, Reg, Reg, int) +#define SSE_HELPER_P4(name) \ +DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ +DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) + +#define SSE_HELPER_P3(name, ...)\ +DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ +DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) -#define SSE_HELPER_S(name, F)\ -DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg)\ -DEF_HELPER_3(name ## ss, void, env, Reg, Reg)\ -DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg)\ +#if SHIFT == 1 +#define SSE_HELPER_S4(name) \ +SSE_HELPER_P4(name) \ +DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ DEF_HELPER_3(name ## sd, void, env, Reg, Reg) +#define SSE_HELPER_S3(name) \ +SSE_HELPER_P3(name) \ +DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ +DEF_HELPER_3(name ## sd, void, env, Reg, Reg) +#else +#define SSE_HELPER_S4(name, ...) SSE_HELPER_P4(name) +#define SSE_HELPER_S3(name, ...) 
SSE_HELPER_P3(name) +#endif + +DEF_HELPER_3(glue(shufps, SUFFIX), void, Reg, Reg, int) +DEF_HELPER_3(glue(shufpd, SUFFIX), void, Reg, Reg, int) -SSE_HELPER_S(add, FPU_ADD) -SSE_HELPER_S(sub, FPU_SUB) -SSE_HELPER_S(mul, FPU_MUL) -SSE_HELPER_S(div, FPU_DIV) -SSE_HELPER_S(min, FPU_MIN) -SSE_HELPER_S(max, FPU_MAX) -SSE_HELPER_S(sqrt, FPU_SQRT) +SSE_HELPER_S4(add) +SSE_HELPER_S4(sub) +SSE_HELPER_S4(mul) +SSE_HELPER_S4(div) +SSE_HELPER_S4(min) +SSE_HELPER_S4(max) +SSE_HELPER_S3(sqrt) DEF_HELPER_3(glue(cvtps2pd, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(cvtpd2ps, SUFFIX), void, env, Reg, Reg) @@ -208,18 +224,12 @@ DEF_HELPER_4(extrq_i, void, env, ZMMReg, int, int) DEF_HELPER_3(insertq_r, void, env, ZMMReg, ZMMReg) DEF_HELPER_4(insertq_i, void, env, ZMMReg, int, int) #endif -DEF_HELPER_3(glue(haddps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(haddpd, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(hsubps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(hsubpd, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(addsubps, SUFFIX), void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(glue(addsubpd, SUFFIX), void, env, ZMMReg, ZMMReg) - -#define SSE_HELPER_CMP(name, F) \ -DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ -DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ -DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \ -DEF_HELPER_3(name ## sd, void, env, Reg, Reg) + +SSE_HELPER_P4(hadd) +SSE_HELPER_P4(hsub) +SSE_HELPER_P4(addsub) + +#define SSE_HELPER_CMP(name, F) SSE_HELPER_S4(name) SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) SSE_HELPER_CMP(cmplt, FPU_CMPLT) @@ -381,6 +391,9 @@ DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, i32) #undef SSE_HELPER_W #undef SSE_HELPER_L #undef SSE_HELPER_Q -#undef SSE_HELPER_S +#undef SSE_HELPER_S3 +#undef SSE_HELPER_S4 +#undef SSE_HELPER_P3 +#undef SSE_HELPER_P4 #undef SSE_HELPER_CMP #undef UNPCK_OP -- 2.36.0
[PATCH v2 18/42] i386: Misc AVX helper prep
Fixup various vector helpers that either trivially extend to 256 bit, or don't have 256 bit variants. No functional changes to existing helpers Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 159 -- 1 file changed, 139 insertions(+), 20 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index d128af6cc8..3202c00572 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -641,6 +641,7 @@ void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #endif } +#if SHIFT < 2 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, target_ulong a0) { @@ -652,6 +653,7 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } } } +#endif void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) { @@ -882,6 +884,13 @@ void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) s0 = s->ZMM_S(0); s1 = s->ZMM_S(1); +#if SHIFT == 2 +float32 s2, s3; +s2 = s->ZMM_S(2); +s3 = s->ZMM_S(3); +d->ZMM_D(2) = float32_to_float64(s2, >sse_status); +d->ZMM_D(3) = float32_to_float64(s3, >sse_status); +#endif d->ZMM_D(0) = float32_to_float64(s0, >sse_status); d->ZMM_D(1) = float32_to_float64(s1, >sse_status); } @@ -890,9 +899,17 @@ void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), >sse_status); d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), >sse_status); +#if SHIFT == 2 +d->ZMM_S(2) = float64_to_float32(s->ZMM_D(2), >sse_status); +d->ZMM_S(3) = float64_to_float32(s->ZMM_D(3), >sse_status); +d->Q(2) = 0; +d->Q(3) = 0; +#else d->Q(1) = 0; +#endif } +#if SHIFT == 1 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), >sse_status); @@ -902,6 +919,7 @@ void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), >sse_status); } +#endif /* integer to float */ void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) @@ -910,6 +928,12 @@ void 
glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), >sse_status); d->ZMM_S(2) = int32_to_float32(s->ZMM_L(2), >sse_status); d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), >sse_status); +#if SHIFT == 2 +d->ZMM_S(4) = int32_to_float32(s->ZMM_L(4), >sse_status); +d->ZMM_S(5) = int32_to_float32(s->ZMM_L(5), >sse_status); +d->ZMM_S(6) = int32_to_float32(s->ZMM_L(6), >sse_status); +d->ZMM_S(7) = int32_to_float32(s->ZMM_L(7), >sse_status); +#endif } void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) @@ -918,10 +942,18 @@ void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) l0 = (int32_t)s->ZMM_L(0); l1 = (int32_t)s->ZMM_L(1); +#if SHIFT == 2 +int32_t l2, l3; +l2 = (int32_t)s->ZMM_L(2); +l3 = (int32_t)s->ZMM_L(3); +d->ZMM_D(2) = int32_to_float64(l2, >sse_status); +d->ZMM_D(3) = int32_to_float64(l3, >sse_status); +#endif d->ZMM_D(0) = int32_to_float64(l0, >sse_status); d->ZMM_D(1) = int32_to_float64(l1, >sse_status); } +#if SHIFT == 1 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s) { d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), >sse_status); @@ -956,8 +988,11 @@ void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val) } #endif +#endif + /* float to integer */ +#if SHIFT == 1 /* * x86 mandates that we return the indefinite integer value for the result * of any float-to-integer conversion that raises the 'invalid' exception. 
@@ -988,6 +1023,7 @@ WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN) WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN) +#endif void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { @@ -995,15 +1031,29 @@ void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), >sse_status); d->ZMM_L(2) = x86_float32_to_int32(s->ZMM_S(2), >sse_status); d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), >sse_status); +#if SHIFT == 2 +d->ZMM_L(4) = x86_float32_to_int32(s->ZMM_S(4), >sse_status); +d->ZMM_L(5) = x86_float32_to_int32(s->ZMM_S(5), >sse_status); +d->ZMM_L(6) = x86_float32_to_int32(s->ZMM_S(6), >sse_status); +d->ZMM_L(7) = x86_float32_to_int32(s->ZMM_S(7), >sse_status); +#endif }
[PATCH v2 28/42] i386: Implement VZEROALL and VZEROUPPER
The use the same opcode as EMMS, which I guess makes some sort of sense. Fairly strightforward other than that. If we were wanting to optimize out gen_clear_ymmh then this would be one of the starting points. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 48 target/i386/ops_sse_header.h | 9 +++ target/i386/tcg/translate.c | 26 --- 3 files changed, 80 insertions(+), 3 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index ad3312d353..a1f50f0c8b 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3071,6 +3071,54 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #endif #endif +#if SHIFT == 2 +void helper_vzeroall(CPUX86State *env) +{ +int i; + +for (i = 0; i < 8; i++) { +env->xmm_regs[i].ZMM_Q(0) = 0; +env->xmm_regs[i].ZMM_Q(1) = 0; +env->xmm_regs[i].ZMM_Q(2) = 0; +env->xmm_regs[i].ZMM_Q(3) = 0; +} +} + +void helper_vzeroupper(CPUX86State *env) +{ +int i; + +for (i = 0; i < 8; i++) { +env->xmm_regs[i].ZMM_Q(2) = 0; +env->xmm_regs[i].ZMM_Q(3) = 0; +} +} + +#ifdef TARGET_X86_64 +void helper_vzeroall_hi8(CPUX86State *env) +{ +int i; + +for (i = 8; i < 16; i++) { +env->xmm_regs[i].ZMM_Q(0) = 0; +env->xmm_regs[i].ZMM_Q(1) = 0; +env->xmm_regs[i].ZMM_Q(2) = 0; +env->xmm_regs[i].ZMM_Q(3) = 0; +} +} + +void helper_vzeroupper_hi8(CPUX86State *env) +{ +int i; + +for (i = 8; i < 16; i++) { +env->xmm_regs[i].ZMM_Q(2) = 0; +env->xmm_regs[i].ZMM_Q(3) = 0; +} +} +#endif +#endif + #undef SSE_HELPER_S #undef SHIFT diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index cfcfba154b..48f0945917 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -411,6 +411,15 @@ DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32) #endif +#if SHIFT == 2 +DEF_HELPER_1(vzeroall, void, env) +DEF_HELPER_1(vzeroupper, void, env) +#ifdef TARGET_X86_64 +DEF_HELPER_1(vzeroall_hi8, void, env) 
+DEF_HELPER_1(vzeroupper_hi8, void, env) +#endif +#endif + #undef SHIFT #undef Reg #undef SUFFIX diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index bcd6d47fd0..ba70aeb039 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3455,9 +3455,29 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, return; } if (b == 0x77) { -/* emms */ -gen_helper_emms(cpu_env); -return; +if (s->prefix & PREFIX_VEX) { +CHECK_AVX(s); +if (s->vex_l) { +gen_helper_vzeroall(cpu_env); +#ifdef TARGET_X86_64 +if (CODE64(s)) { +gen_helper_vzeroall_hi8(cpu_env); +} +#endif +} else { +gen_helper_vzeroupper(cpu_env); +#ifdef TARGET_X86_64 +if (CODE64(s)) { +gen_helper_vzeroupper_hi8(cpu_env); +} +#endif +} +return; +} else { +/* emms */ +gen_helper_emms(cpu_env); +return; +} } /* prepare MMX state (XXX: optimize by storing fptt and fptags in the static cpu state) */ -- 2.36.0
[PATCH v2 13/42] i386: Destructive vector helpers for AVX
These helpers need to take special care to avoid overwriting source values before the wole result has been calculated. Currently they use a dummy Reg typed variable to store the result then assign the whole register. This will cause 128 bit operations to corrupt the upper half of the register, so replace it with explicit temporaries and element assignments. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 707 ++ 1 file changed, 437 insertions(+), 270 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index d0424140d9..c645d2ddbf 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -680,71 +680,85 @@ void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val) } #endif +#define SHUFFLE4(F, a, b, offset) do { \ +r0 = a->F((order & 3) + offset);\ +r1 = a->F(((order >> 2) & 3) + offset); \ +r2 = b->F(((order >> 4) & 3) + offset); \ +r3 = b->F(((order >> 6) & 3) + offset); \ +d->F(offset) = r0; \ +d->F(offset + 1) = r1; \ +d->F(offset + 2) = r2; \ +d->F(offset + 3) = r3; \ +} while (0) + #if SHIFT == 0 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) { -Reg r; +uint16_t r0, r1, r2, r3; -r.W(0) = s->W(order & 3); -r.W(1) = s->W((order >> 2) & 3); -r.W(2) = s->W((order >> 4) & 3); -r.W(3) = s->W((order >> 6) & 3); -MOVE(*d, r); +SHUFFLE4(W, s, s, 0); } #else void helper_shufps(Reg *d, Reg *s, int order) { -Reg r; +Reg *v = d; +uint32_t r0, r1, r2, r3; -r.L(0) = d->L(order & 3); -r.L(1) = d->L((order >> 2) & 3); -r.L(2) = s->L((order >> 4) & 3); -r.L(3) = s->L((order >> 6) & 3); -MOVE(*d, r); +SHUFFLE4(L, v, s, 0); +#if SHIFT == 2 +SHUFFLE4(L, v, s, 4); +#endif } void helper_shufpd(Reg *d, Reg *s, int order) { -Reg r; +Reg *v = d; +uint64_t r0, r1; -r.Q(0) = d->Q(order & 1); -r.Q(1) = s->Q((order >> 1) & 1); -MOVE(*d, r); +r0 = v->Q(order & 1); +r1 = s->Q((order >> 1) & 1); +d->Q(0) = r0; +d->Q(1) = r1; +#if SHIFT == 2 +r0 = v->Q(((order >> 2) & 1) + 2); +r1 = s->Q(((order >> 3) & 1) + 2); +d->Q(2) = r0; +d->Q(3) = r1; 
+#endif } void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order) { -Reg r; +uint32_t r0, r1, r2, r3; -r.L(0) = s->L(order & 3); -r.L(1) = s->L((order >> 2) & 3); -r.L(2) = s->L((order >> 4) & 3); -r.L(3) = s->L((order >> 6) & 3); -MOVE(*d, r); +SHUFFLE4(L, s, s, 0); +#if SHIFT == 2 +SHUFFLE4(L, s, s, 4); +#endif } void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order) { -Reg r; +uint16_t r0, r1, r2, r3; -r.W(0) = s->W(order & 3); -r.W(1) = s->W((order >> 2) & 3); -r.W(2) = s->W((order >> 4) & 3); -r.W(3) = s->W((order >> 6) & 3); -r.Q(1) = s->Q(1); -MOVE(*d, r); +SHUFFLE4(W, s, s, 0); +d->Q(1) = s->Q(1); +#if SHIFT == 2 +SHUFFLE4(W, s, s, 8); +d->Q(3) = s->Q(3); +#endif } void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) { -Reg r; +uint16_t r0, r1, r2, r3; -r.Q(0) = s->Q(0); -r.W(4) = s->W(4 + (order & 3)); -r.W(5) = s->W(4 + ((order >> 2) & 3)); -r.W(6) = s->W(4 + ((order >> 4) & 3)); -r.W(7) = s->W(4 + ((order >> 6) & 3)); -MOVE(*d, r); +d->Q(0) = s->Q(0); +SHUFFLE4(W, s, s, 4); +#if SHIFT == 2 +d->Q(2) = s->Q(2); +SHUFFLE4(W, s, s, 12); +#endif } #endif @@ -1320,156 +1334,190 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s) return val; } -void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ -Reg r; - -r.B(0) = satsb((int16_t)d->W(0)); -r.B(1) = satsb((int16_t)d->W(1)); -r.B(2) = satsb((int16_t)d->W(2)); -r.B(3) = satsb((int16_t)d->W(3)); -#if SHIFT == 1 -r.B(4) = satsb((int16_t)d->W(4)); -r.B(5) = satsb((int16_t)d->W(5)); -r.B(6) = satsb((int16_t)d->W(6)); -r.B(7) = satsb((int16_t)d->W(7)); -#endif -r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0)); -r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1)); -r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2)); -r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3)); -#if SHIFT == 1 -r.B(12) = satsb((int16_t)s->W(4)); -r.B(13) = satsb((int16_t)s->W(5)); -r.B(14) = satsb((int16_t)s->W(6)); -r.B(15) = satsb((int16_t)s->W(7)); -#endif -MOVE(*d, r); -} - -void
[PATCH v2 40/42] Enable all x86-64 cpu features in user mode
We don't have any migration concerns for usermode emulation, so we may as well enable all available CPU features by default. Signed-off-by: Paul Brook --- linux-user/x86_64/target_elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux-user/x86_64/target_elf.h b/linux-user/x86_64/target_elf.h index 7b76a90de8..3f628f8d66 100644 --- a/linux-user/x86_64/target_elf.h +++ b/linux-user/x86_64/target_elf.h @@ -9,6 +9,6 @@ #define X86_64_TARGET_ELF_H static inline const char *cpu_get_model(uint32_t eflags) { -return "qemu64"; +return "max"; } #endif -- 2.36.0
[PATCH v2 29/42] i386: Implement VBROADCAST
The catch here is that these are whole vector operations (not independent 128 bit lanes). We abuse the SSE_OPF_SCALAR flag to select the memory operand width appropriately. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 51 target/i386/ops_sse_header.h | 8 ++ target/i386/tcg/translate.c | 42 - 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index a1f50f0c8b..4115c9a257 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3071,7 +3071,57 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #endif #endif +#if SHIFT >= 1 +void glue(helper_vbroadcastb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ +uint8_t val = s->B(0); +int i; + +for (i = 0; i < 16 * SHIFT; i++) { +d->B(i) = val; +} +} + +void glue(helper_vbroadcastw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ +uint16_t val = s->W(0); +int i; + +for (i = 0; i < 8 * SHIFT; i++) { +d->W(i) = val; +} +} + +void glue(helper_vbroadcastl, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ +uint32_t val = s->L(0); +int i; + +for (i = 0; i < 8 * SHIFT; i++) { +d->L(i) = val; +} +} + +void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ +uint64_t val = s->Q(0); +d->Q(0) = val; +d->Q(1) = val; #if SHIFT == 2 +d->Q(2) = val; +d->Q(3) = val; +#endif +} + +#if SHIFT == 2 +void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ +d->Q(0) = s->Q(0); +d->Q(1) = s->Q(1); +d->Q(2) = s->Q(0); +d->Q(3) = s->Q(1); +} + void helper_vzeroall(CPUX86State *env) { int i; @@ -3118,6 +3168,7 @@ void helper_vzeroupper_hi8(CPUX86State *env) } #endif #endif +#endif #undef SSE_HELPER_S diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 48f0945917..51e02cd4fa 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -411,7 +411,14 @@ DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, 
Reg, i32) #endif +/* AVX helpers */ +#if SHIFT >= 1 +DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg) #if SHIFT == 2 +DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_1(vzeroall, void, env) DEF_HELPER_1(vzeroupper, void, env) #ifdef TARGET_X86_64 @@ -419,6 +426,7 @@ DEF_HELPER_1(vzeroall_hi8, void, env) DEF_HELPER_1(vzeroupper_hi8, void, env) #endif #endif +#endif #undef SHIFT #undef Reg diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index ba70aeb039..59ab1dc562 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3255,6 +3255,11 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x14] = BLENDV_OP(blendvps, SSE41, 0), [0x15] = BLENDV_OP(blendvpd, SSE41, 0), [0x17] = CMP_OP(ptest, SSE41), +/* TODO:Some vbroadcast variants require AVX2 */ +[0x18] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR), /* vbroadcastss */ +[0x19] = UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR), /* vbroadcastsd */ +#define gen_helper_vbroadcastdq_xmm NULL +[0x1a] = UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR), /* vbroadcastf128 */ [0x1c] = UNARY_OP_MMX(pabsb, SSSE3), [0x1d] = UNARY_OP_MMX(pabsw, SSSE3), [0x1e] = UNARY_OP_MMX(pabsd, SSSE3), @@ -3286,6 +3291,16 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX), #define gen_helper_phminposuw_ymm NULL [0x41] = UNARY_OP(phminposuw, SSE41, 0), +/* vpbroadcastd */ +[0x58] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), +/* vpbroadcastq */ +[0x59] = UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), +/* vbroadcasti128 */ +[0x5a] = UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), +/* vpbroadcastb */ +[0x78] = UNARY_OP(vbroadcastb, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), +/* vpbroadcastw */ 
+[0x79] = UNARY_OP(vbroadcastw, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), #define gen_helper_aesimc_ymm NULL [0xdb] = UNARY_OP(aesimc, AES, 0), [0xdc] = BINARY_OP(aesenc, AES, 0), @@ -4323,6 +4338,24 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, op2_offset = offsetof(CPUX86State, xmm_t0); gen_lea_modrm(env, s, modrm); switch (b) { +case 0x78: /* vpbroadcastb */ +size = 8; +break; +case 0x79: /* vpbroadcas
[PATCH v2 11/42] i386: Rewrite simple integer vector helpers
Rewrite the "simple" vector integer helpers in preperation for AVX support. While the current code is able to use the same prototype for unary (a = F(b)) and binary (a = F(b, c)) operations, future changes will cause them to diverge. No functional changes to existing helpers Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 180 -- 1 file changed, 137 insertions(+), 43 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 9297c96d04..bb9cbf9ead 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -275,61 +275,148 @@ void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) } #endif -#define SSE_HELPER_B(name, F) \ +#define SSE_HELPER_1(name, elem, num, F) \ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ -d->B(0) = F(d->B(0), s->B(0)); \ -d->B(1) = F(d->B(1), s->B(1)); \ -d->B(2) = F(d->B(2), s->B(2)); \ -d->B(3) = F(d->B(3), s->B(3)); \ -d->B(4) = F(d->B(4), s->B(4)); \ -d->B(5) = F(d->B(5), s->B(5)); \ -d->B(6) = F(d->B(6), s->B(6)); \ -d->B(7) = F(d->B(7), s->B(7)); \ +d->elem(0) = F(s->elem(0)); \ +d->elem(1) = F(s->elem(1)); \ +if ((num << SHIFT) > 2) { \ +d->elem(2) = F(s->elem(2)); \ +d->elem(3) = F(s->elem(3)); \ +} \ +if ((num << SHIFT) > 4) { \ +d->elem(4) = F(s->elem(4)); \ +d->elem(5) = F(s->elem(5)); \ +d->elem(6) = F(s->elem(6)); \ +d->elem(7) = F(s->elem(7)); \ +} \ +if ((num << SHIFT) > 8) { \ +d->elem(8) = F(s->elem(8)); \ +d->elem(9) = F(s->elem(9)); \ +d->elem(10) = F(s->elem(10)); \ +d->elem(11) = F(s->elem(11)); \ +d->elem(12) = F(s->elem(12)); \ +d->elem(13) = F(s->elem(13)); \ +d->elem(14) = F(s->elem(14)); \ +d->elem(15) = F(s->elem(15)); \ +} \ +if ((num << SHIFT) > 16) { \ +d->elem(16) = F(s->elem(16)); \ +d->elem(17) = F(s->elem(17)); \ +d->elem(18) = F(s->elem(18)); \ +d->elem(19) = F(s->elem(19)); \ +d->elem(20) = F(s->elem(20)); \ +d->elem(21) = F(s->elem(21)); \ +d->elem(22) = F(s->elem(22)); \ +d->elem(23) = F(s->elem(23)); \ +d->elem(24) = F(s->elem(24)); \ 
+d->elem(25) = F(s->elem(25)); \ +d->elem(26) = F(s->elem(26)); \ +d->elem(27) = F(s->elem(27)); \ +d->elem(28) = F(s->elem(28)); \ +d->elem(29) = F(s->elem(29)); \ +d->elem(30) = F(s->elem(30)); \ +d->elem(31) = F(s->elem(31)); \ +} \ +} + +#define SSE_HELPER_B(name, F) \ +void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ +Reg *v = d; \ +d->B(0) = F(v->B(0), s->B(0)); \ +d->B(1) = F(v->B(1), s->B(1)); \ +d->B(2) = F(v->B(2), s->B
[PATCH v2 38/42] i386: Implement VPBLENDD
This is semantically equivalent to VBLENDPS. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 95ecdea8fe..73f3842c36 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3353,6 +3353,7 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] = { #define gen_helper_vpermq_xmm NULL [0x00] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), [0x01] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), /* vpermpd */ +[0x02] = BINARY_OP(blendps, AVX, SSE_OPF_AVX2), /* vpblendd */ [0x04] = UNARY_OP(vpermilps_imm, AVX, 0), [0x05] = UNARY_OP(vpermilpd_imm, AVX, 0), #define gen_helper_vpermdq_xmm NULL -- 2.36.0
[PATCH v2 17/42] i386: Destructive FP helpers for AVX
Perpare the horizontal atithmetic vector helpers for AVX These currently use a dummy Reg typed variable to store the result then assign the whole register. This will cause 128 bit operations to corrupt the upper half of the register, so replace it with explicit temporaries and element assignments. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 96 +++ 1 file changed, 70 insertions(+), 26 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 4137e6e1fa..d128af6cc8 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -1196,44 +1196,88 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length) d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length); } -void glue(helper_haddps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_haddps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { -ZMMReg r; - -r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), >sse_status); -r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), >sse_status); -r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), >sse_status); -r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), >sse_status); -MOVE(*d, r); +Reg *v = d; +float32 r0, r1, r2, r3; + +r0 = float32_add(v->ZMM_S(0), v->ZMM_S(1), >sse_status); +r1 = float32_add(v->ZMM_S(2), v->ZMM_S(3), >sse_status); +r2 = float32_add(s->ZMM_S(0), s->ZMM_S(1), >sse_status); +r3 = float32_add(s->ZMM_S(2), s->ZMM_S(3), >sse_status); +d->ZMM_S(0) = r0; +d->ZMM_S(1) = r1; +d->ZMM_S(2) = r2; +d->ZMM_S(3) = r3; +#if SHIFT == 2 +r0 = float32_add(v->ZMM_S(4), v->ZMM_S(5), >sse_status); +r1 = float32_add(v->ZMM_S(6), v->ZMM_S(7), >sse_status); +r2 = float32_add(s->ZMM_S(4), s->ZMM_S(5), >sse_status); +r3 = float32_add(s->ZMM_S(6), s->ZMM_S(7), >sse_status); +d->ZMM_S(4) = r0; +d->ZMM_S(5) = r1; +d->ZMM_S(6) = r2; +d->ZMM_S(7) = r3; +#endif } -void glue(helper_haddpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_haddpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { -ZMMReg r; +Reg *v 
= d; +float64 r0, r1; -r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), >sse_status); -r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), >sse_status); -MOVE(*d, r); +r0 = float64_add(v->ZMM_D(0), v->ZMM_D(1), >sse_status); +r1 = float64_add(s->ZMM_D(0), s->ZMM_D(1), >sse_status); +d->ZMM_D(0) = r0; +d->ZMM_D(1) = r1; +#if SHIFT == 2 +r0 = float64_add(v->ZMM_D(2), v->ZMM_D(3), >sse_status); +r1 = float64_add(s->ZMM_D(2), s->ZMM_D(3), >sse_status); +d->ZMM_D(2) = r0; +d->ZMM_D(3) = r1; +#endif } -void glue(helper_hsubps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_hsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { -ZMMReg r; - -r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), >sse_status); -r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), >sse_status); -r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), >sse_status); -r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), >sse_status); -MOVE(*d, r); +Reg *v = d; +float32 r0, r1, r2, r3; + +r0 = float32_sub(v->ZMM_S(0), v->ZMM_S(1), >sse_status); +r1 = float32_sub(v->ZMM_S(2), v->ZMM_S(3), >sse_status); +r2 = float32_sub(s->ZMM_S(0), s->ZMM_S(1), >sse_status); +r3 = float32_sub(s->ZMM_S(2), s->ZMM_S(3), >sse_status); +d->ZMM_S(0) = r0; +d->ZMM_S(1) = r1; +d->ZMM_S(2) = r2; +d->ZMM_S(3) = r3; +#if SHIFT == 2 +r0 = float32_sub(v->ZMM_S(4), v->ZMM_S(5), >sse_status); +r1 = float32_sub(v->ZMM_S(6), v->ZMM_S(7), >sse_status); +r2 = float32_sub(s->ZMM_S(4), s->ZMM_S(5), >sse_status); +r3 = float32_sub(s->ZMM_S(6), s->ZMM_S(7), >sse_status); +d->ZMM_S(4) = r0; +d->ZMM_S(5) = r1; +d->ZMM_S(6) = r2; +d->ZMM_S(7) = r3; +#endif } -void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { -ZMMReg r; +Reg *v = d; +float64 r0, r1; -r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), >sse_status); -r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), >sse_status); -MOVE(*d, r); +r0 = float64_sub(v->ZMM_D(0), v->ZMM_D(1), 
>sse_status); +r1 = float64_sub(s->ZMM_D(0), s->ZMM_D(1), >sse_status); +d->ZMM_D(0) = r0; +d->ZMM_D(1) = r1; +#if SHIFT == 2 +r0 = float64_sub(v->ZMM_D(2), v->ZMM_D(3), >sse_status); +r1 = float64_sub(s->ZMM_D(2), s->ZMM_D(3), >sse_status); +d->ZMM_D(2) = r0; +d->ZMM_D(3) = r1; +#endif } void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -- 2.36.0
[PATCH v2 30/42] i386: Implement VPERMIL
Some potentially surprising details when comparing vpermilpd v.s. vpermilps, but overall pretty straightforward. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 82 target/i386/ops_sse_header.h | 4 ++ target/i386/tcg/translate.c | 4 ++ 3 files changed, 90 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 4115c9a257..9b92b9790a 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3113,6 +3113,88 @@ void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #endif } +void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ +uint64_t r0, r1; + +r0 = v->Q((s->Q(0) >> 1) & 1); +r1 = v->Q((s->Q(1) >> 1) & 1); +d->Q(0) = r0; +d->Q(1) = r1; +#if SHIFT == 2 +r0 = v->Q(((s->Q(2) >> 1) & 1) + 2); +r1 = v->Q(((s->Q(3) >> 1) & 1) + 2); +d->Q(2) = r0; +d->Q(3) = r1; +#endif +} + +void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ +uint32_t r0, r1, r2, r3; + +r0 = v->L(s->L(0) & 3); +r1 = v->L(s->L(1) & 3); +r2 = v->L(s->L(2) & 3); +r3 = v->L(s->L(3) & 3); +d->L(0) = r0; +d->L(1) = r1; +d->L(2) = r2; +d->L(3) = r3; +#if SHIFT == 2 +r0 = v->L((s->L(4) & 3) + 4); +r1 = v->L((s->L(5) & 3) + 4); +r2 = v->L((s->L(6) & 3) + 4); +r3 = v->L((s->L(7) & 3) + 4); +d->L(4) = r0; +d->L(5) = r1; +d->L(6) = r2; +d->L(7) = r3; +#endif +} + +void glue(helper_vpermilpd_imm, SUFFIX)(CPUX86State *env, +Reg *d, Reg *s, uint32_t order) +{ +uint64_t r0, r1; + +r0 = s->Q((order >> 0) & 1); +r1 = s->Q((order >> 1) & 1); +d->Q(0) = r0; +d->Q(1) = r1; +#if SHIFT == 2 +r0 = s->Q(((order >> 2) & 1) + 2); +r1 = s->Q(((order >> 3) & 1) + 2); +d->Q(2) = r0; +d->Q(3) = r1; +#endif +} + +void glue(helper_vpermilps_imm, SUFFIX)(CPUX86State *env, +Reg *d, Reg *s, uint32_t order) +{ +uint32_t r0, r1, r2, r3; + +r0 = s->L((order >> 0) & 3); +r1 = s->L((order >> 2) & 3); +r2 = s->L((order >> 4) & 3); +r3 = s->L((order >> 6) & 3); +d->L(0) = r0; +d->L(1) = r1; +d->L(2) = r2; +d->L(3) = r3; +#if SHIFT == 2 
+r0 = s->L(((order >> 0) & 3) + 4); +r1 = s->L(((order >> 2) & 3) + 4); +r2 = s->L(((order >> 4) & 3) + 4); +r3 = s->L(((order >> 6) & 3) + 4); +d->L(4) = r0; +d->L(5) = r1; +d->L(6) = r2; +d->L(7) = r3; +#endif +} + #if SHIFT == 2 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 51e02cd4fa..c52169a030 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -417,6 +417,10 @@ DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpermilpd_imm, SUFFIX), void, env, Reg, Reg, i32) +DEF_HELPER_4(glue(vpermilps_imm, SUFFIX), void, env, Reg, Reg, i32) #if SHIFT == 2 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_1(vzeroall, void, env) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 59ab1dc562..358c3ecb0b 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3251,6 +3251,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x09] = BINARY_OP_MMX(psignw, SSSE3), [0x0a] = BINARY_OP_MMX(psignd, SSSE3), [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3), +[0x0c] = BINARY_OP(vpermilps, AVX, 0), +[0x0d] = BINARY_OP(vpermilpd, AVX, 0), [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), [0x14] = BLENDV_OP(blendvps, SSE41, 0), [0x15] = BLENDV_OP(blendvpd, SSE41, 0), @@ -3311,6 +3313,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { /* prefix [66] 0f 3a */ static const struct SSEOpHelper_table7 sse_op_table7[256] = { +[0x04] = UNARY_OP(vpermilps_imm, AVX, 0), +[0x05] = UNARY_OP(vpermilpd_imm, AVX, 0), [0x08] = 
UNARY_OP(roundps, SSE41, 0), [0x09] = UNARY_OP(roundpd, SSE41, 0), #define gen_helper_roundss_ymm NULL -- 2.36.0
[PATCH v2 33/42] i386: Implement VMASKMOV
Decoding these is a bit messy, but at least the integer and float variants have the same semantics once decoded. We don't try and be clever with the load forms, instead load the whole vector then mask out the elements we want. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 48 target/i386/ops_sse_header.h | 4 +++ target/i386/tcg/translate.c | 34 + 3 files changed, 86 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index edf14a25d7..ffcba3d02c 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3240,6 +3240,54 @@ void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C); } +void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env, +Reg *s, Reg *v, target_ulong a0) +{ +int i; + +for (i = 0; i < (2 << SHIFT); i++) { +if (v->L(i) >> 31) { +cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC()); +} +} +} + +void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env, +Reg *s, Reg *v, target_ulong a0) +{ +int i; + +for (i = 0; i < (1 << SHIFT); i++) { +if (v->Q(i) >> 63) { +cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC()); +} +} +} + +void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ +d->L(0) = (v->L(0) >> 31) ? s->L(0) : 0; +d->L(1) = (v->L(1) >> 31) ? s->L(1) : 0; +d->L(2) = (v->L(2) >> 31) ? s->L(2) : 0; +d->L(3) = (v->L(3) >> 31) ? s->L(3) : 0; +#if SHIFT == 2 +d->L(4) = (v->L(4) >> 31) ? s->L(4) : 0; +d->L(5) = (v->L(5) >> 31) ? s->L(5) : 0; +d->L(6) = (v->L(6) >> 31) ? s->L(6) : 0; +d->L(7) = (v->L(7) >> 31) ? s->L(7) : 0; +#endif +} + +void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ +d->Q(0) = (v->Q(0) >> 63) ? s->Q(0) : 0; +d->Q(1) = (v->Q(1) >> 63) ? s->Q(1) : 0; +#if SHIFT == 2 +d->Q(2) = (v->Q(2) >> 63) ? s->Q(2) : 0; +d->Q(3) = (v->Q(3) >> 63) ? 
s->Q(3) : 0; +#endif +} + #if SHIFT == 2 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 8b93b8e6d6..a7a6bf6b10 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -429,6 +429,10 @@ DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl) +DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl) +DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg) #if SHIFT == 2 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_1(vzeroall, void, env) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 2fbb7bfcad..e00195d301 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3277,6 +3277,10 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x29] = BINARY_OP(pcmpeqq, SSE41, SSE_OPF_MMX), [0x2a] = SPECIAL_OP(SSE41), /* movntqda */ [0x2b] = BINARY_OP(packusdw, SSE41, SSE_OPF_MMX), +[0x2c] = BINARY_OP(vpmaskmovd, AVX, 0), /* vmaskmovps */ +[0x2d] = BINARY_OP(vpmaskmovq, AVX, 0), /* vmaskmovpd */ +[0x2e] = SPECIAL_OP(AVX), /* vmaskmovps */ +[0x2f] = SPECIAL_OP(AVX), /* vmaskmovpd */ [0x30] = UNARY_OP(pmovzxbw, SSE41, SSE_OPF_MMX), [0x31] = UNARY_OP(pmovzxbd, SSE41, SSE_OPF_MMX), [0x32] = UNARY_OP(pmovzxbq, SSE41, SSE_OPF_MMX), @@ -3308,6 +3312,9 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x78] = UNARY_OP(vbroadcastb, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), /* vpbroadcastw */ [0x79] = UNARY_OP(vbroadcastw, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), +/* vpmaskmovd, vpmaskmovq */ +[0x8c] = BINARY_OP(vpmaskmovd, AVX, 
SSE_OPF_AVX2), +[0x8e] = SPECIAL_OP(AVX), /* vpmaskmovd, vpmaskmovq */ #define gen_helper_aesimc_ymm NULL [0xdb] = UNARY_OP(aesimc, AES, 0), [0xdc] = BINARY_OP(aesenc, AES, 0), @@ -3369,6 +3376,11 @@ static const SSEFunc_0_eppp sse_op_table8[3][2] = { SSE_OP(vpsravq), SSE_OP(vpsllvq), }; + +static const SSEFunc_0_eppt sse_op_table9[2][2] = { +SSE_OP(vpmaskmovd_st), +SSE_OP(vpmaskmovq_st), +}; #undef SSE_OP /* VEX prefix not allowed */ @@ -4394,6 +4406,22 @@ static void gen_sse(C
[PATCH v2 37/42] i386: Implement VBLENDV
The AVX variants of the BLENDV instructions use a different opcode prefix to support the additional operands. We already modified the helper functions in anticipation of this. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 4072fa28d3..95ecdea8fe 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3384,6 +3384,9 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] = { [0x42] = BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX), [0x44] = BINARY_OP(pclmulqdq, PCLMULQDQ, 0), [0x46] = BINARY_OP(vpermdq, AVX, SSE_OPF_AVX2), /* vperm2i128 */ +[0x4a] = BLENDV_OP(blendvps, AVX, 0), +[0x4b] = BLENDV_OP(blendvpd, AVX, 0), +[0x4c] = BLENDV_OP(pblendvb, AVX, SSE_OPF_MMX), #define gen_helper_pcmpestrm_ymm NULL [0x60] = CMP_OP(pcmpestrm, SSE42), #define gen_helper_pcmpestri_ymm NULL @@ -5268,6 +5271,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } /* SSE */ +if (op7.flags & SSE_OPF_BLENDV && !(s->prefix & PREFIX_VEX)) { +/* Only VEX encodings are valid for these blendv opcodes */ +goto illegal_op; +} op1_offset = ZMM_OFFSET(reg); if (mod == 3) { op2_offset = ZMM_OFFSET(rm | REX_B(s)); @@ -5316,8 +5323,15 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, op7.fn[b1].op1(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(val)); } else { tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset); -op7.fn[b1].op2(cpu_env, s->ptr0, s->ptr2, s->ptr1, - tcg_const_i32(val)); +if (op7.flags & SSE_OPF_BLENDV) { +TCGv_ptr mask = tcg_temp_new_ptr(); +tcg_gen_addi_ptr(mask, cpu_env, ZMM_OFFSET(val >> 4)); +op7.fn[b1].op3(cpu_env, s->ptr0, s->ptr2, s->ptr1, mask); +tcg_temp_free_ptr(mask); +} else { +op7.fn[b1].op2(cpu_env, s->ptr0, s->ptr2, s->ptr1, + tcg_const_i32(val)); +} } if ((op7.flags & SSE_OPF_CMP) == 0 && s->vex_l == 0) { gen_clear_ymmh(s, reg); -- 2.36.0
[PATCH v2 10/42] i386: Rewrite vector shift helper
Rewrite the vector shift helpers in preperation for AVX support (3 operand form and 256 bit vectors). For now keep the existing two operand interface. No functional changes to existing helpers. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 250 ++ 1 file changed, 133 insertions(+), 117 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 23daab6b50..9297c96d04 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -63,199 +63,215 @@ #define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE) #endif -void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +#if SHIFT == 0 +#define SHIFT_HELPER_BODY(n, elem, F) do { \ +d->elem(0) = F(s->elem(0), shift); \ +if ((n) > 1) { \ +d->elem(1) = F(s->elem(1), shift); \ +} \ +if ((n) > 2) { \ +d->elem(2) = F(s->elem(2), shift); \ +d->elem(3) = F(s->elem(3), shift); \ +} \ +if ((n) > 4) { \ +d->elem(4) = F(s->elem(4), shift); \ +d->elem(5) = F(s->elem(5), shift); \ +d->elem(6) = F(s->elem(6), shift); \ +d->elem(7) = F(s->elem(7), shift); \ +} \ +if ((n) > 8) { \ +d->elem(8) = F(s->elem(8), shift); \ +d->elem(9) = F(s->elem(9), shift); \ +d->elem(10) = F(s->elem(10), shift);\ +d->elem(11) = F(s->elem(11), shift);\ +d->elem(12) = F(s->elem(12), shift);\ +d->elem(13) = F(s->elem(13), shift);\ +d->elem(14) = F(s->elem(14), shift);\ +d->elem(15) = F(s->elem(15), shift);\ +} \ +} while (0) + +#define FPSRL(x, c) ((x) >> shift) +#define FPSRAW(x, c) ((int16_t)(x) >> shift) +#define FPSRAL(x, c) ((int32_t)(x) >> shift) +#define FPSLL(x, c) ((x) << shift) +#endif + +void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { +Reg *s = d; int shift; - -if (s->Q(0) > 15) { +if (c->Q(0) > 15) { d->Q(0) = 0; -#if SHIFT == 1 -d->Q(1) = 0; -#endif +XMM_ONLY(d->Q(1) = 0;) +YMM_ONLY( +d->Q(2) = 0; +d->Q(3) = 0; +) } else { -shift = s->B(0); -d->W(0) >>= shift; -d->W(1) >>= shift; -d->W(2) >>= shift; -d->W(3) >>= shift; -#if SHIFT == 1 -d->W(4) >>= shift; -d->W(5) >>= shift; -d->W(6) >>= 
shift; -d->W(7) >>= shift; -#endif +shift = c->B(0); +SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRL); } } -void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { +Reg *s = d; int shift; - -if (s->Q(0) > 15) { -shift = 15; +if (c->Q(0) > 15) { +d->Q(0) = 0; +XMM_ONLY(d->Q(1) = 0;) +YMM_ONLY( +d->Q(2) = 0; +d->Q(3) = 0; +) } else { -shift = s->B(0); +shift = c->B(0); +SHIFT_HELPER_BODY(4 << SHIFT, W, FPSLL); } -d->W(0) = (int16_t)d->W(0) >> shift; -d->W(1) = (int16_t)d->W(1) >> shift; -d->W(2) = (int16_t)d->W(2) >> shift; -d->W(3) = (int16_t)d->W(3) >> shift; -#if SHIFT == 1 -d->W(4) = (int16_t)d->W(4) >> shift; -d->W(5) = (int16_t)d->W(5) >> shift; -d->W(6) = (int16_t)d->W(6) >> shift; -d->W(7) = (int16_t)d->W(7) >> shift; -#endif } -void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { +Reg *s = d; int shift; - -if (s->Q(0) > 15) { -d->Q(0) = 0; -#if SHIFT == 1 -d->Q(1) = 0; -#endif +if (c->Q(0) > 15) { +shift = 15; } else { -shift = s->B(0); -d->W(0) <<= shift; -d->W(1) <<= shift; -d->W(2) <<= shift; -d->W(3) <<= shift; -#if SHIFT == 1 -d->W(4) <<= shift; -d->W(5) <<= shift; -d->W(6) <<= shift; -d->W(7) <<= shift; -#endif +shift = c->B(0); } +SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRAW); } -void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void
[PATCH v2 05/42] i386: Rework sse_op_table6/7
Add a flags field each row in sse_op_table6 and sse_op_table7. Initially this is only used as a replacement for the magic SSE41_SPECIAL pointer. The other flags will become relevant as the rest of the avx implementation is built out. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 232 1 file changed, 132 insertions(+), 100 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 7fec582358..5335b86c01 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2977,7 +2977,6 @@ static const struct SSEOpHelper_table1 sse_op_table1[256] = { #undef SSE_SPECIAL #define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } -#define SSE_SPECIAL_FN ((void *)1) static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = { [0 + 2] = MMX_OP2(psrlw), @@ -3061,113 +3060,134 @@ static const SSEFunc_0_epp sse_op_table5[256] = { [0xbf] = gen_helper_pavgb_mmx /* pavgusb */ }; -struct SSEOpHelper_epp { +struct SSEOpHelper_table6 { SSEFunc_0_epp op[2]; uint32_t ext_mask; +int flags; }; -struct SSEOpHelper_eppi { +struct SSEOpHelper_table7 { SSEFunc_0_eppi op[2]; uint32_t ext_mask; +int flags; }; -#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } -#define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 } -#define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 } -#define SSE41_SPECIAL { { NULL, SSE_SPECIAL_FN }, CPUID_EXT_SSE41 } -#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \ -CPUID_EXT_PCLMULQDQ } -#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES } - -static const struct SSEOpHelper_epp sse_op_table6[256] = { -[0x00] = SSSE3_OP(pshufb), -[0x01] = SSSE3_OP(phaddw), -[0x02] = SSSE3_OP(phaddd), -[0x03] = SSSE3_OP(phaddsw), -[0x04] = SSSE3_OP(pmaddubsw), -[0x05] = SSSE3_OP(phsubw), -[0x06] = SSSE3_OP(phsubd), -[0x07] = SSSE3_OP(phsubsw), -[0x08] = SSSE3_OP(psignb), -[0x09] = SSSE3_OP(psignw), -[0x0a] = SSSE3_OP(psignd), -[0x0b] = 
SSSE3_OP(pmulhrsw), -[0x10] = SSE41_OP(pblendvb), -[0x14] = SSE41_OP(blendvps), -[0x15] = SSE41_OP(blendvpd), -[0x17] = SSE41_OP(ptest), -[0x1c] = SSSE3_OP(pabsb), -[0x1d] = SSSE3_OP(pabsw), -[0x1e] = SSSE3_OP(pabsd), -[0x20] = SSE41_OP(pmovsxbw), -[0x21] = SSE41_OP(pmovsxbd), -[0x22] = SSE41_OP(pmovsxbq), -[0x23] = SSE41_OP(pmovsxwd), -[0x24] = SSE41_OP(pmovsxwq), -[0x25] = SSE41_OP(pmovsxdq), -[0x28] = SSE41_OP(pmuldq), -[0x29] = SSE41_OP(pcmpeqq), -[0x2a] = SSE41_SPECIAL, /* movntqda */ -[0x2b] = SSE41_OP(packusdw), -[0x30] = SSE41_OP(pmovzxbw), -[0x31] = SSE41_OP(pmovzxbd), -[0x32] = SSE41_OP(pmovzxbq), -[0x33] = SSE41_OP(pmovzxwd), -[0x34] = SSE41_OP(pmovzxwq), -[0x35] = SSE41_OP(pmovzxdq), -[0x37] = SSE42_OP(pcmpgtq), -[0x38] = SSE41_OP(pminsb), -[0x39] = SSE41_OP(pminsd), -[0x3a] = SSE41_OP(pminuw), -[0x3b] = SSE41_OP(pminud), -[0x3c] = SSE41_OP(pmaxsb), -[0x3d] = SSE41_OP(pmaxsd), -[0x3e] = SSE41_OP(pmaxuw), -[0x3f] = SSE41_OP(pmaxud), -[0x40] = SSE41_OP(pmulld), -[0x41] = SSE41_OP(phminposuw), -[0xdb] = AESNI_OP(aesimc), -[0xdc] = AESNI_OP(aesenc), -[0xdd] = AESNI_OP(aesenclast), -[0xde] = AESNI_OP(aesdec), -[0xdf] = AESNI_OP(aesdeclast), +#define gen_helper_special_xmm NULL + +#define OP(name, op, flags, ext, mmx_name) \ +{{mmx_name, gen_helper_ ## name ## _xmm}, CPUID_EXT_ ## ext, flags} +#define BINARY_OP_MMX(name, ext) \ +OP(name, op2, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx) +#define BINARY_OP(name, ext, flags) \ +OP(name, op2, flags, ext, NULL) +#define UNARY_OP_MMX(name, ext) \ +OP(name, op1, SSE_OPF_V0 | SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx) +#define UNARY_OP(name, ext, flags) \ +OP(name, op1, SSE_OPF_V0 | flags, ext, NULL) +#define BLENDV_OP(name, ext, flags) OP(name, op3, SSE_OPF_BLENDV, ext, NULL) +#define CMP_OP(name, ext) OP(name, op1, SSE_OPF_CMP | SSE_OPF_V0, ext, NULL) +#define SPECIAL_OP(ext) OP(special, op1, SSE_OPF_SPECIAL, ext, NULL) + +/* prefix [66] 0f 38 */ +static const struct SSEOpHelper_table6 sse_op_table6[256] = { 
+[0x00] = BINARY_OP_MMX(pshufb, SSSE3), +[0x01] = BINARY_OP_MMX(phaddw, SSSE3), +[0x02] = BINARY_OP_MMX(phaddd, SSSE3), +[0x03] = BINARY_OP_MMX(phaddsw, SSSE3), +[0x04] = BINARY_OP_MMX(pmaddubsw, SSSE3), +[0x05] = BINARY_OP_MMX(phsubw, SSSE3), +[0x06] = BINARY_OP_MMX(phsubd, SSSE3), +[0x07] = BINARY_OP_MMX(phsubsw, SSSE3), +[0x08] = BINARY_OP_MMX(psignb, SSSE3), +[0x09] = BINARY_OP_MMX(psignw, SSSE3), +[0x0a] = BINARY_OP_MMX(psignd, SSSE3), +[0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3), +[0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), +[0x14] = BLENDV_OP(blendvps, SSE41, 0), +[0x15] = BLENDV_OP(blendvpd, SSE41, 0
[PATCH v2 08/42] i386: Add ZMM_OFFSET macro
Add a convenience macro to get the address of an xmm_regs element within CPUX86State. This was originally going to be the basis of an implementation that broke operations into 128 bit chunks. I scrapped that idea, so this is now a purely cosmetic change. But I think a worthwhile one - it reduces the number of function calls that need to be split over multiple lines. No functional changes. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 60 + 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 2f5cc24e0c..e9e6062b7f 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2777,6 +2777,8 @@ static inline void gen_op_movq_env_0(DisasContext *s, int d_offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset); } +#define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg]) + typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val); @@ -3329,14 +3331,14 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (mod == 3) goto illegal_op; gen_lea_modrm(env, s, modrm); -gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); +gen_sto_env_A0(s, ZMM_OFFSET(reg)); break; case 0x3f0: /* lddqu */ CHECK_AVX_V0(s); if (mod == 3) goto illegal_op; gen_lea_modrm(env, s, modrm); -gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); +gen_ldo_env_A0(s, ZMM_OFFSET(reg)); break; case 0x22b: /* movntss */ case 0x32b: /* movntsd */ @@ -3375,15 +3377,13 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, #ifdef TARGET_X86_64 if (s->dflag == MO_64) { gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); -tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); +tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); gen_helper_movq_mm_T0_xmm(s->ptr0, s->T0); } else #endif { gen_ldst_modrm(env, 
s, modrm, MO_32, OR_TMP0, 0); -tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); +tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); gen_helper_movl_mm_T0_xmm(s->ptr0, s->tmp2_i32); } @@ -3410,11 +3410,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, CHECK_AVX_V0(s); if (mod != 3) { gen_lea_modrm(env, s, modrm); -gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); +gen_ldo_env_A0(s, ZMM_OFFSET(reg)); } else { rm = (modrm & 7) | REX_B(s); -gen_op_movo(s, offsetof(CPUX86State, xmm_regs[reg]), -offsetof(CPUX86State,xmm_regs[rm])); +gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(rm)); } break; case 0x210: /* movss xmm, ea */ @@ -3474,7 +3473,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, CHECK_AVX_V0(s); if (mod != 3) { gen_lea_modrm(env, s, modrm); -gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); +gen_ldo_env_A0(s, ZMM_OFFSET(reg)); } else { rm = (modrm & 7) | REX_B(s); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)), @@ -3519,7 +3518,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, CHECK_AVX_V0(s); if (mod != 3) { gen_lea_modrm(env, s, modrm); -gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); +gen_ldo_env_A0(s, ZMM_OFFSET(reg)); } else { rm = (modrm & 7) | REX_B(s); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)), @@ -3542,8 +3541,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, goto illegal_op; field_length = x86_ldub_code(env, s) & 0x3F; bit_index = x86_ldub_code(env, s) & 0x3F; -tcg_gen_addi_ptr(s->ptr0, cpu_env, -offsetof(CPUX86State,xmm_regs[reg])); +tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); if (b1 == 1) gen_helper_extrq_i(cpu_env, s->ptr0, tcg_
[PATCH v2 07/42] Enforce VEX encoding restrictions
Add CHECK_AVX* macros, and use them to validate VEX encoded AVX instructions All AVX instructions require both CPU and OS support, this is encapsulated by HF_AVX_EN. Some also require specific values in the VEX.L and VEX.V fields. Some (mostly integer operations) also require AVX2 Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 159 +--- 1 file changed, 149 insertions(+), 10 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 66ba690b7d..2f5cc24e0c 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3185,10 +3185,54 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] = { goto illegal_op; \ } while (0) +/* + * VEX encodings require AVX + * Allow legacy SSE encodings even if AVX not enabled + */ +#define CHECK_AVX(s) do { \ +if ((s->prefix & PREFIX_VEX) \ +&& !(env->hflags & HF_AVX_EN_MASK)) \ +goto illegal_op; \ +} while (0) + +/* If a VEX prefix is used then it must have V=b */ +#define CHECK_AVX_V0(s) do { \ +CHECK_AVX(s); \ +if ((s->prefix & PREFIX_VEX) && (s->vex_v != 0)) \ +goto illegal_op; \ +} while (0) + +/* If a VEX prefix is used then it must have L=0 */ +#define CHECK_AVX_128(s) do { \ +CHECK_AVX(s); \ +if ((s->prefix & PREFIX_VEX) && (s->vex_l != 0)) \ +goto illegal_op; \ +} while (0) + +/* If a VEX prefix is used then it must have V=b and L=0 */ +#define CHECK_AVX_V0_128(s) do { \ +CHECK_AVX(s); \ +if ((s->prefix & PREFIX_VEX) && (s->vex_v != 0 || s->vex_l != 0)) \ +goto illegal_op; \ +} while (0) + +/* 256-bit (ymm) variants require AVX2 */ +#define CHECK_AVX2_256(s) do { \ +if (s->vex_l && !(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2)) \ +goto illegal_op; \ +} while (0) + +/* Requires AVX2 and VEX encoding */ +#define CHECK_AVX2(s) do { \ +if ((s->prefix & PREFIX_VEX) == 0 \ +|| !(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2)) \ +goto illegal_op; \ +} while (0) + static void gen_sse(CPUX86State *env, DisasContext *s, int b, target_ulong pc_start) { -int b1, 
op1_offset, op2_offset, is_xmm, val; +int b1, op1_offset, op2_offset, is_xmm, val, scalar_op; int modrm, mod, rm, reg; struct SSEOpHelper_table1 sse_op; struct SSEOpHelper_table6 op6; @@ -3228,15 +3272,18 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); return; } -if (s->flags & HF_EM_MASK) { -illegal_op: -gen_illegal_opcode(s); -return; -} -if (is_xmm -&& !(s->flags & HF_OSFXSR_MASK) -&& (b != 0x38 && b != 0x3a)) { -goto unknown_op; +/* VEX encoded instuctions ignore EM bit. See also CHECK_AVX */ +if (!(s->prefix & PREFIX_VEX)) { +if (s->flags & HF_EM_MASK) { +illegal_op: +gen_illegal_opcode(s); +return; +} +if (is_xmm +&& !(s->flags & HF_OSFXSR_MASK) +&& (b != 0x38 && b != 0x3a)) { +goto unknown_op; +} } if (b == 0x0e) { if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { @@ -3278,12 +3325,14 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x1e7: /* movntdq */ case 0x02b: /* movntps */ case 0x12b: /* movntps */ +CHECK_AVX_V0(s); if (mod == 3) goto illegal_op; gen_lea_modrm(env, s, modrm); gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); break; case 0x3f0: /* lddqu */ +CHECK_AVX_V0(s); if (mod == 3) goto illegal_op; gen_lea_modrm(env, s, modrm); @@ -3291,6 +3340,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0x22b: /* movntss */ case 0x32b: /* movntsd */ +CHECK_AVX_V0_128(s); if (mod == 3) goto illegal_op; gen_lea_modrm(env, s, modrm); @@ -3321,6 +3371,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x16e: /* movd xmm, ea */ +CHECK_AVX_V0_128(s); #ifdef TARGET_X86_64 if (s->dflag == MO_64) { gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); @@ -3356,6 +3407,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x128: /* movapd */ case 0x16f: /* movdqa xmm, ea */ case 0x26f: /* movdqu xmm, ea */ +CHECK_AVX_V0(s); if (mod != 3) {
[PATCH v2 03/42] Add AVX_EN hflag
Add a new hflag bit to determine whether AVX instructions are allowed Signed-off-by: Paul Brook --- target/i386/cpu.h| 3 +++ target/i386/helper.c | 12 target/i386/tcg/fpu_helper.c | 1 + 3 files changed, 16 insertions(+) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 9661f9fbd1..65200a1917 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -169,6 +169,7 @@ typedef enum X86Seg { #define HF_MPX_EN_SHIFT 25 /* MPX Enabled (CR4+XCR0+BNDCFGx) */ #define HF_MPX_IU_SHIFT 26 /* BND registers in-use */ #define HF_UMIP_SHIFT 27 /* CR4.UMIP */ +#define HF_AVX_EN_SHIFT 28 /* AVX Enabled (CR4+XCR0) */ #define HF_CPL_MASK (3 << HF_CPL_SHIFT) #define HF_INHIBIT_IRQ_MASK (1 << HF_INHIBIT_IRQ_SHIFT) @@ -195,6 +196,7 @@ typedef enum X86Seg { #define HF_MPX_EN_MASK (1 << HF_MPX_EN_SHIFT) #define HF_MPX_IU_MASK (1 << HF_MPX_IU_SHIFT) #define HF_UMIP_MASK (1 << HF_UMIP_SHIFT) +#define HF_AVX_EN_MASK (1 << HF_AVX_EN_SHIFT) /* hflags2 */ @@ -2035,6 +2037,7 @@ void host_cpuid(uint32_t function, uint32_t count, /* helper.c */ void x86_cpu_set_a20(X86CPU *cpu, int a20_state); +void cpu_sync_avx_hflag(CPUX86State *env); #ifndef CONFIG_USER_ONLY static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs) diff --git a/target/i386/helper.c b/target/i386/helper.c index fa409e9c44..30083c9cff 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -29,6 +29,17 @@ #endif #include "qemu/log.h" +void cpu_sync_avx_hflag(CPUX86State *env) +{ +if ((env->cr[4] & CR4_OSXSAVE_MASK) +&& (env->xcr0 & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) +== (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) { +env->hflags |= HF_AVX_EN_MASK; +} else{ +env->hflags &= ~HF_AVX_EN_MASK; +} +} + void cpu_sync_bndcs_hflags(CPUX86State *env) { uint32_t hflags = env->hflags; @@ -209,6 +220,7 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4) env->hflags = hflags; cpu_sync_bndcs_hflags(env); +cpu_sync_avx_hflag(env); } #if !defined(CONFIG_USER_ONLY) diff --git a/target/i386/tcg/fpu_helper.c 
b/target/i386/tcg/fpu_helper.c index ebf5e73df9..b391b69635 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -2943,6 +2943,7 @@ void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask) env->xcr0 = mask; cpu_sync_bndcs_hflags(env); +cpu_sync_avx_hflag(env); return; do_gpf: -- 2.36.0
[PATCH v2 06/42] i386: Add CHECK_NO_VEX
Reject invalid VEX encodings on MMX instructions. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 26 ++ 1 file changed, 26 insertions(+) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 5335b86c01..66ba690b7d 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3179,6 +3179,12 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] = { #undef BLENDV_OP #undef SPECIAL_OP +/* VEX prefix not allowed */ +#define CHECK_NO_VEX(s) do { \ +if (s->prefix & PREFIX_VEX) \ +goto illegal_op; \ +} while (0) + static void gen_sse(CPUX86State *env, DisasContext *s, int b, target_ulong pc_start) { @@ -3262,6 +3268,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, b |= (b1 << 8); switch(b) { case 0x0e7: /* movntq */ +CHECK_NO_VEX(s); if (mod == 3) { goto illegal_op; } @@ -3297,6 +3304,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x6e: /* movd mm, ea */ +CHECK_NO_VEX(s); #ifdef TARGET_X86_64 if (s->dflag == MO_64) { gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); @@ -3330,6 +3338,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x6f: /* movq mm, ea */ +CHECK_NO_VEX(s); if (mod != 3) { gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); @@ -3464,6 +3473,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0x178: case 0x378: +CHECK_NO_VEX(s); { int bit_index, field_length; @@ -3484,6 +3494,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x7e: /* movd ea, mm */ +CHECK_NO_VEX(s); #ifdef TARGET_X86_64 if (s->dflag == MO_64) { tcg_gen_ld_i64(s->T0, cpu_env, @@ -3524,6 +3535,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1))); break; case 0x7f: /* movq ea, mm */ +CHECK_NO_VEX(s); if (mod != 3) { gen_lea_modrm(env, s, modrm); gen_stq_env_A0(s, 
offsetof(CPUX86State, fpregs[reg].mmx)); @@ -3607,6 +3619,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, offsetof(CPUX86State, xmm_t0.ZMM_L(1))); op1_offset = offsetof(CPUX86State,xmm_t0); } else { +CHECK_NO_VEX(s); tcg_gen_movi_tl(s->T0, val); tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, mmx_t0.MMX_L(0))); @@ -3648,6 +3661,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0x02a: /* cvtpi2ps */ case 0x12a: /* cvtpi2pd */ +CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); if (mod != 3) { gen_lea_modrm(env, s, modrm); @@ -3693,6 +3707,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x12c: /* cvttpd2pi */ case 0x02d: /* cvtps2pi */ case 0x12d: /* cvtpd2pi */ +CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); if (mod != 3) { gen_lea_modrm(env, s, modrm); @@ -3766,6 +3781,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, tcg_gen_st16_tl(s->T0, cpu_env, offsetof(CPUX86State,xmm_regs[reg].ZMM_W(val))); } else { +CHECK_NO_VEX(s); val &= 3; tcg_gen_st16_tl(s->T0, cpu_env, offsetof(CPUX86State,fpregs[reg].mmx.MMX_W(val))); @@ -3805,6 +3821,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x2d6: /* movq2dq */ +CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); rm = (modrm & 7); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), @@ -3812,6 +3829,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1))); break; case 0x3d6: /* movdq2q */ +CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); rm = (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, fpregs[reg & 7].mmx), @@ -3827,6 +3845,7 @@ static void
[PATCH v2 04/42] i386: Rework sse_op_table1
Add a flags field each row in sse_op_table1. Initially this is only used as a replacement for the magic SSE_SPECIAL and SSE_DUMMY pointers, the other flags will become relevant as the rest of the AVX implementation is built out. Signed-off-by: Paul Brook --- target/i386/tcg/translate.c | 316 +--- 1 file changed, 186 insertions(+), 130 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index b7972f0ff5..7fec582358 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2788,146 +2788,196 @@ typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val); typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv val); -#define SSE_SPECIAL ((void *)1) -#define SSE_DUMMY ((void *)2) +#define SSE_OPF_V0(1 << 0) /* vex.v must be b (only 2 operands) */ +#define SSE_OPF_CMP (1 << 1) /* does not write for first operand */ +#define SSE_OPF_BLENDV(1 << 2) /* blendv* instruction */ +#define SSE_OPF_SPECIAL (1 << 3) /* magic */ +#define SSE_OPF_3DNOW (1 << 4) /* 3DNow! 
instruction */ +#define SSE_OPF_MMX (1 << 5) /* MMX/integer/AVX2 instruction */ +#define SSE_OPF_SCALAR(1 << 6) /* Has SSE scalar variants */ +#define SSE_OPF_AVX2 (1 << 7) /* AVX2 instruction */ +#define SSE_OPF_SHUF (1 << 9) /* pshufx/shufpx */ + +#define OP(op, flags, a, b, c, d) \ +{flags, {a, b, c, d} } + +#define MMX_OP(x) OP(op2, SSE_OPF_MMX, \ +gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm, NULL, NULL) + +#define SSE_FOP(name) OP(op2, SSE_OPF_SCALAR, \ +gen_helper_##name##ps, gen_helper_##name##pd, \ +gen_helper_##name##ss, gen_helper_##name##sd) +#define SSE_OP(sname, dname, op, flags) OP(op, flags, \ +gen_helper_##sname##_xmm, gen_helper_##dname##_xmm, NULL, NULL) + +struct SSEOpHelper_table1 { +int flags; +SSEFunc_0_epp op[4]; +}; -#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } -#define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \ - gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, } +#define SSE_3DNOW { SSE_OPF_3DNOW } +#define SSE_SPECIAL { SSE_OPF_SPECIAL } -static const SSEFunc_0_epp sse_op_table1[256][4] = { +static const struct SSEOpHelper_table1 sse_op_table1[256] = { /* 3DNow! extensions */ -[0x0e] = { SSE_DUMMY }, /* femms */ -[0x0f] = { SSE_DUMMY }, /* pf... */ +[0x0e] = SSE_SPECIAL, /* femms */ +[0x0f] = SSE_3DNOW, /* pf... 
(sse_op_table5) */ /* pure SSE operations */ -[0x10] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */ -[0x11] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */ -[0x12] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd, movsldup, movddup */ -[0x13] = { SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd */ -[0x14] = { gen_helper_punpckldq_xmm, gen_helper_punpcklqdq_xmm }, -[0x15] = { gen_helper_punpckhdq_xmm, gen_helper_punpckhqdq_xmm }, -[0x16] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd, movshdup */ -[0x17] = { SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd */ - -[0x28] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ -[0x29] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ -[0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */ -[0x2b] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd, movntss, movntsd */ -[0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */ -[0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */ -[0x2e] = { gen_helper_ucomiss, gen_helper_ucomisd }, -[0x2f] = { gen_helper_comiss, gen_helper_comisd }, -[0x50] = { SSE_SPECIAL, SSE_SPECIAL }, /* movmskps, movmskpd */ -[0x51] = SSE_FOP(sqrt), -[0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL }, -[0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL }, -[0x54] = { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */ -[0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd */ -[0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */ -[0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */ +[0x10] = SSE_SPECIAL, /* movups, movupd, movss, movsd */ +[0x11] = SSE_SPECIAL, /* movups, movupd, movss, movsd 
*/ +[0x12] = SSE_SPECIAL, /* movlps, movlpd, movsldup, movddup */ +[0x13] = SSE_SPECIAL, /* movlps, movlpd */ +[0x14] = SSE_OP(punpckldq, punpcklqdq, op2, 0), /* unpcklps, unpcklpd */ +[0x15] = SSE_OP
[PATCH v2 02/42] i386: DPPS rounding fix
The DPPS (Dot Product) instruction is defined to first sum pairs of intermediate results, then sum those values to get the final result. i.e. (A+B)+(C+D) We incrementally sum the results, i.e. ((A+B)+C)+D, which can result in incorrect rounding. For consistency, also remove the redundant (but harmless) add operation from DPPD Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 47 +++ 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 535440f882..a5a48a20f6 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -1934,32 +1934,36 @@ SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) { -float32 iresult = float32_zero; +float32 prod, iresult, iresult2; +/* + * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D + * to correctly round the intermediate results + */ if (mask & (1 << 4)) { -iresult = float32_add(iresult, - float32_mul(d->ZMM_S(0), s->ZMM_S(0), - >sse_status), - >sse_status); +iresult = float32_mul(d->ZMM_S(0), s->ZMM_S(0), >sse_status); +} else { +iresult = float32_zero; } if (mask & (1 << 5)) { -iresult = float32_add(iresult, - float32_mul(d->ZMM_S(1), s->ZMM_S(1), - >sse_status), - >sse_status); +prod = float32_mul(d->ZMM_S(1), s->ZMM_S(1), >sse_status); +} else { +prod = float32_zero; } +iresult = float32_add(iresult, prod, >sse_status); if (mask & (1 << 6)) { -iresult = float32_add(iresult, - float32_mul(d->ZMM_S(2), s->ZMM_S(2), - >sse_status), - >sse_status); +iresult2 = float32_mul(d->ZMM_S(2), s->ZMM_S(2), >sse_status); +} else { +iresult2 = float32_zero; } if (mask & (1 << 7)) { -iresult = float32_add(iresult, - float32_mul(d->ZMM_S(3), s->ZMM_S(3), - >sse_status), - >sse_status); +prod = float32_mul(d->ZMM_S(3), s->ZMM_S(3), >sse_status); +} else { +prod = float32_zero; } +iresult2 = float32_add(iresult2, prod, >sse_status); +iresult = float32_add(iresult, iresult2, >sse_status); + d->ZMM_S(0) = (mask & (1 
<< 0)) ? iresult : float32_zero; d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero; d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero; @@ -1968,13 +1972,12 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) { -float64 iresult = float64_zero; +float64 iresult; if (mask & (1 << 4)) { -iresult = float64_add(iresult, - float64_mul(d->ZMM_D(0), s->ZMM_D(0), - >sse_status), - >sse_status); +iresult = float64_mul(d->ZMM_D(0), s->ZMM_D(0), >sse_status); +} else { +iresult = float64_zero; } if (mask & (1 << 5)) { iresult = float64_add(iresult, -- 2.36.0
[PATCH v2 01/42] i386: pcmpestr 64-bit sign extension bug
The abs1 function in ops_sse.h only works correctly when the result fits in a signed int. This is fine most of the time because we're only dealing with byte sized values. However pcmp_elen helper function uses abs1 to calculate the absolute value of a cpu register. This incorrectly truncates to 32 bits, and will give the wrong answer for the most negative value. Fix by open coding the saturation check before taking the absolute value. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 20 +--- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index e4d74b814a..535440f882 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2011,25 +2011,23 @@ SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl) { -int val; +target_long val, limit; /* Presence of REX.W is indicated by a bit higher than 7 set */ if (ctrl >> 8) { -val = abs1((int64_t)env->regs[reg]); +val = (target_long)env->regs[reg]; } else { -val = abs1((int32_t)env->regs[reg]); +val = (int32_t)env->regs[reg]; } - if (ctrl & 1) { -if (val > 8) { -return 8; -} +limit = 8; } else { -if (val > 16) { -return 16; -} +limit = 16; } -return val; +if ((val > limit) || (val < -limit)) { +return limit; +} +return abs1(val); } static inline int pcmp_ilen(Reg *r, uint8_t ctrl) -- 2.36.0
[PATCH v2 09/42] i386: Helper macro for 256 bit AVX helpers
Once all the code is in place, 256 bit vector helpers will be generated by including ops_sse.h a third time with SHIFT=2. The first bit of support for this is to define a YMM_ONLY macro for code that only applies to 256 bit vectors. XMM_ONLY code will be executed for both 128 and 256 bit vectors. Signed-off-by: Paul Brook --- target/i386/ops_sse.h| 8 target/i386/ops_sse_header.h | 4 2 files changed, 12 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index a5a48a20f6..23daab6b50 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -24,6 +24,7 @@ #define Reg MMXReg #define SIZE 8 #define XMM_ONLY(...) +#define YMM_ONLY(...) #define B(n) MMX_B(n) #define W(n) MMX_W(n) #define L(n) MMX_L(n) @@ -37,7 +38,13 @@ #define W(n) ZMM_W(n) #define L(n) ZMM_L(n) #define Q(n) ZMM_Q(n) +#if SHIFT == 1 #define SUFFIX _xmm +#define YMM_ONLY(...) +#else +#define SUFFIX _ymm +#define YMM_ONLY(...) __VA_ARGS__ +#endif #endif /* @@ -2337,6 +2344,7 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #undef SHIFT #undef XMM_ONLY +#undef YMM_ONLY #undef Reg #undef B #undef W diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index cef28f2aae..7e7f2cee2a 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -21,7 +21,11 @@ #define SUFFIX _mmx #else #define Reg ZMMReg +#if SHIFT == 1 #define SUFFIX _xmm +#else +#define SUFFIX _ymm +#endif #endif #define dh_alias_Reg ptr -- 2.36.0
Re: [PATCH 2/4] TCG support for AVX
On Wed, 2022-04-20 at 16:19 +0200, Paolo Bonzini wrote: > On 4/18/22 21:45, Paul Brook wrote: > > > Massively too large for a single patch, I'm afraid. This needs > > > to be split, probably into at least twenty patches, which each > > > are a reviewable chunk of code that does one coherent thing. > > Hmm, I'mm see what I can do. > > > > Unfortunately the table driven decoding means that going from two > > to > > three operands tends to be a bit all or nothing just to get the > > thing > > to compile. > > Hi Paul, welcome back and thanks for this huge work. It should be > possible at least to split the patch as follows (at least that's > what _I_ would do in order to review it): > [snip] Ok, that sounds like a reasonable start. > I can do some of the work too since I was planning to do this > anyway (but have hardly started yet). I'll push my changes to https://github.com/pbrook/qemu . This is a personal project, so I'll be working on it as and when. If you have additional comments/suggestions on the approach taken then I'd be happy to hear them. Paul
Re: [PATCH 2/4] TCG support for AVX
On Mon, 2022-04-18 at 20:33 +0100, Peter Maydell wrote: > On Mon, 18 Apr 2022 at 18:48, Paul Brook wrote: > > > > Add TCG translation of guest AVX/AVX2 instructions > > This comprises: > > > > Massively too large for a single patch, I'm afraid. This needs > to be split, probably into at least twenty patches, which each > are a reviewable chunk of code that does one coherent thing. Hmm, I'mm see what I can do. Unfortunately the table driven decoding means that going from two to three operands tends to be a bit all or nothing just to get the thing to compile. Paul
[PATCH 3/4] Enable all x86-64 cpu features in user mode
We don't have any migration concerns for usermode emulation, so we may as well enable all available CPU features by default. Signed-off-by: Paul Brook --- linux-user/x86_64/target_elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux-user/x86_64/target_elf.h b/linux-user/x86_64/target_elf.h index 7b76a90de8..3f628f8d66 100644 --- a/linux-user/x86_64/target_elf.h +++ b/linux-user/x86_64/target_elf.h @@ -9,6 +9,6 @@ #define X86_64_TARGET_ELF_H static inline const char *cpu_get_model(uint32_t eflags) { -return "qemu64"; +return "max"; } #endif -- 2.35.2
[PATCH 1/4] Add AVX_EN hflag
Add a new hflag bit to determine whether AVX instructions are allowed Signed-off-by: Paul Brook --- target/i386/cpu.h| 3 +++ target/i386/helper.c | 12 target/i386/tcg/fpu_helper.c | 1 + 3 files changed, 16 insertions(+) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 982c532353..0c7162e2fd 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -168,6 +168,7 @@ typedef enum X86Seg { #define HF_MPX_EN_SHIFT 25 /* MPX Enabled (CR4+XCR0+BNDCFGx) */ #define HF_MPX_IU_SHIFT 26 /* BND registers in-use */ #define HF_UMIP_SHIFT 27 /* CR4.UMIP */ +#define HF_AVX_EN_SHIFT 28 /* AVX Enabled (CR4+XCR0) */ #define HF_CPL_MASK (3 << HF_CPL_SHIFT) #define HF_INHIBIT_IRQ_MASK (1 << HF_INHIBIT_IRQ_SHIFT) @@ -194,6 +195,7 @@ typedef enum X86Seg { #define HF_MPX_EN_MASK (1 << HF_MPX_EN_SHIFT) #define HF_MPX_IU_MASK (1 << HF_MPX_IU_SHIFT) #define HF_UMIP_MASK (1 << HF_UMIP_SHIFT) +#define HF_AVX_EN_MASK (1 << HF_AVX_EN_SHIFT) /* hflags2 */ @@ -2045,6 +2047,7 @@ void host_cpuid(uint32_t function, uint32_t count, /* helper.c */ void x86_cpu_set_a20(X86CPU *cpu, int a20_state); +void cpu_sync_avx_hflag(CPUX86State *env); #ifndef CONFIG_USER_ONLY static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs) diff --git a/target/i386/helper.c b/target/i386/helper.c index fa409e9c44..30083c9cff 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -29,6 +29,17 @@ #endif #include "qemu/log.h" +void cpu_sync_avx_hflag(CPUX86State *env) +{ +if ((env->cr[4] & CR4_OSXSAVE_MASK) +&& (env->xcr0 & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) +== (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) { +env->hflags |= HF_AVX_EN_MASK; +} else{ +env->hflags &= ~HF_AVX_EN_MASK; +} +} + void cpu_sync_bndcs_hflags(CPUX86State *env) { uint32_t hflags = env->hflags; @@ -209,6 +220,7 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4) env->hflags = hflags; cpu_sync_bndcs_hflags(env); +cpu_sync_avx_hflag(env); } #if !defined(CONFIG_USER_ONLY) diff --git a/target/i386/tcg/fpu_helper.c 
b/target/i386/tcg/fpu_helper.c index ebf5e73df9..b391b69635 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -2943,6 +2943,7 @@ void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask) env->xcr0 = mask; cpu_sync_bndcs_hflags(env); +cpu_sync_avx_hflag(env); return; do_gpf: -- 2.35.2
[PATCH 0/3] AVX guest implementation
Patch series to implement AXV/AVX2 guest support in TCG. All the system level code for this (cpid, xsave, wider registers, etc) already exists, we just need to implement the instruction translation. The majority of the new 256-bit operations operate on each 128-bit "lane" independently, so in theory we could use a single set of 128-bit helpers to implement both widths piecemeal. However this would further complicate the already over-long gen_sse function. Instead I chose to generate a whole new set of 256 bit "ymm" helpers using the framework already in place for 64/128 bit mm/xmm operations. I've included the tests I used during development to the linux-user testsuite, and also ran these manually inside a debian x86-64 guest. Appologies for the big patch, but I can't think of a good way to split the bulk of the instruction translation. Paul Brook (4): Add AVX_EN hflag TCG support for AVX Enable all x86-64 cpu features in user mode AVX tests linux-user/x86_64/target_elf.h |2 +- target/i386/cpu.c |8 +- target/i386/cpu.h |3 + target/i386/helper.c | 12 + target/i386/helper.h |2 + target/i386/ops_sse.h | 2606 +- target/i386/ops_sse_header.h | 364 ++- target/i386/tcg/fpu_helper.c |4 + target/i386/tcg/translate.c| 1902 ++--- tests/tcg/i386/Makefile.target | 10 +- tests/tcg/i386/README |9 + tests/tcg/i386/test-avx.c | 347 +++ tests/tcg/i386/test-avx.py | 352 +++ tests/tcg/i386/x86.csv | 4658 14 files changed, 8988 insertions(+), 1291 deletions(-) create mode 100644 tests/tcg/i386/test-avx.c create mode 100755 tests/tcg/i386/test-avx.py create mode 100644 tests/tcg/i386/x86.csv -- 2.35.2
[PATCH] linux-user: Fix inotify on aarch64
The inotify implementation originally called the raw host syscalls. Commit 3b3f24add0 changed this to use the glibc wrappers. However ifdefs in syscall.c still test for presence of the raw syscalls. This causes a problem on e.g. aarch64 hosts which never had the inotify_init syscall - it had been obsoleted by inotify_init1 before aarch64 was invented! However it does have a perfectly good glibc implementation of inotify_wait. Fix this by removing all the raw __NR_inotify_* tests, and instead check CONFIG_INOTIFY, which already tests for the glibc functionality we use. Also remove the now-pointless sys_inotify* wrappers. Tested using x86-64 inotifywatch on aarch64 host, and vice-versa Signed-off-by: Paul Brook --- linux-user/fd-trans.c | 5 ++--- linux-user/syscall.c | 50 +-- 2 files changed, 12 insertions(+), 43 deletions(-) diff --git a/linux-user/fd-trans.c b/linux-user/fd-trans.c index 6941089959..30e7b49112 100644 --- a/linux-user/fd-trans.c +++ b/linux-user/fd-trans.c @@ -1460,9 +1460,8 @@ TargetFdTrans target_eventfd_trans = { .target_to_host_data = swap_data_eventfd, }; -#if (defined(TARGET_NR_inotify_init) && defined(__NR_inotify_init)) || \ -(defined(CONFIG_INOTIFY1) && defined(TARGET_NR_inotify_init1) && \ - defined(__NR_inotify_init1)) +#if defined(CONFIG_INOTIFY) && (defined(TARGET_NR_inotify_init) || \ +defined(TARGET_NR_inotify_init1)) static abi_long host_to_target_data_inotify(void *buf, size_t len) { struct inotify_event *ev; diff --git a/linux-user/syscall.c b/linux-user/syscall.c index 56a3e17183..17cc38fe34 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -272,9 +272,6 @@ static type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5, \ #if defined(__NR_futex_time64) # define __NR_sys_futex_time64 __NR_futex_time64 #endif -#define __NR_sys_inotify_init __NR_inotify_init -#define __NR_sys_inotify_add_watch __NR_inotify_add_watch -#define __NR_sys_inotify_rm_watch __NR_inotify_rm_watch #define __NR_sys_statx __NR_statx #if 
defined(__alpha__) || defined(__x86_64__) || defined(__s390x__) @@ -447,33 +444,6 @@ static int sys_renameat2(int oldfd, const char *old, #ifdef CONFIG_INOTIFY #include - -#if defined(TARGET_NR_inotify_init) && defined(__NR_inotify_init) -static int sys_inotify_init(void) -{ - return (inotify_init()); -} -#endif -#if defined(TARGET_NR_inotify_add_watch) && defined(__NR_inotify_add_watch) -static int sys_inotify_add_watch(int fd,const char *pathname, int32_t mask) -{ - return (inotify_add_watch(fd, pathname, mask)); -} -#endif -#if defined(TARGET_NR_inotify_rm_watch) && defined(__NR_inotify_rm_watch) -static int sys_inotify_rm_watch(int fd, int32_t wd) -{ - return (inotify_rm_watch(fd, wd)); -} -#endif -#ifdef CONFIG_INOTIFY1 -#if defined(TARGET_NR_inotify_init1) && defined(__NR_inotify_init1) -static int sys_inotify_init1(int flags) -{ - return (inotify_init1(flags)); -} -#endif -#endif #else /* Userspace can usually survive runtime without inotify */ #undef TARGET_NR_inotify_init @@ -12263,35 +12233,35 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1, case TARGET_NR_futex_time64: return do_futex_time64(cpu, arg1, arg2, arg3, arg4, arg5, arg6); #endif -#if defined(TARGET_NR_inotify_init) && defined(__NR_inotify_init) +#ifdef CONFIG_INOTIFY +#if defined(TARGET_NR_inotify_init) case TARGET_NR_inotify_init: -ret = get_errno(sys_inotify_init()); +ret = get_errno(inotify_init()); if (ret >= 0) { fd_trans_register(ret, _inotify_trans); } return ret; #endif -#ifdef CONFIG_INOTIFY1 -#if defined(TARGET_NR_inotify_init1) && defined(__NR_inotify_init1) +#if defined(TARGET_NR_inotify_init1) && defined(CONFIG_INOTIFY1) case TARGET_NR_inotify_init1: -ret = get_errno(sys_inotify_init1(target_to_host_bitmask(arg1, +ret = get_errno(inotify_init1(target_to_host_bitmask(arg1, fcntl_flags_tbl))); if (ret >= 0) { fd_trans_register(ret, _inotify_trans); } return ret; #endif -#endif -#if defined(TARGET_NR_inotify_add_watch) && defined(__NR_inotify_add_watch) +#if 
defined(TARGET_NR_inotify_add_watch) case TARGET_NR_inotify_add_watch: p = lock_user_string(arg2); -ret = get_errno(sys_inotify_add_watch(arg1, path(p), arg3)); +ret = get_errno(inotify_add_watch(arg1, path(p), arg3)); unlock_user(p, arg2, 0); return ret; #endif -#if defined(TARGET_NR_inotify_rm_watch) && defined(__NR_inotify_rm_watch) +#if defined(TARGET_NR_inotify_rm_watch) case TARGET_NR_inotify_rm_watch: -return get_errno(sys_inotify_rm_watch(arg1, arg2)); +return get_errno(inotify_rm_watch(arg1, arg2)); +#endif #endif #if defined(TARGET_NR_mq_open) && defined(__NR_mq_open) -- 2.34.1
Re: [Qemu-devel] [PATCH] softfloat: rebase to version 2a
The license of SoftFloat-2b is claimed to be GPLv2 incompatible by the FSF due to an indemnification clause. The previous release, SoftFloat-2a, did not contain this clause. The only changes between these two versions as far as QEMU is concerned is the license change and a global modification of the comment structure. This patch rebases our softfloat code to SoftFloat-2a in order to have a GPLv2 compatible license. Acked-by: Paul Brook p...@codesourcery.com
Re: [Qemu-devel] [PATCH v2 00/11] Fix versatile_pci (now without breaking linux)
This patch series fixes a number of serious bugs in our emulation of the PCI controller found on VersatilePB and the early Realview boards: * our interrupt mapping was totally wrong * the I/O window wasn't mapped on VersatilePB FWIW the documentation available at the time I implemented the VersatilePB did not include the IO region. The PCI interrupt routing still seems to be missing from the docs. Acked-by: Paul Brook p...@codesourcery.com [Ignoring any issues with the backwards compatibility hacks]
Re: [Qemu-devel] [PATCH 4/4] target-arm: always set endian bits in big-endian mode
On 03/01/2013 09:58 PM, Paul Brook wrote: +#ifdef TARGET_WORDS_BIGENDIAN +if (arm_feature(env, ARM_FEATURE_V6) +|| arm_feature(env, ARM_FEATURE_V7)) { +/* IE and EE bits stay set for big-endian */ +env-cp15.c1_sys |= (1 31) | (1 25); +} +#endif This is wrong for all the CPUs QEMU currently supports. SCTLR.IE is defined to be zero. Again I'd like to have more information. Why is it wrong to set IE when we are in big-endian? The ARM architecture defines two big-endian modes. In BE8 mode only data accesses are big-endian, code fetches are still little-endian. In BE32 mode both code and data are big-endian. In theory a fourth mode (big-endian code, little-endian data) exists, though I've never seen that used. All the v7 cores QEMU currently supports[1] only implement BE8 mode. The IE bit is reserved and must be zero. Usermode emulation implements both, but the privileged cp15 registers can safely be ignored there. Paul [1] Except maybe the M profile cores, but they use a different system model anyway.
Re: [Qemu-devel] [PATCH 3/4] target-arm: Fix VFP register byte order in GDB remote
The bytes with the register are transmitted in target byte order. /* Aliases for Q regs. */ nregs += 16; if (reg nregs) { -stfq_le_p(buf, env-vfp.regs[(reg - 32) * 2]); -stfq_le_p(buf + 8, env-vfp.regs[(reg - 32) * 2 + 1]); +stfq_p(buf, env-vfp.regs[(reg - 32) * 2]); +stfq_p(buf + 8, env-vfp.regs[(reg - 32) * 2 + 1]); This is wrong. You're still using little-endian ordering of words. Can you explain a little bit further? If I'm in big-endian mode, stfq_p() will be stfq_be_p(), right? Because we're actually storing two halves of a 128-bit value. You still store the least significant half first. Paul
Re: [Qemu-devel] [PATCH 3/4] target-arm: Fix VFP register byte order in GDB remote
From GDB Remote Serial Protocol doc: The bytes with the register are transmitted in target byte order. /* Aliases for Q regs. */ nregs += 16; if (reg nregs) { -stfq_le_p(buf, env-vfp.regs[(reg - 32) * 2]); -stfq_le_p(buf + 8, env-vfp.regs[(reg - 32) * 2 + 1]); +stfq_p(buf, env-vfp.regs[(reg - 32) * 2]); +stfq_p(buf + 8, env-vfp.regs[(reg - 32) * 2 + 1]); This is wrong. You're still using little-endian ordering of words. Paul
Re: [Qemu-devel] [PATCH 4/4] target-arm: always set endian bits in big-endian mode
+#ifdef TARGET_WORDS_BIGENDIAN +if (arm_feature(env, ARM_FEATURE_V6) +|| arm_feature(env, ARM_FEATURE_V7)) { +/* IE and EE bits stay set for big-endian */ +env-cp15.c1_sys |= (1 31) | (1 25); +} +#endif This is wrong for all the CPUs QEMU currently supports. SCTLR.IE is defined to be zero. Paul
Re: [Qemu-devel] [ARM] Cortex-R4F and VFP3-D16
Probably what you'll want is to have a separate feature bit for 32 dregs which is set by default for vfpv3, and then use that in VFP_DREG rather than the vfpv3 feature bit. Right, it might be easier than I thought. Maybe add an ARM_FEATURE_VFP3_D16 and do: #define VFP_DREG(reg, insn, bigbit, smallbit) do { \ if (arm_feature(env, ARM_FEATURE_VFP3) \ !arm_feature(env, ARM_FEATURE_VFP3_D16)) { \ There's no need to check both flags. I've got a patch to implement this as a side-effect of a different feature, I'll look at pushing it out. Paul
Re: [Qemu-devel] [PATCH 4/6] Handle CPU interrupts by inline checking of a flag
@@ -100,6 +102,7 @@ struct CPUState { bool stop; bool stopped; volatile sig_atomic_t exit_request; +volatile sig_atomic_t tcg_exit_req; Do we really need another variable/check? It seems like this should be at least partially redundant with the existing icount code. I have a similar patch to that effect. Paul
Re: [Qemu-devel] [PATCH 24028/24028] Evaluate breakpoint condition on target.
In addition to the comments others made about patch formatting, etc: +/* conditional breakpoint evaluation on target*/ +pstrcat(buf, sizeof(buf), ;ConditionalBreakpoints+); I'm pretty sure this is a lie for most targets, given later on we have: +#if defined(TARGET_ARM) +cpu_get_reg_var_func = cpu_get_reg_var_arm; +#else +cpu_get_reg_var_func = 0; +#endif +for (i = 0 ; i bp_cond_len ; i++) { +if (!isxdigit(*p) || !isxdigit(*(p + 1))) { +bp_cond_len = 0 ; +g_free(bp_cond_expr); +bp_cond_expr = NULL; +perror(Error in breakpoint condition); perror is the wrong way to report a malformed gdb command. +#if TARGET_LONG_SIZE == 4 +typedef float target_double; +#else /* TARGET_LONG_SIZE == 8 */ +typedef double target_double; +#endif This clearly has nothing to do with the target double precision floating point type. +int qemu_rw_debug_flag; This appears to be a write-only variable. +#define BP_AGENT_MAX_COND_SIZE 1024 By my reading this isn't the maximim size, it's the maximum stack depth. +void cpu_get_reg_var_arm(TCGv var, int reg) +{ +tcg_gen_mov_i32(var, cpu_R[reg]); +} Looks like it will break horribly when the user requests anything other than r0-r15. And r15 is probably also wrong. +bswap16(val); Clearly wrong. +fprintf(stderr, +GDB agent: const 64 is not supported for 32 bit This is not a good way to report user errors. Several other occurances. +static target_long bp_agent_get_arg(const uint8_t *cond_exp, ... +case 4: +default: I'd be amazed if this default case is correct. +/*for case error , ex.buffer overloading - + need to set labels anyway in order to avoid segmentation fault */ Sounds like you're failing to check for errors somewhere else.
Re: [Qemu-devel] [PATCH V2 2/6] hw/mdio: Generalize etraxfs MDIO bitbanging emulation (fwd)
To be able to create generic GPIO devices or other devices that have GPIO like pins (e.g MDIO), and hook those up to external buses through common frameworks, we need agreement on how to model tristate pins. A tristate pin model, or at least agreement on how to model these with multiple qemu_irqs. hmm, feels like we've opened a can of worms... Probably. I'm not going to insist you use/implement generic GPIO for MDIO, but I still think separation between the PHY register interface and the bitbang code is good (i.e. same as bitbang_i2c). Anyway, how would such a qemu_tristate_pin be modelled? [point 1 moved later so my answers read in a sensible order] 2. Every connection point provides an output/value and an output_enable. I think we'd be better providing a single state i.e. an output of 0, 1 or Z. Possibly additional Z0/Z1 states to represent high-impedance with pull-up/down resistors. 3. There is a mean for reading the pin value, which is computed based on all connection points outputs and output_enables (can be cached). For MDIO being able to read the value is sufficient. However in general we don't want to have to poll it. We want to be told when it changes. 4. The pin value can be invalid (multiple drivers or no drivers), 0 or 1. I can't think of any cases where this is important. In most cases it's undefined, in the rest it causes physical damage. 1. It's not point-to-point, has an arbitrary nr of connection points. QoM currently only does asymmetric 1-1 connections between objects. However I don't think this is a fatal problem. We can still retain an asymmetric API (effectively equivalent to male and female physical connectors), adding virtual wire objects where they don't match up. It should be possible to implement this as a backward compatible extension to qemu_irq[1]. In most cases the additional wire should not be needed. For simple output-input (i.e. all existing code) we just need to ignore Z states. Preferably before they get to the input device. 
For simple bidirectional point-point lines (which should include bitbang-i2c and bitbang-mdio) the bitbang object controls the value when subject to a Z output. For arbitrary pin connections they all connect to a set of ports on a virtual wire device. It takes care of arbitrating line state and sending notifications to the connected devices. There are a couple of technical issues: Fristly qemu_irq is currently stateless[2]. Giving it state is fine in principle, but means a lot of load/save code needs fixing. In pactice we can probably avoid this, but there are some nice benefits from keeping state in qemu_irq. Secondly, the [parent of the] qemu_irq object needs to be able to signal value changes to the object on the other side of the link. Currently QoM allows a property to be linked to an object, but provides no way for the object to identify/communicate with the property/device linked to it. Paul [1] I've no particular attachment to the name qemu_irq. But I really don't want to have to make anything other than purely mechanical changes to all its users. [2] More precicely it has no state that changes over its lifetime.
Re: [Qemu-devel] [PATCH V2 6/6] hw/mdio: Use bitbang core for smc91c111 network device
@@ -44,6 +45,10 @@ typedef struct { uint8_t int_level; uint8_t int_mask; MemoryRegion mmio; + +/* MDIO bus and the attached phy */ +struct qemu_mdio mdio_bus; +struct qemu_phy phy; } smc91c111_state; static const VMStateDescription vmstate_smc91c111 = { @@ -71,6 +76,8 @@ static const VMStateDescription vmstate_smc91c111 = { VMSTATE_BUFFER_UNSAFE(data, smc91c111_state, 0, NUM_PACKETS * 2048), VMSTATE_UINT8(int_level, smc91c111_state), VMSTATE_UINT8(int_mask, smc91c111_state), +VMSTATE_MDIO(mdio_bus, smc91c111_state), +VMSTATE_MDIO_PHY(phy, smc91c111_state), VMSTATE_END_OF_LIST() } }; @@ -754,6 +768,9 @@ static int smc91c111_init1(SysBusDevice *dev) s-nic = qemu_new_nic(net_smc91c111_info, s-conf, object_get_typename(OBJECT(dev)), dev-qdev.id, s); qemu_format_nic_info_str(s-nic-nc, s-conf.macaddr.a); + +tdk_init(s-phy); +mdio_attach(s-mdio_bus, s-phy, 0); /* ??? Save/restore. */ return 0; } There's no reason for smc91c111_state to contain the PHY state. For devices with an off-chip PHY we have no way of knowing which phy is used, or what state is required. The PHY should be a device in its own right, and know how to save/restore itself. smc91c111_init1 should create the PHY, attach it to the MDIO bus, then forget about it. Paul
Re: [Qemu-devel] [PATCH v2 19/20] arm: add Faraday FTKBC010 support for A369
From: Kuo-Jung Su dant...@faraday-tech.com Faraday keyboard/mouse controller (FTKBC010) is compliant with the IBM PS/2 interface. Your description doesn't appear to match the code at all. Surely if this were true then you should be using the existing PS2 keyboard emulation. Paul
Re: [Qemu-devel] [PATCH v2 20/20] arm: add generic ROM model for Faraday SoC platforms
Since the NAND and SPI flash memories do not support random access, so most of the systems which use such memory as main storages usually has some bootstrap code stored inside the embedded ROM of its SoC, and the bootstrap code is responsible for SDRAM initialization and then load the specific software(i.e. u-boot/linux) into SDRAM, and finally jumps into the loaded primary software. No. For a start the block device you're using is for parallel flash devices, which are directly mapped. This contradicts your description which talks about serial flash. Please look at how other boards work. There are already mechanisms for creating rom areas, or preloading images into ram. Paul
Re: [Qemu-devel] [PATCH v2 03/20] arm: add Faraday FTAPBBRG020 APB DMA support
The FTAPBBRG020 supports the DMA functions for the AHB-to-AHB, AHB-to-APB, APB-to-AHB, and APB-to-APB transactions. All the timer code in this file looks suspect. As a general rule everything should be event driven and complete immediately (or at least schedule a BH for immediate action if recursion is a concern), not relying on periodic timer interrupts. +qemu_mod_timer(s-qtimer, +qemu_get_clock_ns(vm_clock) + 1); For all practical purposes this is going to happen immediately, so you should not be using a timer. +qemu_mod_timer(s-qtimer, +qemu_get_clock_ns(vm_clock) + (get_ticks_per_sec() 2)); Why 0.25 seconds? Usually this sort of try-again-soon behavior means you've missed a trigger event somewhere else. +if (!cpu_physical_memory_is_io(c-src)) { +src_map = src_ptr = cpu_physical_memory_map(c-src, src_len, 0); +} +if (!cpu_physical_memory_is_io(c-dst)) { +dst_map = dst_ptr = cpu_physical_memory_map(c-dst, dst_len, 1); +} cpu_physical_memory_map might not map the whole region you requested. This will cause badness in the subsequent code. I suspect a lot of this code can and should be shared with your other DMA controller, and probably several of the existing DMA controllers. Paul
Re: [Qemu-devel] [PATCH v2 05/20] arm: add Faraday FTGMAC100 1Gbps ethernet support
In order to reduce the processing load of the host CPU, the FTGMAC100 implements TCP, UDP, and IP V4 checksum generation and validation, and supports VLAN tagging. I see no evidence of these features in the code. +static void ftgmac100_read_desc(hwaddr addr, void *desc) +{ +int i; +uint32_t *p = desc; + +cpu_physical_memory_read(addr, desc, 16); + +for (i = 0; i 16; i += 4) { +*p = le32_to_cpu(*p); +} +} You're relying on the compiler choosing a particular bitfield and structure layout. Don't do that. Especially when one of the fields is a void*. Clearly never been tested on a 64-bit host. void *desc is just plain lazy. +buf = s-txbuff.buf + s-txbuff.len; +cpu_physical_memory_read(txd.buf, (uint8_t *)buf, txd.len); Buffer overflow. In at least two differnt ways. +if (!(s-maccr MACCR_HT_MULTI_EN)) { +printf([qemu] ftgmac100_receive: mcst filtered\n); +return -1; Looks like stray debug code. Several other occurences. +case REG_TXPD: +case REG_HPTXPD: +qemu_mod_timer(s-qtimer, qemu_get_clock_ns(vm_clock) + 1); Using a timer here is wrong. Either you should transmit immediately, or you should wait for something else to happen. Delaying by 1ns is never the right answer. Paul
Re: [Qemu-devel] [PATCH v2 06/20] arm: add Faraday FTMAC110 10/100Mbps ethernet support
The FTMAC110 is a high quality 10/100 Ethernet controller Which looks largely the same as the other ethernet controller you added in the previous patch. Paul
Re: [Qemu-devel] [PATCH v2 10/20] arm: add Faraday FTSDC010 MMC/SD controller support
+if (!(s-dcr DCR_WR) (s-datacnt 0)) { +ret = sd_read_data(s-card) +| sd_read_data(s-card) 8 +| sd_read_data(s-card) 16 +| sd_read_data(s-card) 24; +s-datacnt -= 4; +if (s-datacnt = 0) { +s-status |= STR_DAT_END; +} This will fail if datacnt is not a multiple of 4. Paul
Re: [Qemu-devel] [PATCH v2 14/20] arm: add Faraday FTRTC011 RTC timer support
+qemu_mod_timer(s-qtimer, +qemu_get_clock_ns(vm_clock) + get_ticks_per_sec()); This will not work reliably. You can not rely on timers triggering promptly. Plus you're losing the time taken to execute the callback every tick. Additionally you can calculate values on demand, and only trigger a timer tick when an interrupt is actually enabled. You don't need high precision, so use timer_ms rather than timer_ns. Paul
Re: [Qemu-devel] [PATCH V2 2/6] hw/mdio: Generalize etraxfs MDIO bitbanging emulation (fwd)
It also worries me that there isn't a clean separation between the MDIO bus and the bitbang interface. IMO the bitbang interface should be a separate device, and if we're wiring up bitbang interfaces then it really should be via standard GPIO pins (aka qemu_irq). Only the bitbang state machine is in the mdio layer. It says nothing about where those signals come from, gpio or otherwise. Not all cases will actually be GPIOs. For instance, the smc91c111 has dedicated pins for MDIO operations which are not GPIOs, even though the driver has to manage the bitbanging. There's no such thing as a dedicated pin managed by software. That's exactly what a GPIO pin is. It may be that particular pins are usually used for a particular purpose, but I don't think that is sufficient reason to create a whole new API. The way to solve that is to give the pins appropriate names. Don't be distracted by the fact that the smc91c111 is two devices (MAC and PHY) on the same chip. That said, I'm not opposed to changing the model if that is the design direction. However, I hope that the series won't be blocked on this point. This series moves and enhances existing code. A move to qemu_irq should be done as a follow-on patch. Maybe we should do it like the i2c framework? It does very similar things as mdio would need (with a nice split). It addresses Pauls comments (I think) and also the split between slaves and the bus. It also makes it possible to select PHY model from board code. Yes. Though on closer inspection the bitbang I2C module introduces bitbang_i2c_set, which I'd prefer to avoid. This isn't quite as easy as it should be because we don't have a nice solution for tristate pins (currently modelled as a cross-wired output and input pair). Paul
Re: [Qemu-devel] [PATCH V2 2/6] hw/mdio: Generalize etraxfs MDIO bitbanging emulation
+#ifdef USE_THIS_DEAD_CODE +void mdio_detach(struct qemu_mdio *bus, struct qemu_phy *phy, unsigned int addr) +{ +bus-devs[addr 0x1f] = NULL; +} +#endif This is clearly wrong. It also worries me that there isn't a clean separation between the MDIO bus and the bitbang interface. IMO the bitbang interface should be a separate device, and if we're wiring up bitbang interfaces then it really should be via standard GPIO pins (aka qemu_irq). Paul
Re: [Qemu-devel] [PATCH] Annotate questionable fallthroughs
diff --git a/disas/cris.c b/disas/cris.c +/* XXX: questionable fallthrough */ Inherited from binutils; if you want to clean this up, suggest to do it there. Except that upstream binutils is GPLv3, so this code is effectively orphaned. Paul
Re: [Qemu-devel] [PATCH] Annotate questionable fallthroughs
I don't think there's much point adding tons of XXX comments when a bunch of these aren't actually wrong code. If you want to fix this I think a better approach would be more focused patches aimed at adding 'break;' or /* fallthrough */ based on actual human examination of the surrounding code. I agree. I encourage annotation of intentional fall through, but blindly pasting the output of an automated tool is liable to cause more harm than good. IMO running code analysis tools is easy. It's only when you take the time to manually inspect and fix the code that this really becomes valuable. Paul
Re: [Qemu-devel] [PATCH] target-arm: add Faraday ARMv5TE processors support
* ARMv5TE series (FA606TE, FA626TE, FA616TE, FA726TE) All the single core RISC listed above are included in this patch. And there are two Faraday CP15 extensions (AUX and I/D-Scratchpad) have been implemented as NOP. Is a NOP appropriate? Should you at least read the value back? * Confidentiality Notice This electronic message and any attachments may contain confidential and legally privileged information or information which is otherwise protected from disclosure. If you are not the intended recipient,please do not disclose the contents, either in whole or in part, to anyone,and immediately delete the message and any attachments from your computer system and destroy all hard copies. Thank you for your cooperation. *** This sort of disclaimer is completely inappropriate for public mailing lists, and I'm unwilling to touch anything subject to these restrictions. As instructed I have deleted all your other email unread. Paul
Re: [Qemu-devel] [PATCH v1 3/4] hw: Deduce the default machine from the specified CPU model
This changes the driver behavior to choose the default machine model based on the CPU being used. Defaulting the machine this way makes it easier to use QEMU as an ISS by just specifying the -cpu option since a default machine that is suitable for emulating the full ISA can be chosen. For example, currently on ARM the ARM Integrator/CP board is chosen as the default machine when specifying just a CPU. However, this doesn't work well when passing -cpu cortex-m3 since on ARMv7-M processors the NVIC is a part of the architecture and is needed to support instructions like SVC. Personally I'd rather we didn't support a default machine at all, at least for ARM. It does matter what board you run on, so you need to specify. A possible compromise is to only accept -cpu if -M is also specified. Just to pick an obvious example, you can't stick a core which supports VFPv4 (the A15 is the only one we have) into the integratorcp Yes you can. Your OS probably doesn't support it, and you might have trouble persuading the OS vendor to support something that doesn't physically exist, but those are completely separate problems. We could reasonably add patches which made boards error out if you tried to use them with unsupported CPUs, I guess. That suffers from a large fuzzy region containing interesting combinations that could/do work, but will probably never be created in silicon. If done properly the QOM conversion should give you this for free. Paul
Re: [Qemu-devel] [PATCH v1 3/4] hw: Deduce the default machine from the specified CPU model
Just to pick an obvious example, you can't stick a core which supports VFPv4 (the A15 is the only one we have) into the integratorcp Yes you can. No you can't. integratorcp.c doesn't create the parts of the CPU which live in QEMU's 'a15mpcore_priv' device, so the resulting mess is liable to just fall over. If anybody reports bugs in QEMU in such a configuration I will tell them to go away and use a supported configuration instead. The A15 core itself will work just fine. The core is completely independent of the interrupt controller. Unlike the M profile cores where the NVIC is inherently part of the CPU exception handling mechanism. Paul
Re: [Qemu-devel] [PATCH 0/3] Drop default SD card creation
On 16 August 2012 15:11, Markus Armbruster arm...@redhat.com wrote: Peter Maydell peter.mayd...@linaro.org writes: As suggested in the recent discussion on Markus' patchset to suppress unused default drives, this patchset cleans up the omap and pxa2xx SD card controllers to behave like the other controllers: * the init function looks for the next IF_SD drive * if there isn't one, we start up as a controller with no card present Isn't this an incompatible change? Before, you get an SD card reader backed by an empty BDS default. You can load/unload cards in the monitor. After, you get an SD card reader that isn't backed by a BDS by default. Device models prepared for that can treat it as permanently empty. Hmm, yes, but most of our SD controllers already act that way. We should probably fix them all... So what's the block layer equivalent of drive_get_next() that always returns us something we can get a bdrv from? I think this may be the wrong way to fix this. SD cards don't really have removable media. In the same way that a SCSI HDD is generally not removable media - you hotplug the whole drive. Don't we really want a proper QOM device for the SD card, with hotplug support? Paul
Re: [Qemu-devel] [PATCH 0/3] Drop default SD card creation
On 16 August 2012 16:17, Markus Armbruster arm...@redhat.com wrote: Paul Brook p...@codesourcery.com writes: I think this may be the wrong way to fix this. SD cards aren't really have removable media. In the same way that a SCSI HDD are generally not removable media - you hotplug the whole drive. If an SD card device doesn't support media change, then the device model should: 1. Insist on non-null, non-empty BDS on initialization (this ensures we got media) This seems to be trying to draw a distinction that I don't understand. The SD card *is* the media, it's the physical object you stuff in and out of the slot on the side of your device. It's the difference between not present and present but empty. In the case of an SD card the media (i.e. flash) is generally not seperable from the SD device - I don't remember if the SD spec even supports removable media. The same is true for most hard disks - the disk platters are an integral part of the drive. In these cases the present but empty state does not exist. c.f. cdrom drives where the concept of an empty device is clearly very different to an absent device. I guess that that means that change SD card should ideally be modelled as destroy the sd.c device object and create a new one and reconnect it to the controller but we don't really model things quite in the right way to permit that, so we fake it up at the moment by allowing the underlying BDS to change its idea of media. This works except that if the initial state is no card present we have a NULL BDS rather than one which is non-NULL but has no media at the moment. (I think Paul is suggesting that we should fix our model to move closer to this idea rather than faking things...) I think we have two options: A) Model the SD slot and card explicitly as separate objects. Effectively the same way we have a scsi bus with scsi drives connected to it. Cards can be hotplugged. A card has a block device that is not optional, and not removable. 
I don't know how well our UI handles this. It may well require user-visible changes. B) Continue to effectively model just the SD slot, with the card being implicit. The slot should always create/find a [removable] block device. An empty block device is modelled as an absent card. A slot without a block device is IMO a bug. This can create awkwardness because there's no good way to expose card specific properties (we don't currently implement any interesting ones). These should really be per-card, i.e. may change when you change the contents. However the only thing we have to attach them to is the long-lived slot object. e.g. in some cases data may be either an SD or an SDHC card. We currently make a guess. The only place to attach a user override is the SD slot, and that must be determined at machine creation, not when you associate data with the block device. Paul
Re: [Qemu-devel] [PATCH 0/3] Drop default SD card creation
One way is to treat the SD card as a hot-pluggable device. A card reader device model provides a connector for the SD card device model. The SD card device model is backed by a block backend, with non-removable medium. Card change is device hot plug. ... Note that we could model floppies and CD-ROMs that way, too. That's a good point. e.g. for a cdrom I'm pretty sure there's a bit somewhere that tells you whether it's a pressed cd or a cd-r. Attaching this information to a cdrom-disk device (hotplugged into a cdrom-drive) seems to make sense. Paul
Re: [Qemu-devel] [PATCH 00/23] Suppress unused default drives
*can* use it for something entirely else, if=sd notwithstanding: (qemu) device_add lsi (qemu) device_add scsi-cd,drive=sd0 If/when we get a PCI SD card controller model, would all the PCI using machines need to be added to take the 'no default sd card' setting out again, or does it get overridden anyway if you say and I'd like an sd controller? For SD cards we shouldn't need this to start with. Why are we creating SD cards when there's no host controller to connect them to? Surely we should be able to figure that out automatically. Especially important for board variants with multiple SD interfaces. Is this all a hangover from before we had proper -drive options? Paul
[Qemu-devel] [PATCH] Fix ALSA configure check
Recent gcc notice that the ALSA configure check uses an uninitialized variable, causing spurious failures. Adjust the testcase to avoid this. Signed-off-by: Paul Brook p...@codesourcery.com --- configure |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index c65b5f6..9152798 100755 --- a/configure +++ b/configure @@ -1890,7 +1890,7 @@ for drv in $audio_drv_list; do case $drv in alsa) audio_drv_probe $drv alsa/asoundlib.h -lasound \ -snd_pcm_t **handle; return snd_pcm_close(*handle); +snd_pcm_t *handle = NULL; return snd_pcm_close(handle); libs_softmmu=-lasound $libs_softmmu ;; -- 1.7.10.4
[Qemu-devel] [PATCH] target-arm: Fix CP15 based WFI
The coprocessor register rework broke cp15 based WFI instructions. We incorrectly fall through the normal register write case, which incorrectly adds a forced block termination. We've already done a special version of this (DISAS_WFI), so return immediately. Signed-off-by: Paul Brook p...@codesourcery.com --- target-arm/translate.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index a2a0ecd..f39b9ca 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -6236,7 +6236,7 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn) } gen_set_pc_im(s-pc); s-is_jmp = DISAS_WFI; -break; +return 0; default: break; } -- 1.7.10
Re: [Qemu-devel] [RFC PATCH 0/4] virtio-rng and RngBackend infrastructure (v2)
This series depends on my QOM -object series that I just posted. In Amit's thread on virtio-rng, danpb mentioned that we really ought to have a proper RNG backend infrastructure and of course he's correct on that. Now that we have QOM, I wanted to demonstrate how we can use QOM to construct a complete backend without adding any new infrastructure. I've now implemented a urandom and egd backend and tested them. I think the first three patches are ready to go. I never really understood why this exists in the first place. It's a simple readonly character device. IMHO you should be using virtio-serial. This is virtio-console v.s. virtio-serial all over again. The only thing close to a reason I've heard is that guest OS is incompetent and can't source random data from a serial device. Even accepting the pointless guest device, I see absolutely no reason to have special infrastructure for this within qemu. Character devices do everything you need. Creating another read stream of data API is needless duplication and only going to reintroduce bugs we already fixed in the character device layer. Paul
Re: [Qemu-devel] [RFC PATCH 1/1] linux-user: Probe the guest base for shared objects when needed
openSUSE uses a version patched so that IIUC 3G are reserved. Just today this failed on a system where swap got disabled and the mmap() thus failed. Err... why? We map with MAP_NORESERVE, so swap shouldn't matter... I can't say if it's the same cause, but we fail with ulimit -v 4046848. Incidentally, it seems a strange that we only reserve 0xf700 bytes, not the full 4G. Paul
Re: [Qemu-devel] [RFC PATCH 1/1] linux-user: Probe the guest base for shared objects when needed
On 28.06.2012, at 02:06, Paul Brook wrote: openSUSE uses a version patched so that IIUC 3G are reserved. Just today this failed on a system where swap got disabled and the mmap() thus failed. Err... why? We map with MAP_NORESERVE, so swap shouldn't matter... I can't say if it's the same cause, but we fail with ulimit -v 4046848. Incidentally, it seems a strange that we only reserve 0xf700 bytes, not the full 4G. Uh, I think that was because of the vdso shared page that is allocated on top of -R. That can't be right. The whole point of -R is that it defines all the guest accessible virtual address space. The surrounding space is liable to be used by something else, and we must not make any assumptions about it. Further inspection shows that guest_validate_base contains some extremely bogus code. If the guest needs something at the top of its address space then we need to offset address zero within the block, and ensure accesses wrap appropriately. Paul
Re: [Qemu-devel] [RFC PATCH 1/1] linux-user: Probe the guest base for shared objects when needed
'guest_validate_base' is currently called for three reasons: (1) in main.c when using -B, (2) in main.c when using -R after mapping the reserved va region, and (3) and when probing for a guest base in probe_guest_base. For case (1) I suppose things are pretty much the same -- we just need to map the extra region when needed (e.g. for the ARM kernel helpers). Yes. For case (2) maybe we can do a probing similar to what I mentioned here [1], but taking into account what you stated above and ensuring that the probing finds a single region for the request va region size and any needed extra stuff. Something like that, yes. I suspect there are better ways to implement it though. In principle your patch is making (2) a variant of (3). Instead of probing for the segments covered by the image we probe for the reserved regions (e.g. for ARM [0-reserved_va, 0x - 0x]). A good implementation should automagically DTRT for both 32-bit and 64-bit hosts. Case (3) is mostly the same as (2) but we are probing for a guest base with a region size deduced from looking at the image we are loading. I suppose it is still OK to map two regions here. The single region only applies to -R? I'd say (3) is more similar to (1). There's no fundamental reason why -R has to allocate a single block. In all cases we should be checking the same thing - are the addresses we need available on the host? Having different code paths calling guest_validate_base, etc. for different reasons makes me think we're doing it wrong :-) Paul
Re: [Qemu-devel] [RFC] QOMification of AXI stream
I'm looking at QOMifying and refactoring the AXI stream interfaces between the AXI ethernet and AXI DMA modules. I could use some guidance on how to do this as I can think of about 6 different solutions. Sources are hw/xilinx_axienet.c and hw/xilinx_axidma.c. ... So what I'm proposing is AXI stream is implemented as a unidirectional point to point bus. The xilinx ethernet system would consist of two of these buses one for tx, one for rx. I thought the idea was that with QOM the bus/device model would go away. The DMA controller implements an AXIDMA interface, and the device has an AXIDMA link that's connected to that interface. Of course we then hit the usual problem with QOM that we can only link to objects, and it's impossible to expose multiple interfaces of the same type. The DMA controller probably needs a proxy object for each DMA channel. Paul
Re: [Qemu-devel] [RFC] QOMification of AXI stream
On 8 June 2012 10:13, Paul Brook p...@codesourcery.com wrote: Of course we then hit the usual problem with QOM that we can only link to objects, and it's impossible to expose multiple interfaces of the same type. I'm pretty sure Anthony claimed this was entirely possible -- presumably that's how Pins are going to work. Really? Every time I've talked to him I've got the opposite impression. Part of the response has been that interrupt pins are the only case where this actually occurs, so it's not worth fixing properly. I disagree with this assessment. Given we do need to expose multiple instances of the same interface, I see a few different options: - Create a proxy object for each receiver which multiplexes onto a different interface on the main object. For interrupt pins this basically means making the qemu_irq object part of the device tree, and have the actual device implement qemu_irq_handler (see hw/irq.h). The equivalent of qemu_irq (i.e. irq.c/h) needs to be created for every duplicated interface. It's worth noting that qemu_irq is about as simple as it gets, it's a single unidirectional call. - Make some form of handle an explicit part of the API. IMO this is a really bad idea, and a step backwards. In the qemu_irq case it means that the device raising the interrupt needs to know how the interrupt controller enumerates its input pins, and which one it's connected to. Instead of making connections via nice clean links we have a link and some other device specific information. It's worse than the old callback+opaque pointer pair because the user [machine description] has to provide that device specific additional value. - Link to properties, not objects. This probably ends up similar to the first option, except with a framework and consistent implementation across different interfaces. Paul
Re: [Qemu-devel] [RFC] QOMification of AXI stream
So what im proposing is AXI stream is implemented as a unidirectional point to point bus. The xilinx ethernet system would consist of two of these buses one for tx, one for rx. I thought the idea was that with QOM the bus/device model would go away. The DMA controller implements an AXIDMA interface, and the device has a AXIDMA link that's connected to that interface. Of course we then hit the usual problem with QOM that we can only link to objects, and it's impossible to expose multiple interfaces of the same type. No, QOM supports multiple inheritance of interfaces so you absolutely can inherit from multiple different interfaces. But you can't have multiple instances of the same interface. And the interfaces must be stateless. Hence you need the proxy object. Paul
Re: [Qemu-devel] [RFC] QOMification of AXI stream
Of course we then hit the usual problem with QOM that we can only link to objects, and it's impossible to expose multiple interfaces of the same type. I'm pretty sure Anthony claimed this was entirely possible -- presumably that's how Pins are going to work. Really? Every time I've talked to him I've got the opposite impression. Part of the response has been that interrupt pins are the only case where this actually occurs, so it's not worth fixing properly. I think it depends on your definition of properly. There's really only three concepts in QOM that matter for this discussion: 1) objects 2) children and 3) links. There is absolutely no difference between a Pin object and a SerialState object. They both are first-class objects as far as QOM is concerned. Both can have links to other objects. The most common way for other objects to create objects is via children. A device could have a bunch of Pin child objects with that being the sole communication mechanism with the outside world. And those pin objects would presumably communicate back to the device via some as-yet unimplemented PinMultiplex interface link? Or are you expecting the Pin objects to have an API call that allows the device to register an arbitrary QEMUBH (or equivalent)? A device could also have a 'PCISocket' child object (which inherits from PCIDevice) in order to expose a PCI interface to the world. For most bus-based devices, I think the above is poor design. But that's my opinion from a modeling PoV, QOM doesn't have an opinion from an infrastructure PoV. So what is a good design? Are you hoping most of your interfaces are stateless, so can be implemented directly on the device object? Paul
Re: [Qemu-devel] [PATCH V4 0/5] Ehnahced SSI bus support + M25P80 SPI flash + Xilinx SPI controller
On 5th April, when we first RFC'd our SPI layer support, you said to Peter: == I don't believe there is any difference between SSI and SPI. It's the exact same thing - the same way that many devices support a two-wire interface that is actually just I2C with a different name. The behavior of the CS pin varies between devices. It sounds like you need a bit of extra logic not present in the current ssi code. You should fix that, not invent a whole new bus. == He's gone and done exactly that, indeed generalised it with the proposed changes to SSI. No. There are two changes. Modelling the CS line in the SPI bus, and having SSI be a multipoint bus rather than point-point. Paul
Re: [Qemu-devel] [PATCH V4 0/5] Ehnahced SSI bus support + M25P80 SPI flash + Xilinx SPI controller
I'm still not convinced modelling this as a multipoint bus is a good idea. If nothing else you've failed to model the case where multiple slaves are selected simultanously. The bus can easily be changed such that multiple devices are selectable at once to get your desired multi device behaviour. AFAICT though nothing in QEMU behaves like this ATM. By my reading your xilinx device *should* behave like this. Given the chip selects are actual wires, not part of the bus itself, I think multiple point-point busses are a better fit. For the stellaris device we still have the synthetic mux device and intermediate bus. Yes, because in your stellaris architecture, the SSI controller (pl022) is point to point so that exactly matches the hardware. In the microblaze controller in this series, the controller has inbuilt muxing with one-hot CS behavior. To implement with point to point, I would have to dynamically create a number of sub-busses (driven by a qdev property). I would also have to have a device within a device to model the internal mux which increases my code volume significantly. Also you end up with this little piece of ugliness in your machine model and device model: I don't see why would would need a separate mux device. One of my issues is that you've made this a device property. A SPI device has no concept of address. This really is a property of the controller. The multi-slave bus is a direct superset on point-to-point. There is nothing stopping anyone from using it as p2p. Its just things are very ugly for SPI controllers with integrated muxes to treat everything as point to point. IMHO the resulting tree device is better with multiple point-point links. I'm hoping the hardcoded board descriptions (i.e. everything using ssi_create_slave) will go away sooner rather than later. Having two m25p80 devices that are indistinguishable apart from one minor property seems undesirable. Paul
Re: [Qemu-devel] [PATCH V4 0/5] Ehnahced SSI bus support + M25P80 SPI flash + Xilinx SPI controller
Patch 1 Enhances SSI bus support to properly support multiple attached devices. An api is provided for SSI/SPI masters to select a particular device attached to the bus. Patch 2 is a device model for the m25p80 style SPI flash chip. Patch 3 is the Xilinx XPS SPI controller. It's a sysbus device that instantiates a ssi bus, and interfaces the two (as per the controllers functionality) Patch 4 instantiates the XPS SPI controller in the petalogix ML605 reference platform and connects two m25p80s to it. Patch 5 updates the stellaris machine model to use the multi slave SSI support I'm still not convinced modelling this as a multipoint bus is a good idea. If nothing else you've failed to model the case where multiple slaves are selected simultaneously. Given the chip selects are actual wires, not part of the bus itself, I think multiple point-point busses are a better fit. For the stellaris device we still have the synthetic mux device and intermediate bus. Paul
Re: [Qemu-devel] [PATCH qom-next 57/59] cpu: Introduce mandatory tlb_flush callback
+void cpu_tlb_flush(CPUState *cpu, bool flush_global) +{ +CPUClass *cc = CPU_GET_CLASS(cpu); + +g_assert(cc-tlb_flush != NULL); + +cc-tlb_flush(cpu, flush_global); +} This needs to be able to call tlb_flush() itself rather than having to have every single subclass of CPUState implement an identical tlb_flush method. You could do this if there was a CPU_GET_ENV()... Which is exactly the point: CPUState does not know about the target-specific env. And CPU_GET_ENV() is just plain wrong conceptually because it adds yet another cpu.h dependency. Maybe so, but having every single target implement its own copy of the exact same target independent wrapper seems even more wrong. There's a separation between old code using env and new, clean code: Just like Anthony doesn't want old concepts rewritten with the new type (cf. object_realize() discussion) I don't want the old cpu.h #define mess leaking into code that I'm redesigning specifically to get rid of that target-*/cpu.h dependency in favor of a single qemu/cpu.h. qom/cpu.c is by definition not compiled per target so it cannot contain any target-specific code. At minimum it should be clearly documented[1] that this is a transitional hack, and how it should be removed. There have already been two posts in this thread suggesting this is a feature, implying that this operation is somehow target specific. I think the opposite is true: This is a target agnostic detail of the TCG implementation, and implementing architecturally defined MMU/TLB behavior here is actively wrong. Paul [1] In the code, not the commit message. Commit logs are not documentation. Commit logs are transient information valid only when the patch is applied. After that point they become archeological evidence, and you should not expect subsequent developers to be aware of them.
[Qemu-devel] [PATCH 0/2] SPI SDcard fixes v2
Recent testing showed the SPI mode SD card emulation (ssi-sd.c) doesn't actually work if the guest tries to use features from the SD Physical Layer Specification v2 (this includes SDHC). This series replaces my previous single patch. Paul Brook (2): Fix SPI SD card command responses hw/sd.c: Implement CMD58 hw/sd.c | 140 +-- hw/sd.h | 17 hw/ssi-sd.c | 83 +++ 3 files changed, 138 insertions(+), 102 deletions(-) -- 1.7.10
[Qemu-devel] [PATCH 2/2] hw/sd.c: Implement CMD58
Implement CMD58. This command is only valid in SPI mode, and required when we implement CMD8. Most of the code is already there, we just need to trigger it. Signed-off-by: Paul Brook p...@codesourcery.com --- hw/sd.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/hw/sd.c b/hw/sd.c index 220562e..952d5d8 100644 --- a/hw/sd.c +++ b/hw/sd.c @@ -141,7 +141,7 @@ static const sd_cmd_type_t sd_cmd_type[64] = { sd_ac, sd_ac, sd_none, sd_none, sd_none, sd_none, sd_ac, sd_none, sd_none, sd_none, sd_bc, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_ac, -sd_adtc, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, +sd_adtc, sd_none, sd_bcr, sd_none, sd_none, sd_none, sd_none, sd_none, }; static const sd_cmd_type_t sd_acmd_type[64] = { @@ -1223,6 +1223,19 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, break; } break; +case 58:/* CMD58: READ_OCR */ +if (!sd-spi) { +goto bad_cmd; +} +switch (sd-state) { +case sd_idle_state: +case sd_transfer_state: +return sd_r3; + +default: +break; +} +break; default: bad_cmd: -- 1.7.10
[Qemu-devel] [PATCH 1/2] Fix SPI SD card command responses
When in SPI mode, we give a bogus response to CMD8 (part of the SD physical spec v2). This command should return both the status byte and the register value. The current code returns long status words from sd.c, then parses translates those to SPI status bytes ssi-sd.c. For CMD8 (and CMD58 to follow) this gets messy, with both parts requiring command specific knowledge. We already have magic SPI-mode behavior in sd.c, so may as well just generate the correct response there. Signed-off-by: Paul Brook p...@codesourcery.com --- hw/sd.c | 125 --- hw/sd.h | 17 hw/ssi-sd.c | 83 +++ 3 files changed, 124 insertions(+), 101 deletions(-) diff --git a/hw/sd.c b/hw/sd.c index 07eb263..220562e 100644 --- a/hw/sd.c +++ b/hw/sd.c @@ -52,6 +52,7 @@ typedef enum { sd_r7,/* Operating voltage */ sd_r1b = -1, sd_illegal = -2, +sd_r1_long = -3, /* Two byte status in SPI mode. */ } sd_rsp_type_t; struct SDState { @@ -342,24 +343,93 @@ static int sd_req_crc_validate(SDRequest *req) return sd_crc7(buffer, 5) != req-crc; /* TODO */ } -static void sd_response_r1_make(SDState *sd, uint8_t *response) + +/* Make SPI status word from full card status. Most commands only use + the high byte. 
*/ +static uint16_t sd_get_spi_status(SDState *sd, uint32_t cardstatus) +{ +uint16_t status = 0; + +if (((cardstatus 9) 0xf) 4) +status |= SPI_SDR_IDLE; +if (cardstatus ERASE_RESET) +status |= SPI_SDR_ERASE_RESET; +if (cardstatus ILLEGAL_COMMAND) +status |= SPI_SDR_ILLEGAL_COMMAND; +if (cardstatus COM_CRC_ERROR) +status |= SPI_SDR_COM_CRC_ERROR; +if (cardstatus ERASE_SEQ_ERROR) +status |= SPI_SDR_ERASE_SEQ_ERROR; +if (cardstatus ADDRESS_ERROR) +status |= SPI_SDR_ADDRESS_ERROR; +if (cardstatus CARD_IS_LOCKED) +status |= SPI_SDR_LOCKED; +if (cardstatus (LOCK_UNLOCK_FAILED | WP_ERASE_SKIP)) +status |= SPI_SDR_WP_ERASE; +if (cardstatus SD_ERROR) +status |= SPI_SDR_ERROR; +if (cardstatus CC_ERROR) +status |= SPI_SDR_CC_ERROR; +if (cardstatus CARD_ECC_FAILED) +status |= SPI_SDR_ECC_FAILED; +if (cardstatus WP_VIOLATION) +status |= SPI_SDR_WP_VIOLATION; +if (cardstatus ERASE_PARAM) +status |= SPI_SDR_ERASE_PARAM; +if (cardstatus (OUT_OF_RANGE | CID_CSD_OVERWRITE)) +status |= SPI_SDR_OUT_OF_RANGE; +/* ??? Don't know what Parameter Error really means, so + assume it's set if the second byte is nonzero. */ +if (status 0xff) +status |= SPI_SDR_PARAMETER_ERROR; + +return status; +} + +static int sd_response_r1_make(SDState *sd, uint8_t *response) { uint32_t status = sd-card_status; /* Clear the clear on read status bits */ sd-card_status = ~CARD_STATUS_C; -response[0] = (status 24) 0xff; -response[1] = (status 16) 0xff; -response[2] = (status 8) 0xff; -response[3] = (status 0) 0xff; +if (sd-spi) { +response[0] = sd_get_spi_status(sd, status) 8; +return 1; +} else { +response[0] = (status 24) 0xff; +response[1] = (status 16) 0xff; +response[2] = (status 8) 0xff; +response[3] = (status 0) 0xff; +return 4; +} +} + +/* Only used in SPI mode. 
*/ +static int sd_response_r1_long_make(SDState *sd, uint8_t *response) +{ +uint32_t status = sd-card_status; +/* Clear the clear on read status bits */ +sd-card_status = ~CARD_STATUS_C; +status = sd_get_spi_status(sd, status); +response[0] = status 8; +response[1] = status 0xff; +return 2; } -static void sd_response_r3_make(SDState *sd, uint8_t *response) +static int sd_response_r3_make(SDState *sd, uint8_t *response) { -response[0] = (sd-ocr 24) 0xff; -response[1] = (sd-ocr 16) 0xff; -response[2] = (sd-ocr 8) 0xff; -response[3] = (sd-ocr 0) 0xff; +int len = 4; + +if (sd-spi) { +len = 5; +*(response++) = sd_get_spi_status(sd, sd-card_status) 8; +} +*(response++) = (sd-ocr 24) 0xff; +*(response++) = (sd-ocr 16) 0xff; +*(response++) = (sd-ocr 8) 0xff; +*(response++) = (sd-ocr 0) 0xff; + +return len; } static void sd_response_r6_make(SDState *sd, uint8_t *response) @@ -379,12 +449,20 @@ static void sd_response_r6_make(SDState *sd, uint8_t *response) response[3] = status 0xff; } -static void sd_response_r7_make(SDState *sd, uint8_t *response) +static int sd_response_r7_make(SDState *sd, uint8_t *response) { -response[0] = (sd-vhs 24) 0xff; -response[1] = (sd-vhs 16) 0xff; -response[2] = (sd-vhs 8) 0xff; -response[3] = (sd-vhs 0) 0xff; +int len = 4; + +if (sd-spi) { +len = 5; +*(response++) = sd_get_spi_status(sd, sd-card_status) 8; +} +*(response
Re: [Qemu-devel] [PATCH] Fix SPI SD emulation
If this command could be issued in transfer state maybe in addition to IDLE_STATE you also need to set other bits (ADDRESS_ERROR, COM_CRC_ERROR, ILLEGAL_COMMAND, ERASE_SEQ_ERROR) in MSB of R3 response? In theory, yes. I was thinking of a follow-up patch to move the spi status byte generation into sd.c. Maybe I should do that first. Do you mean the one in ssi-sd.c? That would be nice I think, a bit less confusing. I posted v2 of the patch earlier today: http://lists.nongnu.org/archive/html/qemu-devel/2012-04/msg04214.html Paul
Re: [Qemu-devel] [PATCH V3 11/13] SD card: introduce spi property for SD card objects
And drop passing is_spi argument to SDCardClass::init function. spi property could be set while SD card is in IDLE state. It defaults to false. Why? This isn't something that should be under user or board control. The SD card object is an implementation detail. It's something that's part of the host controller. i.e. either an SD host controller or an SPI bus device. Do you have an example of why you would need to defer this decision? If you want to separate instantiation of the SD card from the controller (e.g. to implement sdio devices) then you need a SD bus, plus an sd card device. Something like we do for spi connected cards. Paul
Re: [Qemu-devel] [PATCH 10/14] target-arm: Move feature register setup to per-CPU init fns
Move feature register value setup to per-CPU init functions. +env-cp15.c0_c1[0] = cpu-id_pfr0; +env-cp15.c0_c1[1] = cpu-id_pfr1; +env-cp15.c0_c1[2] = cpu-id_dfr0; +env-cp15.c0_c1[3] = cpu-id_afr0; +env-cp15.c0_c1[4] = cpu-id_mmfr0; +env-cp15.c0_c1[5] = cpu-id_mmfr1; +env-cp15.c0_c1[6] = cpu-id_mmfr2; +env-cp15.c0_c1[7] = cpu-id_mmfr3; +env-cp15.c0_c2[0] = cpu-id_isar0; +env-cp15.c0_c2[1] = cpu-id_isar1; +env-cp15.c0_c2[2] = cpu-id_isar2; +env-cp15.c0_c2[3] = cpu-id_isar3; +env-cp15.c0_c2[4] = cpu-id_isar4; +env-cp15.c0_c2[5] = cpu-id_isar5; Why are we copying these values? All these registers are readonly, so the duplication seems wrong. Shouldn't we should be using cpu-whatever everywhere? I feel like I've asked this before, but don't remember seeing an answer. Also, I'd prefer that id_isr5 were explicitly initialized, rather than relying on it being implicitly zero. Bugs in an earlier patch series show how easy it is to accidentally miss a register. IMO it's worth distinguishing a defined register that happens to be zero from a register this core doesn't have. Overall I'm not convinced that the new open-coded initialization is better then the tables it replaces. Paul
Re: [Qemu-devel] [PATCH] Fix SPI SD emulation
-sd_adtc, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, +sd_adtc, sd_none, sd_bc, sd_bc, sd_none, sd_none, sd_bcr? not that it really matters though Err, probably. +case 58:/* CMD58: READ_OCR */ +if (!sd-spi) { +goto bad_cmd; +} +switch (sd-state) { +case sd_idle_state: +case sd_transfer_state: +return sd_r3; If this command could be issued in transfer state maybe in addition to IDLE_STATE you also need to set other bits (ADDRESS_ERROR, COM_CRC_ERROR, ILLEGAL_COMMAND, ERASE_SEQ_ERROR) in MSB of R3 response? In theory, yes. I was thinking of a follow-up patch to move the spi status byte generation into sd.c. Maybe I should do that first. Paul
Re: [Qemu-devel] [PATCH v6 5/5] FreeSCALE i.MX31 support: KZM-ARM11-01 evaluation board
On 23 April 2012 23:21, Peter Chubb peter.ch...@nicta.com.au wrote: Peter Are these two devices really on the same IRQ? Yes. A single interrupt line comes from the FPGA into the AVIC. Inside the FPGA the interrupts for the UARTs, SD card and NAND flash are connected to that single interrupt line. The non-touchscreen FPGA UART isn't mentioned in the KZM manual, but is available on the board as a debug port. To avoid confusion I think I'll just get rid of it. Up to you. A comment would be fine if you'd rather keep the device. No it's not. You must never connect multiple devices to the same IRQ. You need an explicit mux in between. Paul
Re: [Qemu-devel] [PATCH v3 1/4] SSI: Built in multiple device support
I'm happy to spend the 10 mins updating stellaris.c accordingly, but is someone sitting on a binary package and brief instructions or some such to regression test it? Do you of this machine have some sort of kernel image handy? I've attached a tarball with some test binaries. They're built from the example libraries shipped with this board. The first exercises the display, amongst other things. The second exercises the SD card. Simple SD card image also included (remember to ungzip it first). ls and cat readme.txt over the serial port to make it do something verifiable. Run them with: ./qemu-system-arm -M lm3s6965evb -kernel qs_ek-lm3s6965.bin ./qemu-system-arm -M lm3s6965evb -serial stdio -kernel sd_card.bin -sd sdcard.img I don't have any software handy that exercises both simultaneously. It's probably worth mentioning that we don't currently implement all the CS lines accurately for this board. Most pins on this device are dual-function. They can be configured either as regular GPIO, or driven from a peripheral (aka alternate function, e.g. the SSI controller). Config is done via the GPIO controllers. There are 7 GPIO controllers (A-G) with 8 pins each. On reset all pins are configured as floating GPIO, and we let D0 float high. The frame start/chip select line from the SPI controller goes via GPIO A3. This is connected to the display controller (ssd0323) CS pin. The SD card CS pin is connected to GPIO D0. When communicating with the display controller the SSI pins will be configured normally. When communicating with the SD card we configure A3 as a GPIO pin, set high (inactive), and pull D0 low to select the SD card. The current implementation ignores the SSI select pin (A3), and assumes the display controller is selected whenever the SD card (D0) is not. We do not implement the alternate function select in the GPIO controller. It's a bit of a strange setup, but I guess probably not that unusual. Paul