Re: [PATCH v1 3/7] contrib/gitdm: add Paul to individual contributors

2022-09-27 Thread Paul Brook
Yes, I'm happy for p...@codesourcery.com to be linked to my current email for 
attribution purposes.

Paul

On 26 September 2022 14:46:05 BST, "Alex Bennée"  wrote:
>Do you want to map old commits to your canonical email now as well?
>
>Signed-off-by: Alex Bennée 
>Cc: Paul Brook 
>---
> contrib/gitdm/group-map-individuals | 1 +
> 1 file changed, 1 insertion(+)
>
>diff --git a/contrib/gitdm/group-map-individuals 
>b/contrib/gitdm/group-map-individuals
>index d5b05041bc..0ec003048c 100644
>--- a/contrib/gitdm/group-map-individuals
>+++ b/contrib/gitdm/group-map-individuals
>@@ -35,3 +35,4 @@ liq...@gmail.com
> chetan4wind...@gmail.com
> akihiko.od...@gmail.com
> si...@simonsafar.com
>+p...@nowt.org
>-- 
>2.34.1
>
>


[PATCH v2 19/42] i386: Rewrite blendv helpers

2022-04-24 Thread Paul Brook
Rewrite the blendv helpers so that they can easily be extended to support
the AVX encodings, which make all 4 arguments explicit.

No functional changes to the existing helpers

Signed-off-by: Paul Brook 
---
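Aside: a minimal C model of the four-operand blend that the new BLEND_V128
macro implements, with illustrative names (the real helpers operate on Reg
elements through the element accessors):

    #include <stddef.h>
    #include <stdint.h>

    /* For each byte, the mask's top bit selects the second source (s)
     * over the first source (v); the destination may alias either. */
    static void blendv_bytes(uint8_t *d, const uint8_t *v, const uint8_t *s,
                             const uint8_t *m, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            d[i] = (m[i] & 0x80) ? s[i] : v[i];
        }
    }
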
 target/i386/ops_sse.h | 119 +-
 1 file changed, 60 insertions(+), 59 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 3202c00572..9f388b02b9 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2141,73 +2141,74 @@ void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s,
 }
 }
 
-#define XMM0 (env->xmm_regs[0])
+#if SHIFT >= 1
+
+#define BLEND_V128(elem, num, F, b) do {\
+d->elem(b + 0) = F(v->elem(b + 0), s->elem(b + 0), m->elem(b + 0)); \
+d->elem(b + 1) = F(v->elem(b + 1), s->elem(b + 1), m->elem(b + 1)); \
+if (num > 2) {  \
+d->elem(b + 2) = F(v->elem(b + 2), s->elem(b + 2), m->elem(b + 2)); \
+d->elem(b + 3) = F(v->elem(b + 3), s->elem(b + 3), m->elem(b + 3)); \
+}   \
+if (num > 4) {  \
+d->elem(b + 4) = F(v->elem(b + 4), s->elem(b + 4), m->elem(b + 4)); \
+d->elem(b + 5) = F(v->elem(b + 5), s->elem(b + 5), m->elem(b + 5)); \
+d->elem(b + 6) = F(v->elem(b + 6), s->elem(b + 6), m->elem(b + 6)); \
+d->elem(b + 7) = F(v->elem(b + 7), s->elem(b + 7), m->elem(b + 7)); \
+}   \
+if (num > 8) {  \
+d->elem(b + 8) = F(v->elem(b + 8), s->elem(b + 8), m->elem(b + 8)); \
+d->elem(b + 9) = F(v->elem(b + 9), s->elem(b + 9), m->elem(b + 9)); \
+d->elem(b + 10) = F(v->elem(b + 10), s->elem(b + 10), m->elem(b + 
10));\
+d->elem(b + 11) = F(v->elem(b + 11), s->elem(b + 11), m->elem(b + 
11));\
+d->elem(b + 12) = F(v->elem(b + 12), s->elem(b + 12), m->elem(b + 
12));\
+d->elem(b + 13) = F(v->elem(b + 13), s->elem(b + 13), m->elem(b + 
13));\
+d->elem(b + 14) = F(v->elem(b + 14), s->elem(b + 14), m->elem(b + 
14));\
+d->elem(b + 15) = F(v->elem(b + 15), s->elem(b + 15), m->elem(b + 
15));\
+}   \
+} while (0)
 
-#if SHIFT == 1
 #define SSE_HELPER_V(name, elem, num, F)\
-void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
+void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
 {   \
-d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));   \
-d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));   \
-if (num > 2) {  \
-d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));   \
-d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));   \
-if (num > 4) {  \
-d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));   \
-d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));   \
-d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));   \
-d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));   \
-if (num > 8) {  \
-d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8)); \
-d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9)); \
-d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10)); \
-d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11)); \
-d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12)); \
-d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13)); \
-d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14)); \
-d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15)); \
-}   \
-}   \
-}   \
-}
+Reg *v = d; \
+Reg *m = &env->xmm_regs[0];

[PATCH v2 31/42] i386: Implement AVX variable shifts

2022-04-24 Thread Paul Brook
These use the W bit to encode the operand width, but are otherwise fairly
straightforward.

Signed-off-by: Paul Brook 
---
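Aside: a sketch of the per-lane semantics these helpers implement, assuming
the architectural rule that out-of-range counts give 0 for logical shifts and
all sign bits for arithmetic shifts (illustrative, 32-bit lanes only):

    #include <stdint.h>

    static uint32_t psrlv32(uint32_t x, uint32_t count)   /* VPSRLVD lane */
    {
        return count < 32 ? x >> count : 0;
    }

    static uint32_t psllv32(uint32_t x, uint32_t count)   /* VPSLLVD lane */
    {
        return count < 32 ? x << count : 0;
    }

    static int32_t psrav32(int32_t x, uint32_t count)     /* VPSRAVD lane */
    {
        return x >> (count < 32 ? count : 31);
    }
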
 target/i386/ops_sse.h| 17 +
 target/i386/ops_sse_header.h |  6 ++
 target/i386/tcg/translate.c  | 17 +
 3 files changed, 40 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 9b92b9790a..8f2bd48394 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3195,6 +3195,23 @@ void glue(helper_vpermilps_imm, SUFFIX)(CPUX86State *env,
 #endif
 }
 
+#if SHIFT == 1
+#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0)
+#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0)
+#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 64 ? c : 31))
+#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63))
+#define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0)
+#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0)
+#endif
+
+SSE_HELPER_L(helper_vpsrlvd, FPSRLVD)
+SSE_HELPER_L(helper_vpsravd, FPSRAVD)
+SSE_HELPER_L(helper_vpsllvd, FPSLLVD)
+
+SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
+SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
+SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)
+
 #if SHIFT == 2
 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index c52169a030..20db6c4240 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -421,6 +421,12 @@ DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, 
Reg)
 DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg)
 DEF_HELPER_4(glue(vpermilpd_imm, SUFFIX), void, env, Reg, Reg, i32)
 DEF_HELPER_4(glue(vpermilps_imm, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(vpsrlvd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsravd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg)
 #if SHIFT == 2
 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 358c3ecb0b..4990470083 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3293,6 +3293,9 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] 
= {
 [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX),
 #define gen_helper_phminposuw_ymm NULL
 [0x41] = UNARY_OP(phminposuw, SSE41, 0),
+[0x45] = BINARY_OP(vpsrlvd, AVX, SSE_OPF_AVX2),
+[0x46] = BINARY_OP(vpsravd, AVX, SSE_OPF_AVX2),
+[0x47] = BINARY_OP(vpsllvd, AVX, SSE_OPF_AVX2),
 /* vpbroadcastd */
 [0x58] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
 /* vpbroadcastq */
@@ -3357,6 +3360,15 @@ static const struct SSEOpHelper_table7 
sse_op_table7[256] = {
 #undef BLENDV_OP
 #undef SPECIAL_OP
 
+#define SSE_OP(name) \
+{gen_helper_ ## name ##_xmm, gen_helper_ ## name ##_ymm}
+static const SSEFunc_0_eppp sse_op_table8[3][2] = {
+SSE_OP(vpsrlvq),
+SSE_OP(vpsravq),
+SSE_OP(vpsllvq),
+};
+#undef SSE_OP
+
 /* VEX prefix not allowed */
 #define CHECK_NO_VEX(s) do { \
 if (s->prefix & PREFIX_VEX) \
@@ -4439,6 +4451,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 tcg_temp_free_ptr(mask);
 } else {
 SSEFunc_0_eppp fn = op6.fn[b1].op2;
+if (REX_W(s)) {
+if (b >= 0x45 && b <= 0x47) {
+fn = sse_op_table8[b - 0x45][b1 - 1];
+}
+}
 fn(cpu_env, s->ptr0, s->ptr2, s->ptr1);
 }
 }
-- 
2.36.0




[PATCH v2 12/42] i386: Misc integer AVX helper prep

2022-04-24 Thread Paul Brook
More preparatory work for AVX support in various integer vector helpers.

No functional changes to existing helpers.

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 133 +-
 1 file changed, 104 insertions(+), 29 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index bb9cbf9ead..d0424140d9 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -557,19 +557,25 @@ SSE_HELPER_W(helper_pavgw, FAVG)
 
 void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
-d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
-#if SHIFT == 1
-d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
+Reg *v = d;
+d->Q(0) = (uint64_t)s->L(0) * (uint64_t)v->L(0);
+#if SHIFT >= 1
+d->Q(1) = (uint64_t)s->L(2) * (uint64_t)v->L(2);
+#if SHIFT == 2
+d->Q(2) = (uint64_t)s->L(4) * (uint64_t)v->L(4);
+d->Q(3) = (uint64_t)s->L(6) * (uint64_t)v->L(6);
+#endif
 #endif
 }
 
 void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
+Reg *v = d;
 int i;
 
 for (i = 0; i < (2 << SHIFT); i++) {
-d->L(i) = (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) +
-(int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1);
+d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
+(int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
 }
 }
 
@@ -583,31 +589,55 @@ static inline int abs1(int a)
 }
 }
 #endif
+
 void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
+Reg *v = d;
 unsigned int val;
 
 val = 0;
-val += abs1(d->B(0) - s->B(0));
-val += abs1(d->B(1) - s->B(1));
-val += abs1(d->B(2) - s->B(2));
-val += abs1(d->B(3) - s->B(3));
-val += abs1(d->B(4) - s->B(4));
-val += abs1(d->B(5) - s->B(5));
-val += abs1(d->B(6) - s->B(6));
-val += abs1(d->B(7) - s->B(7));
+val += abs1(v->B(0) - s->B(0));
+val += abs1(v->B(1) - s->B(1));
+val += abs1(v->B(2) - s->B(2));
+val += abs1(v->B(3) - s->B(3));
+val += abs1(v->B(4) - s->B(4));
+val += abs1(v->B(5) - s->B(5));
+val += abs1(v->B(6) - s->B(6));
+val += abs1(v->B(7) - s->B(7));
 d->Q(0) = val;
-#if SHIFT == 1
+#if SHIFT >= 1
 val = 0;
-val += abs1(d->B(8) - s->B(8));
-val += abs1(d->B(9) - s->B(9));
-val += abs1(d->B(10) - s->B(10));
-val += abs1(d->B(11) - s->B(11));
-val += abs1(d->B(12) - s->B(12));
-val += abs1(d->B(13) - s->B(13));
-val += abs1(d->B(14) - s->B(14));
-val += abs1(d->B(15) - s->B(15));
+val += abs1(v->B(8) - s->B(8));
+val += abs1(v->B(9) - s->B(9));
+val += abs1(v->B(10) - s->B(10));
+val += abs1(v->B(11) - s->B(11));
+val += abs1(v->B(12) - s->B(12));
+val += abs1(v->B(13) - s->B(13));
+val += abs1(v->B(14) - s->B(14));
+val += abs1(v->B(15) - s->B(15));
 d->Q(1) = val;
+#if SHIFT == 2
+val = 0;
+val += abs1(v->B(16) - s->B(16));
+val += abs1(v->B(17) - s->B(17));
+val += abs1(v->B(18) - s->B(18));
+val += abs1(v->B(19) - s->B(19));
+val += abs1(v->B(20) - s->B(20));
+val += abs1(v->B(21) - s->B(21));
+val += abs1(v->B(22) - s->B(22));
+val += abs1(v->B(23) - s->B(23));
+d->Q(2) = val;
+val = 0;
+val += abs1(v->B(24) - s->B(24));
+val += abs1(v->B(25) - s->B(25));
+val += abs1(v->B(26) - s->B(26));
+val += abs1(v->B(27) - s->B(27));
+val += abs1(v->B(28) - s->B(28));
+val += abs1(v->B(29) - s->B(29));
+val += abs1(v->B(30) - s->B(30));
+val += abs1(v->B(31) - s->B(31));
+d->Q(3) = val;
+#endif
 #endif
 }
 
@@ -627,8 +657,12 @@ void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
 {
 d->L(0) = val;
 d->L(1) = 0;
-#if SHIFT == 1
+#if SHIFT >= 1
 d->Q(1) = 0;
+#if SHIFT == 2
+d->Q(2) = 0;
+d->Q(3) = 0;
+#endif
 #endif
 }
 
@@ -636,8 +670,12 @@ void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
 void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
 {
 d->Q(0) = val;
-#if SHIFT == 1
+#if SHIFT >= 1
 d->Q(1) = 0;
+#if SHIFT == 2
+d->Q(2) = 0;
+d->Q(3) = 0;
+#endif
 #endif
 }
 #endif
@@ -1251,7 +1289,7 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, 
Reg *s)
 val |= (s->B(5) >> 2) & 0x20;
 val |= (s->B(6) >> 1) & 0x40;
 val |= (s->B(7)) & 0x80;
-#if SHIFT == 1
+#if SHIFT >= 1
 val |= (s->B(8) << 1) & 0x0100;
 val |= (s->B(9) << 2) & 0x0200;
 val |= (s->B(10) << 3) & 0x0400;
@@ -1260,6 +1

[PATCH v2 20/42] i386: AVX pclmulqdq

2022-04-24 Thread Paul Brook
Make the pclmulqdq helper AVX ready

Signed-off-by: Paul Brook 
---
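Aside: the factored-out clmulq() is a 64x64->128-bit carry-less (GF(2))
multiply. A self-contained model of that operation, with names of my own
choosing rather than the patch's:

    #include <stdint.h>

    static void clmul_64x64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
    {
        uint64_t resl = 0, resh = 0;

        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                resl ^= a << i;                    /* low 64 bits */
                resh ^= i ? a >> (64 - i) : 0;     /* bits 64..127 */
            }
        }
        *lo = resl;
        *hi = resh;
    }
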
 target/i386/ops_sse.h | 31 ---
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 9f388b02b9..b7100fdce1 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2885,14 +2885,14 @@ target_ulong helper_crc32(uint32_t crc1, target_ulong 
msg, uint32_t len)
 
 #endif
 
-void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
-uint32_t ctrl)
+#if SHIFT == 1
+static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
+  uint64_t a, uint64_t b)
 {
-uint64_t ah, al, b, resh, resl;
+uint64_t al, ah, resh, resl;
 
 ah = 0;
-al = d->Q((ctrl & 1) != 0);
-b = s->Q((ctrl & 16) != 0);
+al = a;
 resh = resl = 0;
 
 while (b) {
@@ -2905,8 +2905,25 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, 
Reg *d, Reg *s,
 b >>= 1;
 }
 
-d->Q(0) = resl;
-d->Q(1) = resh;
+*dest_l = resl;
+*dest_h = resh;
+}
+#endif
+
+void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+uint32_t ctrl)
+{
+Reg *v = d;
+uint64_t a, b;
+
+a = v->Q((ctrl & 1) != 0);
+b = s->Q((ctrl & 16) != 0);
+clmulq(&d->Q(0), &d->Q(1), a, b);
+#if SHIFT == 2
+a = v->Q(((ctrl & 1) != 0) + 2);
+b = s->Q(((ctrl & 16) != 0) + 2);
+clmulq(&d->Q(2), &d->Q(3), a, b);
+#endif
 }
 
 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-- 
2.36.0




[PATCH v2 32/42] i386: Implement VTEST

2022-04-24 Thread Paul Brook
Nothing special here.

Signed-off-by: Paul Brook 
---
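Aside: the flag rule the new helpers encode, as a stand-alone model over
32-bit lanes (names are illustrative): ZF is set when s & d has no lane with
its sign bit set, CF when s & ~d has none.

    #include <stddef.h>
    #include <stdint.h>

    static void vtestps_model(const uint32_t *d, const uint32_t *s,
                              size_t lanes, int *zf, int *cf)
    {
        uint32_t z = 0, c = 0;

        for (size_t i = 0; i < lanes; i++) {
            z |= s[i] & d[i];
            c |= s[i] & ~d[i];
        }
        *zf = !(z >> 31);
        *cf = !(c >> 31);
    }
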
 target/i386/ops_sse.h| 28 
 target/i386/ops_sse_header.h |  2 ++
 target/i386/tcg/translate.c  |  2 ++
 3 files changed, 32 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 8f2bd48394..edf14a25d7 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3212,6 +3212,34 @@ SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
 SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
 SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)
 
+void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+uint32_t zf = (s->L(0) &  d->L(0)) | (s->L(1) &  d->L(1));
+uint32_t cf = (s->L(0) & ~d->L(0)) | (s->L(1) & ~d->L(1));
+
+zf |= (s->L(2) &  d->L(2)) | (s->L(3) &  d->L(3));
+cf |= (s->L(2) & ~d->L(2)) | (s->L(3) & ~d->L(3));
+#if SHIFT == 2
+zf |= (s->L(4) &  d->L(4)) | (s->L(5) &  d->L(5));
+cf |= (s->L(4) & ~d->L(4)) | (s->L(5) & ~d->L(5));
+zf |= (s->L(6) &  d->L(6)) | (s->L(7) &  d->L(7));
+cf |= (s->L(6) & ~d->L(6)) | (s->L(7) & ~d->L(7));
+#endif
+CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C);
+}
+
+void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+uint64_t zf = (s->Q(0) &  d->Q(0)) | (s->Q(1) &  d->Q(1));
+uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
+
+#if SHIFT == 2
+zf |= (s->Q(2) &  d->Q(2)) | (s->Q(3) &  d->Q(3));
+cf |= (s->Q(2) & ~d->Q(2)) | (s->Q(3) & ~d->Q(3));
+#endif
+CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
+}
+
 #if SHIFT == 2
 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 20db6c4240..8b93b8e6d6 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -427,6 +427,8 @@ DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, 
Reg)
 DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg)
 DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg)
 DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg)
 #if SHIFT == 2
 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 4990470083..2fbb7bfcad 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3253,6 +3253,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] 
= {
 [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3),
 [0x0c] = BINARY_OP(vpermilps, AVX, 0),
 [0x0d] = BINARY_OP(vpermilpd, AVX, 0),
+[0x0e] = CMP_OP(vtestps, AVX),
+[0x0f] = CMP_OP(vtestpd, AVX),
 [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX),
 [0x14] = BLENDV_OP(blendvps, SSE41, 0),
 [0x15] = BLENDV_OP(blendvpd, SSE41, 0),
-- 
2.36.0




[PATCH v2 14/42] i386: Add size suffix to vector FP helpers

2022-04-24 Thread Paul Brook
For AVX we're going to need both 128 bit (xmm) and 256 bit (ymm) variants of
floating point helpers. Add the register type suffix to the existing
*PS and *PD helpers (SS and SD variants are only valid on 128 bit vectors).

No functional changes.

Signed-off-by: Paul Brook 
---
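Aside: a small sketch of the glue()/SUFFIX mechanism this renaming relies on.
ops_sse.h is compiled once per SHIFT value, so a single definition yields an
_xmm and a _ymm helper (simplified; the real SUFFIX selection lives in
ops_sse.h):

    #define xglue(a, b) a ## b
    #define glue(a, b) xglue(a, b)

    #define SUFFIX _xmm                     /* would be _ymm when SHIFT == 2 */
    void glue(helper_shufps, SUFFIX)(void); /* expands to helper_shufps_xmm */
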
 target/i386/ops_sse.h| 48 ++--
 target/i386/ops_sse_header.h | 48 ++--
 target/i386/tcg/translate.c  | 37 +--
 3 files changed, 67 insertions(+), 66 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index c645d2ddbf..fc8fd57aa5 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -699,7 +699,7 @@ void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
 SHUFFLE4(W, s, s, 0);
 }
 #else
-void helper_shufps(Reg *d, Reg *s, int order)
+void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
 {
 Reg *v = d;
 uint32_t r0, r1, r2, r3;
@@ -710,7 +710,7 @@ void helper_shufps(Reg *d, Reg *s, int order)
 #endif
 }
 
-void helper_shufpd(Reg *d, Reg *s, int order)
+void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
 {
 Reg *v = d;
 uint64_t r0, r1;
@@ -767,7 +767,7 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
 /* XXX: not accurate */
 
 #define SSE_HELPER_S(name, F)   \
-void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)\
+void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\
 {   \
 d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));  \
 d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1));  \
@@ -780,7 +780,7 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
 d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));  \
 }   \
 \
-void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)\
+void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\
 {   \
 d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));  \
 d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1));  \
@@ -816,7 +816,7 @@ SSE_HELPER_S(sqrt, FPU_SQRT)
 
 
 /* float to float conversions */
-void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 float32 s0, s1;
 
@@ -826,7 +826,7 @@ void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s)
 d->ZMM_D(1) = float32_to_float64(s1, >sse_status);
 }
 
-void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), >sse_status);
 d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), >sse_status);
@@ -844,7 +844,7 @@ void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
 }
 
 /* integer to float */
-void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 d->ZMM_S(0) = int32_to_float32(s->ZMM_L(0), >sse_status);
 d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), >sse_status);
@@ -852,7 +852,7 @@ void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s)
 d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), >sse_status);
 }
 
-void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 int32_t l0, l1;
 
@@ -929,7 +929,7 @@ WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, 
float32, INT64_MIN)
 WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
 WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
 
-void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 {
 d->ZMM_L(0) = x86_float32_to_int32(s->ZMM_S(0), >sse_status);
 d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), >sse_status);
@@ -937,7 +937,7 @@ void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), >sse_status);
 }
 
-void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 {
 d->ZMM_L(0) = x86_float64_to_int32(s->ZMM_D(0), >sse_status);
 d->ZMM_L(1) = x86_float64_to_int32(s->ZMM_D(1), >sse_status);
@@ -979,7 +979,7 @@ int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
 #endif
 
 /* float to integer truncated */
-void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+vo

[PATCH v2 27/42] i386: Translate 256 bit AVX instructions

2022-04-24 Thread Paul Brook
All the work for the helper functions is already done; we just need to build
them, plus a few macro tweaks to populate the lookup tables.

For sse_op_table6 and sse_op_table7 we use #defines to fill in the entries
where an opcode only supports one vector size, rather than complicating the
main table.

Several of the open-coded mov-type instructions need special handling, but most
of the rest falls out from the infrastructure we already added.

Also clear the top half of the register after 128 bit VEX register writes.
In the current code this correlates with VEX.L == 0, but there are exceptions
later.

Signed-off-by: Paul Brook 
---
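Aside: what the new gen_ldy_env_A0()/gen_sty_env_A0() boil down to, modelled
as plain host code on a little-endian host (the generated TCG does the
equivalent with four 64-bit guest memory accesses):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint64_t q[4]; } YMMVal;   /* ZMM_Q(0..3) of one ymm */

    static void ldy_model(YMMVal *r, const uint8_t *mem)
    {
        for (int i = 0; i < 4; i++) {
            memcpy(&r->q[i], mem + 8 * i, 8);    /* A0 + 0, 8, 16, 24 */
        }
    }
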
 target/i386/helper.h |   2 +
 target/i386/tcg/fpu_helper.c |   3 +
 target/i386/tcg/translate.c  | 370 +--
 3 files changed, 319 insertions(+), 56 deletions(-)

diff --git a/target/i386/helper.h b/target/i386/helper.h
index ac3b4d1ee3..3da5df98b9 100644
--- a/target/i386/helper.h
+++ b/target/i386/helper.h
@@ -218,6 +218,8 @@ DEF_HELPER_3(movq, void, env, ptr, ptr)
 #include "ops_sse_header.h"
 #define SHIFT 1
 #include "ops_sse_header.h"
+#define SHIFT 2
+#include "ops_sse_header.h"
 
 DEF_HELPER_3(rclb, tl, env, tl, tl)
 DEF_HELPER_3(rclw, tl, env, tl, tl)
diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c
index b391b69635..74cf86c986 100644
--- a/target/i386/tcg/fpu_helper.c
+++ b/target/i386/tcg/fpu_helper.c
@@ -3053,3 +3053,6 @@ void helper_movq(CPUX86State *env, void *d, void *s)
 
 #define SHIFT 1
 #include "ops_sse.h"
+
+#define SHIFT 2
+#include "ops_sse.h"
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 278ed8ed1c..bcd6d47fd0 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2742,6 +2742,29 @@ static inline void gen_ldo_env_A0(DisasContext *s, int 
offset)
 tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1)));
 }
 
+static inline void gen_ldo_env_A0_ymmh(DisasContext *s, int offset)
+{
+int mem_index = s->mem_index;
+tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ);
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2)));
+tcg_gen_addi_tl(s->tmp0, s->A0, 8);
+tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3)));
+}
+
+/* Load 256-bit ymm register value */
+static inline void gen_ldy_env_A0(DisasContext *s, int offset)
+{
+int mem_index = s->mem_index;
+gen_ldo_env_A0(s, offset);
+tcg_gen_addi_tl(s->tmp0, s->A0, 16);
+tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2)));
+tcg_gen_addi_tl(s->tmp0, s->A0, 24);
+tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3)));
+}
+
 static inline void gen_sto_env_A0(DisasContext *s, int offset)
 {
 int mem_index = s->mem_index;
@@ -2752,6 +2775,29 @@ static inline void gen_sto_env_A0(DisasContext *s, int 
offset)
 tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
 }
 
+static inline void gen_sto_env_A0_ymmh(DisasContext *s, int offset)
+{
+int mem_index = s->mem_index;
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2)));
+tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ);
+tcg_gen_addi_tl(s->tmp0, s->A0, 8);
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3)));
+tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+}
+
+/* Store 256-bit ymm register value */
+static inline void gen_sty_env_A0(DisasContext *s, int offset)
+{
+int mem_index = s->mem_index;
+gen_sto_env_A0(s, offset);
+tcg_gen_addi_tl(s->tmp0, s->A0, 16);
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2)));
+tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+tcg_gen_addi_tl(s->tmp0, s->A0, 24);
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3)));
+tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+}
+
 static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset)
 {
 tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, 
ZMM_Q(0)));
@@ -2760,6 +2806,14 @@ static inline void gen_op_movo(DisasContext *s, int 
d_offset, int s_offset)
 tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, 
ZMM_Q(1)));
 }
 
+static inline void gen_op_movo_ymmh(DisasContext *s, int d_offset, int 
s_offset)
+{
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, 
ZMM_Q(2)));
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, 
ZMM_Q(2)

[PATCH v2 15/42] i386: Floating point arithmetic helper AVX prep

2022-04-24 Thread Paul Brook
Prepare the "easy" floating point vector helpers for AVX

No functional changes to existing helpers.

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 144 ++
 1 file changed, 119 insertions(+), 25 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index fc8fd57aa5..d308a1ec40 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -762,40 +762,66 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int 
order)
 }
 #endif
 
-#if SHIFT == 1
+#if SHIFT >= 1
 /* FPU ops */
 /* XXX: not accurate */
 
-#define SSE_HELPER_S(name, F)   \
-void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\
+#define SSE_HELPER_P(name, F)   \
+void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,  \
+Reg *d, Reg *s) \
 {   \
-d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));  \
-d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1));  \
-d->ZMM_S(2) = F(32, d->ZMM_S(2), s->ZMM_S(2));  \
-d->ZMM_S(3) = F(32, d->ZMM_S(3), s->ZMM_S(3));  \
+Reg *v = d; \
+d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0));  \
+d->ZMM_S(1) = F(32, v->ZMM_S(1), s->ZMM_S(1));  \
+d->ZMM_S(2) = F(32, v->ZMM_S(2), s->ZMM_S(2));  \
+d->ZMM_S(3) = F(32, v->ZMM_S(3), s->ZMM_S(3));  \
+YMM_ONLY(   \
+d->ZMM_S(4) = F(32, v->ZMM_S(4), s->ZMM_S(4));  \
+d->ZMM_S(5) = F(32, v->ZMM_S(5), s->ZMM_S(5));  \
+d->ZMM_S(6) = F(32, v->ZMM_S(6), s->ZMM_S(6));  \
+d->ZMM_S(7) = F(32, v->ZMM_S(7), s->ZMM_S(7));  \
+)   \
 }   \
 \
-void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\
+void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,  \
+Reg *d, Reg *s) \
 {   \
-d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));  \
-}   \
+Reg *v = d; \
+d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0));  \
+d->ZMM_D(1) = F(64, v->ZMM_D(1), s->ZMM_D(1));  \
+YMM_ONLY(   \
+d->ZMM_D(2) = F(64, v->ZMM_D(2), s->ZMM_D(2));  \
+d->ZMM_D(3) = F(64, v->ZMM_D(3), s->ZMM_D(3));  \
+)   \
+}
+
+#if SHIFT == 1
+
+#define SSE_HELPER_S(name, F)   \
+SSE_HELPER_P(name, F)   \
 \
-void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\
+void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\
 {   \
-d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));  \
-d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1));  \
+Reg *v = d; \
+d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0));  \
 }   \
 \
-void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\
+void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\
 {   \
-d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));  \
+Reg *v = d; \
+d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0));  \
 }
 
+#else
+
+#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)
+
+#endif
+
 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, >sse_st

[PATCH v2 26/42] i386: Utility function for 128 bit AVX

2022-04-24 Thread Paul Brook
VEX encoded instructions that write to a (128 bit) xmm register clear the
rest (upper half) of the corresponding (256 bit) ymm register.
When legacy SSE encodings are used the rest of the ymm register is left
unchanged.

Add a utility function so that we don't have to keep duplicating this logic.

Signed-off-by: Paul Brook 
---
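Aside: the behavioural rule gen_clear_ymmh() captures, as a stand-alone model
(illustrative types, not QEMU's):

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct { uint64_t q[4]; } YMMVal;

    static void write_xmm_result(YMMVal *r, uint64_t lo, uint64_t hi,
                                 bool vex_encoded)
    {
        r->q[0] = lo;
        r->q[1] = hi;
        if (vex_encoded) {      /* VEX.128: upper half of the ymm is zeroed */
            r->q[2] = 0;
            r->q[3] = 0;
        }                       /* legacy SSE: q[2]/q[3] left unchanged */
    }
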
 target/i386/tcg/translate.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index d148a2319d..278ed8ed1c 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2780,6 +2780,18 @@ static inline void gen_op_movq_env_0(DisasContext *s, 
int d_offset)
 
 #define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg])
 
+/*
+ * Clear the top half of the ymm register after a VEX.128 instruction
+ * This could be optimized by tracking this in env->hflags
+ */
+static void gen_clear_ymmh(DisasContext *s, int reg)
+{
+if (s->prefix & PREFIX_VEX) {
+gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(2)));
+gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(3)));
+}
+}
+
 typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
 typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
 typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val);
-- 
2.36.0




[PATCH v2 25/42] i386: VEX.V encodings (3 operand)

2022-04-24 Thread Paul Brook
Enable translation of VEX encoded AVX instructions.

The big change is the addition of an extra register operand in the VEX.V
field.  This is usually (but not always!) used to explicitly encode the
first source operand.

The changes to ops_sse.h and ops_sse_header.h are purely mechanical, with
previous changes ensuring that the relevant helper functions are ready to
handle the non-destructive source operand.

We now have a greater variety of operand patterns for the vector helper
functions. The SSE_OPF_* flags we added to the opcode lookup tables are used
to select between these. This includes e.g. the pshufX and cmpX instructions,
which were previously overridden by opcode.

One gotcha is the "scalar" vector instructions. The SSE encodings write a
single element to the destination and leave the remainder of the register
unchanged.  The VEX encodings copy the remainder of the destination from the
first source operand. If the operation only has a single source value, then
VEX.V encodes an additional operand which is copied to the remainder of the
destination.

Signed-off-by: Paul Brook 
---
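Aside: the scalar merge rules described above, modelled on an addss-style
operation with native floats (the real helpers go through softfloat; names
here are illustrative):

    typedef struct { float s[4]; } XMMF;

    static void addss_sse(XMMF *d, const XMMF *s)
    {
        d->s[0] += s->s[0];              /* lanes 1..3 keep the old d */
    }

    static void vaddss_avx(XMMF *d, const XMMF *v, const XMMF *s)
    {
        d->s[0] = v->s[0] + s->s[0];
        for (int i = 1; i < 4; i++) {
            d->s[i] = v->s[i];           /* lanes 1..3 copied from VEX.V */
        }
    }
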
 target/i386/ops_sse.h| 214 +--
 target/i386/ops_sse_header.h | 149 ++---
 target/i386/tcg/translate.c  | 399 +--
 3 files changed, 463 insertions(+), 299 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index e48dfc2fc5..ad3312d353 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -97,9 +97,8 @@
 #define FPSLL(x, c) ((x) << shift)
 #endif
 
-void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 15) {
 d->Q(0) = 0;
@@ -114,9 +113,8 @@ void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 }
 }
 
-void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 15) {
 d->Q(0) = 0;
@@ -131,9 +129,8 @@ void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 }
 }
 
-void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 15) {
 shift = 15;
@@ -143,9 +140,8 @@ void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRAW);
 }
 
-void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 31) {
 d->Q(0) = 0;
@@ -160,9 +156,8 @@ void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 }
 }
 
-void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 31) {
 d->Q(0) = 0;
@@ -177,9 +172,8 @@ void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 }
 }
 
-void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 31) {
 shift = 31;
@@ -189,9 +183,8 @@ void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 SHIFT_HELPER_BODY(2 << SHIFT, L, FPSRAL);
 }
 
-void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 63) {
 d->Q(0) = 0;
@@ -206,9 +199,8 @@ void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 }
 }
 
-void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift;
 if (c->Q(0) > 63) {
 d->Q(0) = 0;
@@ -224,9 +216,8 @@ void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 }
 
 #if SHIFT >= 1
-void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift, i;
 
 shift = c->L(0);
@@ -249,9 +240,8 @@ void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 #endif
 }
 
-void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
+void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
 {
-Reg *s = d;
 int shift, i;
 
 shift = c->L(0);
@@ -321,9 +311,8 @@ void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *c)
 }
 
 #define SSE_HELPER_B(name, F)

[PATCH v2 36/42] i386: Implement VINSERT128/VEXTRACT128

2022-04-24 Thread Paul Brook
128-bit vinsert/vextract instructions. The integer and floating point variants
have the same semantics.

This is where we encounter an instruction encoded with VEX.L == 1 and
a 128 bit (xmm) destination operand.

Signed-off-by: Paul Brook 
---
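Aside: the data movement in miniature: immediate bit 0 selects which 128-bit
half of the ymm register is written (insert) or read (extract). Illustrative
model, not the TCG code:

    #include <stdint.h>

    typedef struct { uint64_t q[4]; } YMMVal;

    static void vinsert128(YMMVal *d, const YMMVal *v, const uint64_t src[2],
                           int imm)
    {
        int h = (imm & 1) * 2;       /* 0 = low half, 2 = high half */

        *d = *v;
        d->q[h] = src[0];
        d->q[h + 1] = src[1];
    }

    static void vextract128(uint64_t dst[2], const YMMVal *s, int imm)
    {
        int h = (imm & 1) * 2;

        dst[0] = s->q[h];
        dst[1] = s->q[h + 1];
    }
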
 target/i386/tcg/translate.c | 78 +
 1 file changed, 78 insertions(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 5a11d3c083..4072fa28d3 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2814,6 +2814,24 @@ static inline void gen_op_movo_ymmh(DisasContext *s, int 
d_offset, int s_offset)
 tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, 
ZMM_Q(3)));
 }
 
+static inline void gen_op_movo_ymm_l2h(DisasContext *s,
+   int d_offset, int s_offset)
+{
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, 
ZMM_Q(0)));
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, 
ZMM_Q(2)));
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, 
ZMM_Q(1)));
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, 
ZMM_Q(3)));
+}
+
+static inline void gen_op_movo_ymm_h2l(DisasContext *s,
+   int d_offset, int s_offset)
+{
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, 
ZMM_Q(2)));
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, 
ZMM_Q(0)));
+tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, 
ZMM_Q(3)));
+tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, 
ZMM_Q(1)));
+}
+
 static inline void gen_op_movq(DisasContext *s, int d_offset, int s_offset)
 {
 tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset);
@@ -3353,9 +3371,13 @@ static const struct SSEOpHelper_table7 
sse_op_table7[256] = {
 [0x15] = SPECIAL_OP(SSE41), /* pextrw */
 [0x16] = SPECIAL_OP(SSE41), /* pextrd/pextrq */
 [0x17] = SPECIAL_OP(SSE41), /* extractps */
+[0x18] = SPECIAL_OP(AVX), /* vinsertf128 */
+[0x19] = SPECIAL_OP(AVX), /* vextractf128 */
 [0x20] = SPECIAL_OP(SSE41), /* pinsrb */
 [0x21] = SPECIAL_OP(SSE41), /* insertps */
 [0x22] = SPECIAL_OP(SSE41), /* pinsrd/pinsrq */
+[0x38] = SPECIAL_OP(AVX), /* vinserti128 */
+[0x39] = SPECIAL_OP(AVX), /* vextracti128 */
 [0x40] = BINARY_OP(dpps, SSE41, 0),
 #define gen_helper_dppd_ymm NULL
 [0x41] = BINARY_OP(dppd, SSE41, 0),
@@ -5145,6 +5167,62 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 }
 gen_clear_ymmh(s, reg);
 break;
+case 0x38: /* vinserti128 */
+CHECK_AVX2_256(s);
+/* fall through */
+case 0x18: /* vinsertf128 */
+CHECK_AVX(s);
+if ((s->prefix & PREFIX_VEX) == 0 || s->vex_l == 0) {
+goto illegal_op;
+}
+if (mod == 3) {
+if (val & 1) {
+gen_op_movo_ymm_l2h(s, ZMM_OFFSET(reg),
+ZMM_OFFSET(rm));
+} else {
+gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(rm));
+}
+} else {
+if (val & 1) {
+gen_ldo_env_A0_ymmh(s, ZMM_OFFSET(reg));
+} else {
+gen_ldo_env_A0(s, ZMM_OFFSET(reg));
+}
+}
+if (reg != reg_v) {
+if (val & 1) {
+gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(reg_v));
+} else {
+gen_op_movo_ymmh(s, ZMM_OFFSET(reg),
+ ZMM_OFFSET(reg_v));
+}
+}
+break;
+case 0x39: /* vextracti128 */
+CHECK_AVX2_256(s);
+/* fall through */
+case 0x19: /* vextractf128 */
+CHECK_AVX_V0(s);
+if ((s->prefix & PREFIX_VEX) == 0 || s->vex_l == 0) {
+goto illegal_op;
+}
+if (mod == 3) {
+op1_offset = ZMM_OFFSET(rm);
+if (val & 1) {
+gen_op_movo_ymm_h2l(s, ZMM_OFFSET(rm),
+ZMM_OFFSET(reg));
+} else {
+gen_op_movo(s, ZMM_OFFSET(rm), ZMM_OFFSET(reg));
+}
+gen_clear_ymmh(s, rm);
+} else{
+if (val & 1

[PATCH v2 21/42] i386: AVX+AES helpers

2022-04-24 Thread Paul Brook
Make the AES vector helpers AVX ready

No functional changes to existing helpers

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h| 63 ++--
 target/i386/ops_sse_header.h | 55 ++-
 2 files changed, 85 insertions(+), 33 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index b7100fdce1..48cec40074 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2929,64 +2929,92 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, 
Reg *d, Reg *s,
 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 int i;
-Reg st = *d;
+Reg st = *d; // v
 Reg rk = *s;
 
 for (i = 0 ; i < 4 ; i++) {
-d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^
-AES_Td1[st.B(AES_ishifts[4*i+1])] ^
-AES_Td2[st.B(AES_ishifts[4*i+2])] ^
-AES_Td3[st.B(AES_ishifts[4*i+3])]);
+d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * i + 0])] ^
+AES_Td1[st.B(AES_ishifts[4 * i + 1])] ^
+AES_Td2[st.B(AES_ishifts[4 * i + 2])] ^
+AES_Td3[st.B(AES_ishifts[4 * i + 3])]);
 }
+#if SHIFT == 2
+for (i = 0 ; i < 4 ; i++) {
+d->L(i + 4) = rk.L(i + 4) ^ bswap32(
+AES_Td0[st.B(AES_ishifts[4 * i + 0] + 16)] ^
+AES_Td1[st.B(AES_ishifts[4 * i + 1] + 16)] ^
+AES_Td2[st.B(AES_ishifts[4 * i + 2] + 16)] ^
+AES_Td3[st.B(AES_ishifts[4 * i + 3] + 16)]);
+}
+#endif
 }
 
 void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 int i;
-Reg st = *d;
+Reg st = *d; // v
 Reg rk = *s;
 
 for (i = 0; i < 16; i++) {
 d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]);
 }
+#if SHIFT == 2
+for (i = 0; i < 16; i++) {
+d->B(i + 16) = rk.B(i + 16) ^ (AES_isbox[st.B(AES_ishifts[i] + 16)]);
+}
+#endif
 }
 
 void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 int i;
-Reg st = *d;
+Reg st = *d; // v
 Reg rk = *s;
 
 for (i = 0 ; i < 4 ; i++) {
-d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^
-AES_Te1[st.B(AES_shifts[4*i+1])] ^
-AES_Te2[st.B(AES_shifts[4*i+2])] ^
-AES_Te3[st.B(AES_shifts[4*i+3])]);
+d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * i + 0])] ^
+AES_Te1[st.B(AES_shifts[4 * i + 1])] ^
+AES_Te2[st.B(AES_shifts[4 * i + 2])] ^
+AES_Te3[st.B(AES_shifts[4 * i + 3])]);
 }
+#if SHIFT == 2
+for (i = 0 ; i < 4 ; i++) {
+d->L(i + 4) = rk.L(i + 4) ^ bswap32(
+AES_Te0[st.B(AES_shifts[4 * i + 0] + 16)] ^
+AES_Te1[st.B(AES_shifts[4 * i + 1] + 16)] ^
+AES_Te2[st.B(AES_shifts[4 * i + 2] + 16)] ^
+AES_Te3[st.B(AES_shifts[4 * i + 3] + 16)]);
+}
+#endif
 }
 
 void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 int i;
-Reg st = *d;
+Reg st = *d; // v
 Reg rk = *s;
 
 for (i = 0; i < 16; i++) {
 d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]);
 }
-
+#if SHIFT == 2
+for (i = 0; i < 16; i++) {
+d->B(i + 16) = rk.B(i + 16) ^ (AES_sbox[st.B(AES_shifts[i] + 16)]);
+}
+#endif
 }
 
+#if SHIFT == 1
 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
 int i;
 Reg tmp = *s;
 
 for (i = 0 ; i < 4 ; i++) {
-d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^
-  AES_imc[tmp.B(4*i+1)][1] ^
-  AES_imc[tmp.B(4*i+2)][2] ^
-  AES_imc[tmp.B(4*i+3)][3]);
+d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
+  AES_imc[tmp.B(4 * i + 1)][1] ^
+  AES_imc[tmp.B(4 * i + 2)][2] ^
+  AES_imc[tmp.B(4 * i + 3)][3]);
 }
 }
 
@@ -3004,6 +3032,7 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State 
*env, Reg *d, Reg *s,
 d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
 }
 #endif
+#endif
 
 #undef SSE_HELPER_S
 
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index b8b0666f61..203afbb5a1 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -47,7 +47,7 @@ DEF_HELPER_3(glue(pslld, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(psrlq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(psllq, SUFFIX), void, env, Reg, Reg)
 
-#if SHIFT == 1
+#if SHIFT >= 1
 DEF_HELPER_3(glue(psrldq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue

[PATCH v2 35/42] i386: Implement VPERM

2022-04-24 Thread Paul Brook
A set of shuffle operations that operate on complete 256 bit registers.
The integer and floating point variants have identical semantics.

Signed-off-by: Paul Brook 
---
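Aside: vpermq in miniature: each 2-bit immediate field selects the source
qword for the corresponding destination qword (this mirrors helper_vpermq_ymm
below; vpermd does the same with dwords and a register-supplied index):

    #include <stdint.h>

    static void vpermq_model(uint64_t d[4], const uint64_t s[4], uint8_t order)
    {
        uint64_t r[4];

        for (int i = 0; i < 4; i++) {
            r[i] = s[(order >> (2 * i)) & 3];
        }
        for (int i = 0; i < 4; i++) {
            d[i] = r[i];         /* copy after selecting, so d may alias s */
        }
    }
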
 target/i386/ops_sse.h| 73 
 target/i386/ops_sse_header.h |  3 ++
 target/i386/tcg/translate.c  |  9 +
 3 files changed, 85 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 14a2d1bf78..04d2006cd8 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3407,6 +3407,79 @@ void helper_vzeroupper_hi8(CPUX86State *env)
 }
 }
 #endif
+
+void helper_vpermdq_ymm(CPUX86State *env,
+Reg *d, Reg *v, Reg *s, uint32_t order)
+{
+uint64_t r0, r1, r2, r3;
+
+switch (order & 3) {
+case 0:
+r0 = v->Q(0);
+r1 = v->Q(1);
+break;
+case 1:
+r0 = v->Q(2);
+r1 = v->Q(3);
+break;
+case 2:
+r0 = s->Q(0);
+r1 = s->Q(1);
+break;
+case 3:
+r0 = s->Q(2);
+r1 = s->Q(3);
+break;
+}
+switch ((order >> 4) & 3) {
+case 0:
+r2 = v->Q(0);
+r3 = v->Q(1);
+break;
+case 1:
+r2 = v->Q(2);
+r3 = v->Q(3);
+break;
+case 2:
+r2 = s->Q(0);
+r3 = s->Q(1);
+break;
+case 3:
+r2 = s->Q(2);
+r3 = s->Q(3);
+break;
+}
+d->Q(0) = r0;
+d->Q(1) = r1;
+d->Q(2) = r2;
+d->Q(3) = r3;
+}
+
+void helper_vpermq_ymm(CPUX86State *env, Reg *d, Reg *s, uint32_t order)
+{
+uint64_t r0, r1, r2, r3;
+r0 = s->Q(order & 3);
+r1 = s->Q((order >> 2) & 3);
+r2 = s->Q((order >> 4) & 3);
+r3 = s->Q((order >> 6) & 3);
+d->Q(0) = r0;
+d->Q(1) = r1;
+d->Q(2) = r2;
+d->Q(3) = r3;
+}
+
+void helper_vpermd_ymm(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+uint32_t r[8];
+int i;
+
+for (i = 0; i < 8; i++) {
+r[i] = s->L(v->L(i) & 7);
+}
+for (i = 0; i < 8; i++) {
+d->L(i) = r[i];
+}
+}
 #endif
 #endif
 
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index e5d8ea9bb7..099e6e8ffc 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -457,6 +457,9 @@ DEF_HELPER_1(vzeroupper, void, env)
 DEF_HELPER_1(vzeroall_hi8, void, env)
 DEF_HELPER_1(vzeroupper_hi8, void, env)
 #endif
+DEF_HELPER_5(vpermdq_ymm, void, env, Reg, Reg, Reg, i32)
+DEF_HELPER_4(vpermq_ymm, void, env, Reg, Reg, i32)
+DEF_HELPER_4(vpermd_ymm, void, env, Reg, Reg, Reg)
 #endif
 #endif
 
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index fe1ab58d07..5a11d3c083 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3258,6 +3258,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] 
= {
 [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX),
 [0x14] = BLENDV_OP(blendvps, SSE41, 0),
 [0x15] = BLENDV_OP(blendvpd, SSE41, 0),
+#define gen_helper_vpermd_xmm NULL
+[0x16] = BINARY_OP(vpermd, AVX, SSE_OPF_AVX2), /* vpermps */
 [0x17] = CMP_OP(ptest, SSE41),
 /* TODO:Some vbroadcast variants require AVX2 */
 [0x18] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR), /* vbroadcastss */
@@ -3287,6 +3289,7 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] 
= {
 [0x33] = UNARY_OP(pmovzxwd, SSE41, SSE_OPF_MMX),
 [0x34] = UNARY_OP(pmovzxwq, SSE41, SSE_OPF_MMX),
 [0x35] = UNARY_OP(pmovzxdq, SSE41, SSE_OPF_MMX),
+[0x36] = BINARY_OP(vpermd, AVX, SSE_OPF_AVX2), /* vpermd */
 [0x37] = BINARY_OP(pcmpgtq, SSE41, SSE_OPF_MMX),
 [0x38] = BINARY_OP(pminsb, SSE41, SSE_OPF_MMX),
 [0x39] = BINARY_OP(pminsd, SSE41, SSE_OPF_MMX),
@@ -3329,8 +3332,13 @@ static const struct SSEOpHelper_table6 
sse_op_table6[256] = {
 
 /* prefix [66] 0f 3a */
 static const struct SSEOpHelper_table7 sse_op_table7[256] = {
+#define gen_helper_vpermq_xmm NULL
+[0x00] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2),
+[0x01] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), /* vpermpd */
 [0x04] = UNARY_OP(vpermilps_imm, AVX, 0),
 [0x05] = UNARY_OP(vpermilpd_imm, AVX, 0),
+#define gen_helper_vpermdq_xmm NULL
+[0x06] = BINARY_OP(vpermdq, AVX, 0), /* vperm2f128 */
 [0x08] = UNARY_OP(roundps, SSE41, 0),
 [0x09] = UNARY_OP(roundpd, SSE41, 0),
 #define gen_helper_roundss_ymm NULL
@@ -3353,6 +3361,7 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] 
= {
 [0x41] = BINARY_OP(dppd, SSE41, 0),
 [0x42] = BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX),
 [0x44] = BINARY_OP(pclmulqdq, PCLMULQDQ, 0),
+[0x46] = BINARY_OP(vpermdq, AVX, SSE_OPF_AVX2), /* vperm2i128 */
 #define gen_helper_pcmpestrm_ymm NULL
 [0x60] = CMP_OP(pcmpestrm, SSE42),
 #define gen_helper_pcmpestri_ymm NULL
-- 
2.36.0




[PATCH v2 39/42] i386: Enable AVX cpuid bits when using TCG

2022-04-24 Thread Paul Brook
Include AVX and AVX2 in the guest cpuid features supported by TCG

Signed-off-by: Paul Brook 
---
 target/i386/cpu.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 99343be926..bd35233d5b 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,12 +625,12 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
   CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
   CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */   \
   CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
-  CPUID_EXT_RDRAND)
+  CPUID_EXT_RDRAND | CPUID_EXT_AVX)
   /* missing:
   CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
   CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
   CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
-  CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_AVX,
+  CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER,
   CPUID_EXT_F16C */
 
 #ifdef TARGET_X86_64
@@ -653,9 +653,9 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
   CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \
   CPUID_7_0_EBX_PCOMMIT | CPUID_7_0_EBX_CLFLUSHOPT |\
   CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_MPX | CPUID_7_0_EBX_FSGSBASE | \
-  CPUID_7_0_EBX_ERMS)
+  CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_AVX2)
   /* missing:
-  CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2,
+  CPUID_7_0_EBX_HLE
   CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM,
   CPUID_7_0_EBX_RDSEED */
 #define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | \
-- 
2.36.0




[PATCH v2 42/42] i386: Add sha512-avx test

2022-04-24 Thread Paul Brook
Include sha512 built with avx[2] in the tcg tests.

Signed-off-by: Paul Brook 
---
 tests/tcg/i386/Makefile.target | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target
index eb06f7eb89..a0335fff6d 100644
--- a/tests/tcg/i386/Makefile.target
+++ b/tests/tcg/i386/Makefile.target
@@ -79,7 +79,14 @@ sha512-sse: sha512.c
 run-sha512-sse: QEMU_OPTS+=-cpu max
 run-plugin-sha512-sse-with-%: QEMU_OPTS+=-cpu max
 
-TESTS+=sha512-sse
+sha512-avx: CFLAGS=-mavx2 -mavx -O3
+sha512-avx: sha512.c
+   $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
+
+run-sha512-avx: QEMU_OPTS+=-cpu max
+run-plugin-sha512-avx-with-%: QEMU_OPTS+=-cpu max
+
+TESTS+=sha512-sse sha512-avx
 
 test-avx.h: test-avx.py x86.csv
$(PYTHON) $(I386_SRC)/test-avx.py $(I386_SRC)/x86.csv $@
-- 
2.36.0




[PATCH v2 16/42] i386: Dot product AVX helper prep

2022-04-24 Thread Paul Brook
Make the dpps and dppd helpers AVX-ready

I can't see any obvious reason why dppd shouldn't work on 256 bit ymm
registers, but both AMD and Intel agree that it's xmm only.

Signed-off-by: Paul Brook 
---
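Aside: the DPPS mask convention in miniature: immediate bits 4-7 choose which
products enter the sum, bits 0-3 choose which destination lanes receive it.
Native floats for brevity; the helper itself uses softfloat and rounds the
intermediate sums pairwise:

    static void dpps_lane_model(float d[4], const float v[4], const float s[4],
                                unsigned mask)
    {
        float sum = 0.0f;

        for (int i = 0; i < 4; i++) {
            if (mask & (1u << (4 + i))) {
                sum += v[i] * s[i];
            }
        }
        for (int i = 0; i < 4; i++) {
            d[i] = (mask & (1u << i)) ? sum : 0.0f;
        }
    }
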
 target/i386/ops_sse.h | 54 ---
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index d308a1ec40..4137e6e1fa 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2366,8 +2366,10 @@ SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
 SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
 SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
 
-void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
+void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+   uint32_t mask)
 {
+Reg *v = d;
 float32 prod, iresult, iresult2;
 
 /*
@@ -2375,23 +2377,23 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s, uint32_t mask)
  * to correctly round the intermediate results
  */
 if (mask & (1 << 4)) {
-iresult = float32_mul(d->ZMM_S(0), s->ZMM_S(0), >sse_status);
+iresult = float32_mul(v->ZMM_S(0), s->ZMM_S(0), >sse_status);
 } else {
 iresult = float32_zero;
 }
 if (mask & (1 << 5)) {
-prod = float32_mul(d->ZMM_S(1), s->ZMM_S(1), >sse_status);
+prod = float32_mul(v->ZMM_S(1), s->ZMM_S(1), >sse_status);
 } else {
 prod = float32_zero;
 }
 iresult = float32_add(iresult, prod, >sse_status);
 if (mask & (1 << 6)) {
-iresult2 = float32_mul(d->ZMM_S(2), s->ZMM_S(2), >sse_status);
+iresult2 = float32_mul(v->ZMM_S(2), s->ZMM_S(2), >sse_status);
 } else {
 iresult2 = float32_zero;
 }
 if (mask & (1 << 7)) {
-prod = float32_mul(d->ZMM_S(3), s->ZMM_S(3), >sse_status);
+prod = float32_mul(v->ZMM_S(3), s->ZMM_S(3), >sse_status);
 } else {
 prod = float32_zero;
 }
@@ -2402,26 +2404,62 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s, uint32_t mask)
 d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero;
 d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero;
 d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero;
+#if SHIFT == 2
+if (mask & (1 << 4)) {
+iresult = float32_mul(v->ZMM_S(4), s->ZMM_S(4), >sse_status);
+} else {
+iresult = float32_zero;
+}
+if (mask & (1 << 5)) {
+prod = float32_mul(v->ZMM_S(5), s->ZMM_S(5), >sse_status);
+} else {
+prod = float32_zero;
+}
+iresult = float32_add(iresult, prod, >sse_status);
+if (mask & (1 << 6)) {
+iresult2 = float32_mul(v->ZMM_S(6), s->ZMM_S(6), >sse_status);
+} else {
+iresult2 = float32_zero;
+}
+if (mask & (1 << 7)) {
+prod = float32_mul(v->ZMM_S(7), s->ZMM_S(7), >sse_status);
+} else {
+prod = float32_zero;
+}
+iresult2 = float32_add(iresult2, prod, >sse_status);
+iresult = float32_add(iresult, iresult2, >sse_status);
+
+d->ZMM_S(4) = (mask & (1 << 0)) ? iresult : float32_zero;
+d->ZMM_S(5) = (mask & (1 << 1)) ? iresult : float32_zero;
+d->ZMM_S(6) = (mask & (1 << 2)) ? iresult : float32_zero;
+d->ZMM_S(7) = (mask & (1 << 3)) ? iresult : float32_zero;
+#endif
 }
 
-void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
+#if SHIFT == 1
+/* Oddly, there is no ymm version of dppd */
+void glue(helper_dppd, SUFFIX)(CPUX86State *env,
+   Reg *d, Reg *s, uint32_t mask)
 {
+Reg *v = d;
 float64 iresult;
 
 if (mask & (1 << 4)) {
-iresult = float64_mul(d->ZMM_D(0), s->ZMM_D(0), >sse_status);
+iresult = float64_mul(v->ZMM_D(0), s->ZMM_D(0), >sse_status);
 } else {
 iresult = float64_zero;
 }
+
 if (mask & (1 << 5)) {
 iresult = float64_add(iresult,
-  float64_mul(d->ZMM_D(1), s->ZMM_D(1),
+  float64_mul(v->ZMM_D(1), s->ZMM_D(1),
   >sse_status),
   >sse_status);
 }
 d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero;
 d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero;
 }
+#endif
 
 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   uint32_t offset)
-- 
2.36.0




[PATCH v2 34/42] i386: Implement VGATHER

2022-04-24 Thread Paul Brook
These are gather load instructions that need to introduce a new "Vector SIB"
encoding.  Also a bit of hair to handle different index sizes and scaling
factors, but overall the combinatorial explosion doesn't end up too bad.

The other thing of note is probably that these also modify the mask operand.
Thankfully the operands may not overlap, and we do not have to make the whole
thing appear atomic.

Signed-off-by: Paul Brook 
---
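Aside: a gather in miniature (dword elements, dword indices): a lane loads
only when its mask sign bit is set, and every mask lane is cleared afterwards.
Illustrative model over host memory, not the cpu_ldl_data_ra() path:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void gather_dd_model(uint32_t *d, uint32_t *mask,
                                const int32_t *index, const uint8_t *base,
                                int scale, size_t lanes)
    {
        for (size_t i = 0; i < lanes; i++) {
            if (mask[i] >> 31) {
                const uint8_t *p = base + ((int64_t)index[i] << scale);
                memcpy(&d[i], p, sizeof(d[i]));
            }
            mask[i] = 0;
        }
    }
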
 target/i386/ops_sse.h| 65 +++
 target/i386/ops_sse_header.h | 16 
 target/i386/tcg/translate.c  | 74 
 3 files changed, 155 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index ffcba3d02c..14a2d1bf78 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3288,6 +3288,71 @@ void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, 
Reg *d, Reg *v, Reg *s)
 #endif
 }
 
+#define VGATHER_HELPER(scale)   \
+void glue(helper_vpgatherdd ## scale, SUFFIX)(CPUX86State *env, \
+Reg *d, Reg *v, Reg *s, target_ulong a0)\
+{   \
+int i;  \
+for (i = 0; i < (2 << SHIFT); i++) {\
+if (v->L(i) >> 31) {\
+target_ulong addr = a0  \
++ ((target_ulong)(int32_t)s->L(i) << scale);\
+d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());  \
+}   \
+v->L(i) = 0;\
+}   \
+}   \
+void glue(helper_vpgatherdq ## scale, SUFFIX)(CPUX86State *env, \
+Reg *d, Reg *v, Reg *s, target_ulong a0)\
+{   \
+int i;  \
+for (i = 0; i < (1 << SHIFT); i++) {\
+if (v->Q(i) >> 63) {\
+target_ulong addr = a0  \
++ ((target_ulong)(int32_t)s->L(i) << scale);\
+d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());  \
+}   \
+v->Q(i) = 0;\
+}   \
+}   \
+void glue(helper_vpgatherqd ## scale, SUFFIX)(CPUX86State *env, \
+Reg *d, Reg *v, Reg *s, target_ulong a0)\
+{   \
+int i;  \
+for (i = 0; i < (1 << SHIFT); i++) {\
+if (v->L(i) >> 31) {\
+target_ulong addr = a0  \
++ ((target_ulong)(int64_t)s->Q(i) << scale);\
+d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());  \
+}   \
+v->L(i) = 0;\
+}   \
+d->Q(SHIFT) = 0;\
+v->Q(SHIFT) = 0;\
+YMM_ONLY(   \
+d->Q(3) = 0;\
+v->Q(3) = 0;\
+)   \
+}   \
+void glue(helper_vpgatherqq ## scale, SUFFIX)(CPUX86State *env, \
+Reg *d, Reg *v, Reg *s, target_ulong a0)\
+{   \
+int i;  \
+for (i = 0; i < (1 << SHIFT); i++) {\
+if (v->Q(i) >> 63) {\
+target_ulong addr = a0  \
++ ((target_ulong)(int64_t)s->Q(i) << scale);\
+d

[PATCH v2 23/42] i386: AVX comparison helpers

2022-04-24 Thread Paul Brook
AVX includes a more extensive set of comparison predicates, some of which
our softfloat implementation does not expose directly. Rewrite the helpers
in terms of floatN_compare.

Signed-off-by: Paul Brook 
---
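Aside: how a predicate falls out of a single floatN_compare()-style result; a
stand-alone sketch with a local relation enum standing in for softfloat's:

    #include <stdint.h>

    typedef enum { REL_LT, REL_EQ, REL_GT, REL_UNORDERED } Relation;

    static uint32_t cmp_le(Relation r)    /* CMPLEPS lane */
    {
        return (r == REL_LT || r == REL_EQ) ? -1u : 0;
    }

    static uint32_t cmp_nlt(Relation r)   /* CMPNLTPS lane: NaN -> true */
    {
        return r != REL_LT ? -1u : 0;
    }

    static uint32_t cmp_unord(Relation r) /* CMPUNORDPS lane */
    {
        return r == REL_UNORDERED ? -1u : 0;
    }
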
 target/i386/ops_sse.h| 149 ---
 target/i386/ops_sse_header.h |  47 ---
 target/i386/tcg/translate.c  |  49 +---
 3 files changed, 177 insertions(+), 68 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 48cec40074..e48dfc2fc5 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -1394,57 +1394,112 @@ void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, 
Reg *d, Reg *s)
 #endif
 }
 
-/* XXX: unordered */
-#define SSE_HELPER_CMP(name, F) \
-void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\
-{   \
-d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));  \
-d->ZMM_L(1) = F(32, d->ZMM_S(1), s->ZMM_S(1));  \
-d->ZMM_L(2) = F(32, d->ZMM_S(2), s->ZMM_S(2));  \
-d->ZMM_L(3) = F(32, d->ZMM_S(3), s->ZMM_S(3));  \
-}   \
-\
-void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\
-{   \
-d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));  \
-}   \
-\
-void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)\
+#define SSE_HELPER_CMP_P(name, F, C)\
+void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,  \
+ Reg *d, Reg *s)\
 {   \
-d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));  \
-d->ZMM_Q(1) = F(64, d->ZMM_D(1), s->ZMM_D(1));  \
+Reg *v = d; \
+d->ZMM_L(0) = F(32, C, v->ZMM_S(0), s->ZMM_S(0));   \
+d->ZMM_L(1) = F(32, C, v->ZMM_S(1), s->ZMM_S(1));   \
+d->ZMM_L(2) = F(32, C, v->ZMM_S(2), s->ZMM_S(2));   \
+d->ZMM_L(3) = F(32, C, v->ZMM_S(3), s->ZMM_S(3));   \
+YMM_ONLY(   \
+d->ZMM_L(4) = F(32, C, v->ZMM_S(4), s->ZMM_S(4));   \
+d->ZMM_L(5) = F(32, C, v->ZMM_S(5), s->ZMM_S(5));   \
+d->ZMM_L(6) = F(32, C, v->ZMM_S(6), s->ZMM_S(6));   \
+d->ZMM_L(7) = F(32, C, v->ZMM_S(7), s->ZMM_S(7));   \
+)   \
 }   \
 \
-void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\
+void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,  \
+ Reg *d, Reg *s)\
 {   \
-d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));  \
-}
-
-#define FPU_CMPEQ(size, a, b)   \
-(float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0)
-#define FPU_CMPLT(size, a, b)   \
-(float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0)
-#define FPU_CMPLE(size, a, b)   \
-(float ## size ## _le(a, b, &env->sse_status) ? -1 : 0)
-#define FPU_CMPUNORD(size, a, b)\
-(float ## size ## _unordered_quiet(a, b, &env->sse_status) ? -1 : 0)
-#define FPU_CMPNEQ(size, a, b)  \
-(float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1)
-#define FPU_CMPNLT(size, a, b)  \
-(float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1)
-#define FPU_CMPNLE(size, a, b)  \
-(float ## size ## _le(a, b, &env->sse_status) ? 0 : -1)
-#define FPU_CMPORD(size, a, b)  \
-(float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 0 : -1)
-
-SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
-SSE_HELPER_CMP(cmplt, FPU_CMPLT)
-SSE_HELPER_

[PATCH v2 24/42] i386: Move 3DNOW decoder

2022-04-24 Thread Paul Brook
Handle 3DNOW instructions early to avoid complicating the AVX logic.

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 64f026c0af..6c40df61d4 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3297,6 +3297,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 is_xmm = 1;
 }
 }
+if (sse_op.flags & SSE_OPF_3DNOW) {
+if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) {
+goto illegal_op;
+}
+}
 /* simple MMX/SSE operation */
 if (s->flags & HF_TS_MASK) {
 gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
@@ -4761,21 +4766,20 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 rm = (modrm & 7);
 op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
 }
+if (sse_op.flags & SSE_OPF_3DNOW) {
+/* 3DNow! data insns */
+val = x86_ldub_code(env, s);
+SSEFunc_0_epp op_3dnow = sse_op_table5[val];
+if (!op_3dnow) {
+goto unknown_op;
+}
+tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
+tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
+op_3dnow(cpu_env, s->ptr0, s->ptr1);
+return;
+}
 }
 switch(b) {
-case 0x0f: /* 3DNow! data insns */
-val = x86_ldub_code(env, s);
-sse_fn_epp = sse_op_table5[val];
-if (!sse_fn_epp) {
-goto unknown_op;
-}
-if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) {
-goto illegal_op;
-}
-tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
-tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-sse_fn_epp(cpu_env, s->ptr0, s->ptr1);
-break;
 case 0x70: /* pshufx insn */
 case 0xc6: /* pshufx insn */
 val = x86_ldub_code(env, s);
-- 
2.36.0




[PATCH v2 22/42] i386: Update ops_sse_header.h ready for 256 bit AVX

2022-04-24 Thread Paul Brook
Update ops_sse_header.h ready for 256 bit AVX helpers

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse_header.h | 67 +---
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 203afbb5a1..63b63eb532 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -105,7 +105,7 @@ SSE_HELPER_L(pcmpeql, FCMPEQ)
 
 SSE_HELPER_W(pmullw, FMULLW)
 #if SHIFT == 0
-DEF_HELPER_3(glue(pmulhrw, SUFFIX), FMULHRW)
+DEF_HELPER_3(glue(pmulhrw, SUFFIX), void, env, Reg, Reg)
 #endif
 SSE_HELPER_W(pmulhuw, FMULHUW)
 SSE_HELPER_W(pmulhw, FMULHW)
@@ -137,23 +137,39 @@ DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int)
 /* FPU ops */
 /* XXX: not accurate */
 
-DEF_HELPER_3(glue(shufps, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(shufpd, SUFFIX), void, Reg, Reg, int)
+#define SSE_HELPER_P4(name) \
+DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \
+DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg)
+
+#define SSE_HELPER_P3(name, ...)\
+DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \
+DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg)
 
-#define SSE_HELPER_S(name, F)\
-DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg)\
-DEF_HELPER_3(name ## ss, void, env, Reg, Reg)\
-DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg)\
+#if SHIFT == 1
+#define SSE_HELPER_S4(name) \
+SSE_HELPER_P4(name) \
+DEF_HELPER_3(name ## ss, void, env, Reg, Reg)   \
 DEF_HELPER_3(name ## sd, void, env, Reg, Reg)
+#define SSE_HELPER_S3(name) \
+SSE_HELPER_P3(name) \
+DEF_HELPER_3(name ## ss, void, env, Reg, Reg)   \
+DEF_HELPER_3(name ## sd, void, env, Reg, Reg)
+#else
+#define SSE_HELPER_S4(name, ...) SSE_HELPER_P4(name)
+#define SSE_HELPER_S3(name, ...) SSE_HELPER_P3(name)
+#endif
+
+DEF_HELPER_3(glue(shufps, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(shufpd, SUFFIX), void, Reg, Reg, int)
 
-SSE_HELPER_S(add, FPU_ADD)
-SSE_HELPER_S(sub, FPU_SUB)
-SSE_HELPER_S(mul, FPU_MUL)
-SSE_HELPER_S(div, FPU_DIV)
-SSE_HELPER_S(min, FPU_MIN)
-SSE_HELPER_S(max, FPU_MAX)
-SSE_HELPER_S(sqrt, FPU_SQRT)
+SSE_HELPER_S4(add)
+SSE_HELPER_S4(sub)
+SSE_HELPER_S4(mul)
+SSE_HELPER_S4(div)
+SSE_HELPER_S4(min)
+SSE_HELPER_S4(max)
 
+SSE_HELPER_S3(sqrt)
 
 DEF_HELPER_3(glue(cvtps2pd, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(cvtpd2ps, SUFFIX), void, env, Reg, Reg)
@@ -208,18 +224,12 @@ DEF_HELPER_4(extrq_i, void, env, ZMMReg, int, int)
 DEF_HELPER_3(insertq_r, void, env, ZMMReg, ZMMReg)
 DEF_HELPER_4(insertq_i, void, env, ZMMReg, int, int)
 #endif
-DEF_HELPER_3(glue(haddps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(haddpd, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(hsubps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(hsubpd, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(addsubps, SUFFIX), void, env, ZMMReg, ZMMReg)
-DEF_HELPER_3(glue(addsubpd, SUFFIX), void, env, ZMMReg, ZMMReg)
-
-#define SSE_HELPER_CMP(name, F)   \
-DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \
-DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \
-DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \
-DEF_HELPER_3(name ## sd, void, env, Reg, Reg)
+
+SSE_HELPER_P4(hadd)
+SSE_HELPER_P4(hsub)
+SSE_HELPER_P4(addsub)
+
+#define SSE_HELPER_CMP(name, F) SSE_HELPER_S4(name)
 
 SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
 SSE_HELPER_CMP(cmplt, FPU_CMPLT)
@@ -381,6 +391,9 @@ DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, 
i32)
 #undef SSE_HELPER_W
 #undef SSE_HELPER_L
 #undef SSE_HELPER_Q
-#undef SSE_HELPER_S
+#undef SSE_HELPER_S3
+#undef SSE_HELPER_S4
+#undef SSE_HELPER_P3
+#undef SSE_HELPER_P4
 #undef SSE_HELPER_CMP
 #undef UNPCK_OP
-- 
2.36.0




[PATCH v2 18/42] i386: Misc AVX helper prep

2022-04-24 Thread Paul Brook
Fix up various vector helpers that either trivially extend to 256 bit,
or don't have 256 bit variants.

No functional changes to existing helpers
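For readers new to this file, the #if SHIFT / XMM_ONLY / YMM_ONLY guards below
rely on ops_sse.h being compiled once per vector width, roughly like this
(simplified sketch; the including file is not part of this patch):

#define SHIFT 0            /* MMX,  64-bit Reg, helpers suffixed _mmx */
#include "ops_sse.h"       /* ops_sse.h #undefs SHIFT when it is done */
#define SHIFT 1            /* SSE, 128-bit Reg, helpers suffixed _xmm */
#include "ops_sse.h"
#define SHIFT 2            /* AVX, 256-bit Reg, helpers suffixed _ymm (new) */
#include "ops_sse.h"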

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 159 --
 1 file changed, 139 insertions(+), 20 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index d128af6cc8..3202c00572 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -641,6 +641,7 @@ void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *s)
 #endif
 }
 
+#if SHIFT < 2
 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   target_ulong a0)
 {
@@ -652,6 +653,7 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, 
Reg *s,
 }
 }
 }
+#endif
 
 void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
 {
@@ -882,6 +884,13 @@ void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s)
 
 s0 = s->ZMM_S(0);
 s1 = s->ZMM_S(1);
+#if SHIFT == 2
+float32 s2, s3;
+s2 = s->ZMM_S(2);
+s3 = s->ZMM_S(3);
+d->ZMM_D(2) = float32_to_float64(s2, &env->sse_status);
+d->ZMM_D(3) = float32_to_float64(s3, &env->sse_status);
+#endif
 d->ZMM_D(0) = float32_to_float64(s0, &env->sse_status);
 d->ZMM_D(1) = float32_to_float64(s1, &env->sse_status);
 }
@@ -890,9 +899,17 @@ void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s)
 {
 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
 d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), &env->sse_status);
+#if SHIFT == 2
+d->ZMM_S(2) = float64_to_float32(s->ZMM_D(2), &env->sse_status);
+d->ZMM_S(3) = float64_to_float32(s->ZMM_D(3), &env->sse_status);
+d->Q(2) = 0;
+d->Q(3) = 0;
+#else
 d->Q(1) = 0;
+#endif
 }
 
+#if SHIFT == 1
 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
 {
 d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
@@ -902,6 +919,7 @@ void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
 {
 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
 }
+#endif
 
 /* integer to float */
 void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -910,6 +928,12 @@ void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s)
 d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), &env->sse_status);
 d->ZMM_S(2) = int32_to_float32(s->ZMM_L(2), &env->sse_status);
 d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), &env->sse_status);
+#if SHIFT == 2
+d->ZMM_S(4) = int32_to_float32(s->ZMM_L(4), &env->sse_status);
+d->ZMM_S(5) = int32_to_float32(s->ZMM_L(5), &env->sse_status);
+d->ZMM_S(6) = int32_to_float32(s->ZMM_L(6), &env->sse_status);
+d->ZMM_S(7) = int32_to_float32(s->ZMM_L(7), &env->sse_status);
+#endif
 }
 
 void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -918,10 +942,18 @@ void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s)
 
 l0 = (int32_t)s->ZMM_L(0);
 l1 = (int32_t)s->ZMM_L(1);
+#if SHIFT == 2
+int32_t l2, l3;
+l2 = (int32_t)s->ZMM_L(2);
+l3 = (int32_t)s->ZMM_L(3);
+d->ZMM_D(2) = int32_to_float64(l2, &env->sse_status);
+d->ZMM_D(3) = int32_to_float64(l3, &env->sse_status);
+#endif
 d->ZMM_D(0) = int32_to_float64(l0, &env->sse_status);
 d->ZMM_D(1) = int32_to_float64(l1, &env->sse_status);
 }
 
+#if SHIFT == 1
 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
 {
 d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
@@ -956,8 +988,11 @@ void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t 
val)
 }
 #endif
 
+#endif
+
 /* float to integer */
 
+#if SHIFT == 1
 /*
  * x86 mandates that we return the indefinite integer value for the result
  * of any float-to-integer conversion that raises the 'invalid' exception.
@@ -988,6 +1023,7 @@ WRAP_FLOATCONV(int64_t, float32_to_int64, float32, 
INT64_MIN)
 WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
 WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
 WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
+#endif
 
 void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 {
@@ -995,15 +1031,29 @@ void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, 
ZMMReg *d, ZMMReg *s)
 d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
 d->ZMM_L(2) = x86_float32_to_int32(s->ZMM_S(2), &env->sse_status);
 d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), &env->sse_status);
+#if SHIFT == 2
+d->ZMM_L(4) = x86_float32_to_int32(s->ZMM_S(4), &env->sse_status);
+d->ZMM_L(5) = x86_float32_to_int32(s->ZMM_S(5), &env->sse_status);
+d->ZMM_L(6) = x86_float32_to_int32(s->ZMM_S(6), &env->sse_status);
+d->ZMM_L(7) = x86_float32_to_int32(s->ZMM_S(7), &env->sse_status);
+#endif
 }

[PATCH v2 28/42] i386: Implement VZEROALL and VZEROUPPER

2022-04-24 Thread Paul Brook
These use the same opcode as EMMS, which I guess makes some sort of sense.
Fairly straightforward other than that.

If we wanted to optimize out gen_clear_ymmh then this would be one of
the starting points.
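For context, gen_clear_ymmh itself is not in this hunk; its assumed effect on
the destination register is just the register-at-a-time version of what
vzeroupper does above (sketch only, the real code emits TCG ops at translate
time):

static void clear_ymmh_sketch(CPUX86State *env, int reg)
{
    /* zero bits 255:128, the same two quadwords helper_vzeroupper clears */
    env->xmm_regs[reg].ZMM_Q(2) = 0;
    env->xmm_regs[reg].ZMM_Q(3) = 0;
}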

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h| 48 
 target/i386/ops_sse_header.h |  9 +++
 target/i386/tcg/translate.c  | 26 ---
 3 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index ad3312d353..a1f50f0c8b 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3071,6 +3071,54 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State 
*env, Reg *d, Reg *s,
 #endif
 #endif
 
+#if SHIFT == 2
+void helper_vzeroall(CPUX86State *env)
+{
+int i;
+
+for (i = 0; i < 8; i++) {
+env->xmm_regs[i].ZMM_Q(0) = 0;
+env->xmm_regs[i].ZMM_Q(1) = 0;
+env->xmm_regs[i].ZMM_Q(2) = 0;
+env->xmm_regs[i].ZMM_Q(3) = 0;
+}
+}
+
+void helper_vzeroupper(CPUX86State *env)
+{
+int i;
+
+for (i = 0; i < 8; i++) {
+env->xmm_regs[i].ZMM_Q(2) = 0;
+env->xmm_regs[i].ZMM_Q(3) = 0;
+}
+}
+
+#ifdef TARGET_X86_64
+void helper_vzeroall_hi8(CPUX86State *env)
+{
+int i;
+
+for (i = 8; i < 16; i++) {
+env->xmm_regs[i].ZMM_Q(0) = 0;
+env->xmm_regs[i].ZMM_Q(1) = 0;
+env->xmm_regs[i].ZMM_Q(2) = 0;
+env->xmm_regs[i].ZMM_Q(3) = 0;
+}
+}
+
+void helper_vzeroupper_hi8(CPUX86State *env)
+{
+int i;
+
+for (i = 8; i < 16; i++) {
+env->xmm_regs[i].ZMM_Q(2) = 0;
+env->xmm_regs[i].ZMM_Q(3) = 0;
+}
+}
+#endif
+#endif
+
 #undef SSE_HELPER_S
 
 #undef SHIFT
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index cfcfba154b..48f0945917 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -411,6 +411,15 @@ DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, 
Reg, Reg, i32)
 DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32)
 #endif
 
+#if SHIFT == 2
+DEF_HELPER_1(vzeroall, void, env)
+DEF_HELPER_1(vzeroupper, void, env)
+#ifdef TARGET_X86_64
+DEF_HELPER_1(vzeroall_hi8, void, env)
+DEF_HELPER_1(vzeroupper_hi8, void, env)
+#endif
+#endif
+
 #undef SHIFT
 #undef Reg
 #undef SUFFIX
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index bcd6d47fd0..ba70aeb039 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3455,9 +3455,29 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 return;
 }
 if (b == 0x77) {
-/* emms */
-gen_helper_emms(cpu_env);
-return;
+if (s->prefix & PREFIX_VEX) {
+CHECK_AVX(s);
+if (s->vex_l) {
+gen_helper_vzeroall(cpu_env);
+#ifdef TARGET_X86_64
+if (CODE64(s)) {
+gen_helper_vzeroall_hi8(cpu_env);
+}
+#endif
+} else {
+gen_helper_vzeroupper(cpu_env);
+#ifdef TARGET_X86_64
+if (CODE64(s)) {
+gen_helper_vzeroupper_hi8(cpu_env);
+}
+#endif
+}
+return;
+} else {
+/* emms */
+gen_helper_emms(cpu_env);
+return;
+}
 }
 /* prepare MMX state (XXX: optimize by storing fptt and fptags in
the static cpu state) */
-- 
2.36.0




[PATCH v2 13/42] i386: Destructive vector helpers for AVX

2022-04-24 Thread Paul Brook
These helpers need to take special care to avoid overwriting source values
before the whole result has been calculated.  Currently they use a dummy
Reg typed variable to store the result then assign the whole register.
This will cause 128 bit operations to corrupt the upper half of the register,
so replace it with explicit temporaries and element assignments.
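A deliberately simplified, plain-C illustration of the two styles (the struct
and function names here are made up, not the actual helpers):

#include <stdint.h>

typedef struct { uint32_t L[8]; } VecSketch;   /* stand-in for Reg */

static void add2_whole(VecSketch *d, const VecSketch *s)
{
    VecSketch r;                   /* only two elements ever written */
    r.L[0] = d->L[0] + s->L[0];
    r.L[1] = d->L[1] + s->L[1];
    *d = r;                        /* clobbers d->L[2]..L[7] with junk */
}

static void add2_elems(VecSketch *d, const VecSketch *s)
{
    uint32_t r0 = d->L[0] + s->L[0];
    uint32_t r1 = d->L[1] + s->L[1];
    d->L[0] = r0;                  /* untouched elements keep their value */
    d->L[1] = r1;
}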

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 707 ++
 1 file changed, 437 insertions(+), 270 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index d0424140d9..c645d2ddbf 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -680,71 +680,85 @@ void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
 }
 #endif
 
+#define SHUFFLE4(F, a, b, offset) do {  \
+r0 = a->F((order & 3) + offset);\
+r1 = a->F(((order >> 2) & 3) + offset); \
+r2 = b->F(((order >> 4) & 3) + offset); \
+r3 = b->F(((order >> 6) & 3) + offset); \
+d->F(offset) = r0;  \
+d->F(offset + 1) = r1;  \
+d->F(offset + 2) = r2;  \
+d->F(offset + 3) = r3;  \
+} while (0)
+
 #if SHIFT == 0
 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint16_t r0, r1, r2, r3;
 
-r.W(0) = s->W(order & 3);
-r.W(1) = s->W((order >> 2) & 3);
-r.W(2) = s->W((order >> 4) & 3);
-r.W(3) = s->W((order >> 6) & 3);
-MOVE(*d, r);
+SHUFFLE4(W, s, s, 0);
 }
 #else
 void helper_shufps(Reg *d, Reg *s, int order)
 {
-Reg r;
+Reg *v = d;
+uint32_t r0, r1, r2, r3;
 
-r.L(0) = d->L(order & 3);
-r.L(1) = d->L((order >> 2) & 3);
-r.L(2) = s->L((order >> 4) & 3);
-r.L(3) = s->L((order >> 6) & 3);
-MOVE(*d, r);
+SHUFFLE4(L, v, s, 0);
+#if SHIFT == 2
+SHUFFLE4(L, v, s, 4);
+#endif
 }
 
 void helper_shufpd(Reg *d, Reg *s, int order)
 {
-Reg r;
+Reg *v = d;
+uint64_t r0, r1;
 
-r.Q(0) = d->Q(order & 1);
-r.Q(1) = s->Q((order >> 1) & 1);
-MOVE(*d, r);
+r0 = v->Q(order & 1);
+r1 = s->Q((order >> 1) & 1);
+d->Q(0) = r0;
+d->Q(1) = r1;
+#if SHIFT == 2
+r0 = v->Q(((order >> 2) & 1) + 2);
+r1 = s->Q(((order >> 3) & 1) + 2);
+d->Q(2) = r0;
+d->Q(3) = r1;
+#endif
 }
 
 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint32_t r0, r1, r2, r3;
 
-r.L(0) = s->L(order & 3);
-r.L(1) = s->L((order >> 2) & 3);
-r.L(2) = s->L((order >> 4) & 3);
-r.L(3) = s->L((order >> 6) & 3);
-MOVE(*d, r);
+SHUFFLE4(L, s, s, 0);
+#if SHIFT ==  2
+SHUFFLE4(L, s, s, 4);
+#endif
 }
 
 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint16_t r0, r1, r2, r3;
 
-r.W(0) = s->W(order & 3);
-r.W(1) = s->W((order >> 2) & 3);
-r.W(2) = s->W((order >> 4) & 3);
-r.W(3) = s->W((order >> 6) & 3);
-r.Q(1) = s->Q(1);
-MOVE(*d, r);
+SHUFFLE4(W, s, s, 0);
+d->Q(1) = s->Q(1);
+#if SHIFT == 2
+SHUFFLE4(W, s, s, 8);
+d->Q(3) = s->Q(3);
+#endif
 }
 
 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint16_t r0, r1, r2, r3;
 
-r.Q(0) = s->Q(0);
-r.W(4) = s->W(4 + (order & 3));
-r.W(5) = s->W(4 + ((order >> 2) & 3));
-r.W(6) = s->W(4 + ((order >> 4) & 3));
-r.W(7) = s->W(4 + ((order >> 6) & 3));
-MOVE(*d, r);
+d->Q(0) = s->Q(0);
+SHUFFLE4(W, s, s, 4);
+#if SHIFT == 2
+d->Q(2) = s->Q(2);
+SHUFFLE4(W, s, s, 12);
+#endif
 }
 #endif
 
@@ -1320,156 +1334,190 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State 
*env, Reg *s)
 return val;
 }
 
-void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
-Reg r;
-
-r.B(0) = satsb((int16_t)d->W(0));
-r.B(1) = satsb((int16_t)d->W(1));
-r.B(2) = satsb((int16_t)d->W(2));
-r.B(3) = satsb((int16_t)d->W(3));
-#if SHIFT == 1
-r.B(4) = satsb((int16_t)d->W(4));
-r.B(5) = satsb((int16_t)d->W(5));
-r.B(6) = satsb((int16_t)d->W(6));
-r.B(7) = satsb((int16_t)d->W(7));
-#endif
-r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
-r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
-r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
-r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
-#if SHIFT == 1
-r.B(12) = satsb((int16_t)s->W(4));
-r.B(13) = satsb((int16_t)s->W(5));
-r.B(14) = satsb((int16_t)s->W(6));
-r.B(15) = satsb((int16_t)s->W(7));
-#endif
-MOVE(*d, r);
-}
-
-void

[PATCH v2 40/42] Enable all x86-64 cpu features in user mode

2022-04-24 Thread Paul Brook
We don't have any migration concerns for usermode emulation, so we may
as well enable all available CPU features by default.

Signed-off-by: Paul Brook 
---
 linux-user/x86_64/target_elf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/x86_64/target_elf.h b/linux-user/x86_64/target_elf.h
index 7b76a90de8..3f628f8d66 100644
--- a/linux-user/x86_64/target_elf.h
+++ b/linux-user/x86_64/target_elf.h
@@ -9,6 +9,6 @@
 #define X86_64_TARGET_ELF_H
 static inline const char *cpu_get_model(uint32_t eflags)
 {
-return "qemu64";
+return "max";
 }
 #endif
-- 
2.36.0




[PATCH v2 29/42] i386: Implement VBROADCAST

2022-04-24 Thread Paul Brook
The catch here is that these are whole vector operations (not independent 128
bit lanes). We abuse the SSE_OPF_SCALAR flag to select the memory operand
width appropriately.
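A rough sketch of the operand-size selection that implies; the function below
is hypothetical (the actual patch does this inline in gen_sse), and the case
values are the 0f 38 table indexes used further down:

static int vbroadcast_mem_bits(int b)
{
    /* memory form loads only the element being broadcast, not a full vector */
    switch (b) {
    case 0x78:              return 8;    /* vpbroadcastb */
    case 0x79:              return 16;   /* vpbroadcastw */
    case 0x18: case 0x58:   return 32;   /* vbroadcastss / vpbroadcastd */
    case 0x19: case 0x59:   return 64;   /* vbroadcastsd / vpbroadcastq */
    default:                return 128;  /* vbroadcastf128 / vbroadcasti128 */
    }
}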

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h| 51 
 target/i386/ops_sse_header.h |  8 ++
 target/i386/tcg/translate.c  | 42 -
 3 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index a1f50f0c8b..4115c9a257 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3071,7 +3071,57 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State 
*env, Reg *d, Reg *s,
 #endif
 #endif
 
+#if SHIFT >= 1
+void glue(helper_vbroadcastb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+uint8_t val = s->B(0);
+int i;
+
+for (i = 0; i < 16 * SHIFT; i++) {
+d->B(i) = val;
+}
+}
+
+void glue(helper_vbroadcastw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+uint16_t val = s->W(0);
+int i;
+
+for (i = 0; i < 8 * SHIFT; i++) {
+d->W(i) = val;
+}
+}
+
+void glue(helper_vbroadcastl, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+uint32_t val = s->L(0);
+int i;
+
+for (i = 0; i < 8 * SHIFT; i++) {
+d->L(i) = val;
+}
+}
+
+void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+uint64_t val = s->Q(0);
+d->Q(0) = val;
+d->Q(1) = val;
 #if SHIFT == 2
+d->Q(2) = val;
+d->Q(3) = val;
+#endif
+}
+
+#if SHIFT == 2
+void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+d->Q(0) = s->Q(0);
+d->Q(1) = s->Q(1);
+d->Q(2) = s->Q(0);
+d->Q(3) = s->Q(1);
+}
+
 void helper_vzeroall(CPUX86State *env)
 {
 int i;
@@ -3118,6 +3168,7 @@ void helper_vzeroupper_hi8(CPUX86State *env)
 }
 #endif
 #endif
+#endif
 
 #undef SSE_HELPER_S
 
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 48f0945917..51e02cd4fa 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -411,7 +411,14 @@ DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, 
Reg, Reg, i32)
 DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32)
 #endif
 
+/* AVX helpers */
+#if SHIFT >= 1
+DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg)
 #if SHIFT == 2
+DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
 DEF_HELPER_1(vzeroupper, void, env)
 #ifdef TARGET_X86_64
@@ -419,6 +426,7 @@ DEF_HELPER_1(vzeroall_hi8, void, env)
 DEF_HELPER_1(vzeroupper_hi8, void, env)
 #endif
 #endif
+#endif
 
 #undef SHIFT
 #undef Reg
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index ba70aeb039..59ab1dc562 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3255,6 +3255,11 @@ static const struct SSEOpHelper_table6 
sse_op_table6[256] = {
 [0x14] = BLENDV_OP(blendvps, SSE41, 0),
 [0x15] = BLENDV_OP(blendvpd, SSE41, 0),
 [0x17] = CMP_OP(ptest, SSE41),
+/* TODO:Some vbroadcast variants require AVX2 */
+[0x18] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR), /* vbroadcastss */
+[0x19] = UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR), /* vbroadcastsd */
+#define gen_helper_vbroadcastdq_xmm NULL
+[0x1a] = UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR), /* vbroadcastf128 */
 [0x1c] = UNARY_OP_MMX(pabsb, SSSE3),
 [0x1d] = UNARY_OP_MMX(pabsw, SSSE3),
 [0x1e] = UNARY_OP_MMX(pabsd, SSSE3),
@@ -3286,6 +3291,16 @@ static const struct SSEOpHelper_table6 
sse_op_table6[256] = {
 [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX),
 #define gen_helper_phminposuw_ymm NULL
 [0x41] = UNARY_OP(phminposuw, SSE41, 0),
+/* vpbroadcastd */
+[0x58] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+/* vpbroadcastq */
+[0x59] = UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+/* vbroadcasti128 */
+[0x5a] = UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+/* vpbroadcastb */
+[0x78] = UNARY_OP(vbroadcastb, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+/* vpbroadcastw */
+[0x79] = UNARY_OP(vbroadcastw, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
 #define gen_helper_aesimc_ymm NULL
 [0xdb] = UNARY_OP(aesimc, AES, 0),
 [0xdc] = BINARY_OP(aesenc, AES, 0),
@@ -4323,6 +4338,24 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 op2_offset = offsetof(CPUX86State, xmm_t0);
 gen_lea_modrm(env, s, modrm);
 switch (b) {
+case 0x78: /* vpbroadcastb */
+size = 8;
+break;
+case 0x79: /* vpbroadcas

[PATCH v2 11/42] i386: Rewrite simple integer vector helpers

2022-04-24 Thread Paul Brook
Rewrite the "simple" vector integer helpers in preperation for AVX support.

While the current code is able to use the same prototype for unary
(a = F(b)) and binary (a = F(b, c)) operations, future changes will cause
them to diverge.

No functional changes to existing helpers
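A plain-C sketch of the divergence being prepared for (names invented; n is
the element count):

#include <stdint.h>

static void unary_sketch(uint8_t *d, const uint8_t *s, int n)
{
    for (int i = 0; i < n; i++) {
        d[i] = (uint8_t)~s[i];            /* a = F(b) */
    }
}

static void binary_sketch(uint8_t *d, const uint8_t *v, const uint8_t *s, int n)
{
    for (int i = 0; i < n; i++) {
        d[i] = (uint8_t)(v[i] + s[i]);    /* a = F(b, c); legacy SSE passes
                                             v == d, AVX a separate source */
    }
}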

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 180 --
 1 file changed, 137 insertions(+), 43 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 9297c96d04..bb9cbf9ead 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -275,61 +275,148 @@ void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *c)
 }
 #endif
 
-#define SSE_HELPER_B(name, F)   \
+#define SSE_HELPER_1(name, elem, num, F)   \
 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
 {   \
-d->B(0) = F(d->B(0), s->B(0));  \
-d->B(1) = F(d->B(1), s->B(1));  \
-d->B(2) = F(d->B(2), s->B(2));  \
-d->B(3) = F(d->B(3), s->B(3));  \
-d->B(4) = F(d->B(4), s->B(4));  \
-d->B(5) = F(d->B(5), s->B(5));  \
-d->B(6) = F(d->B(6), s->B(6));  \
-d->B(7) = F(d->B(7), s->B(7));  \
+d->elem(0) = F(s->elem(0)); \
+d->elem(1) = F(s->elem(1)); \
+if ((num << SHIFT) > 2) {   \
+d->elem(2) = F(s->elem(2)); \
+d->elem(3) = F(s->elem(3)); \
+}   \
+if ((num << SHIFT) > 4) {   \
+d->elem(4) = F(s->elem(4)); \
+d->elem(5) = F(s->elem(5)); \
+d->elem(6) = F(s->elem(6)); \
+d->elem(7) = F(s->elem(7)); \
+}   \
+if ((num << SHIFT) > 8) {   \
+d->elem(8) = F(s->elem(8)); \
+d->elem(9) = F(s->elem(9)); \
+d->elem(10) = F(s->elem(10));   \
+d->elem(11) = F(s->elem(11));   \
+d->elem(12) = F(s->elem(12));   \
+d->elem(13) = F(s->elem(13));   \
+d->elem(14) = F(s->elem(14));   \
+d->elem(15) = F(s->elem(15));   \
+}   \
+if ((num << SHIFT) > 16) {  \
+d->elem(16) = F(s->elem(16));   \
+d->elem(17) = F(s->elem(17));   \
+d->elem(18) = F(s->elem(18));   \
+d->elem(19) = F(s->elem(19));   \
+d->elem(20) = F(s->elem(20));   \
+d->elem(21) = F(s->elem(21));   \
+d->elem(22) = F(s->elem(22));   \
+d->elem(23) = F(s->elem(23));   \
+d->elem(24) = F(s->elem(24));   \
+d->elem(25) = F(s->elem(25));   \
+d->elem(26) = F(s->elem(26));   \
+d->elem(27) = F(s->elem(27));   \
+d->elem(28) = F(s->elem(28));   \
+d->elem(29) = F(s->elem(29));   \
+d->elem(30) = F(s->elem(30));   \
+d->elem(31) = F(s->elem(31));   \
+}   \
+}
+
+#define SSE_HELPER_B(name, F)   \
+void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
+{   \
+Reg *v = d; \
+d->B(0) = F(v->B(0), s->B(0));  \
+d->B(1) = F(v->B(1), s->B(1));  \
+d->B(2) = F(v->B(2), s->B

[PATCH v2 38/42] i386: Implement VPBLENDD

2022-04-24 Thread Paul Brook
This is semantically equivalent to VBLENDPS.
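For reference, both instructions come down to the same per-dword select, which
is why the blendps helper can be reused (plain-C sketch, names invented):

#include <stdint.h>

static void blend_dwords(uint32_t *d, const uint32_t *v, const uint32_t *s,
                         int n, int imm)
{
    for (int i = 0; i < n; i++) {
        d[i] = (imm & (1 << i)) ? s[i] : v[i];  /* bit set: take second source */
    }
}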

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 95ecdea8fe..73f3842c36 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3353,6 +3353,7 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] 
= {
 #define gen_helper_vpermq_xmm NULL
 [0x00] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2),
 [0x01] = UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), /* vpermpd */
+[0x02] = BINARY_OP(blendps, AVX, SSE_OPF_AVX2), /* vpblendd */
 [0x04] = UNARY_OP(vpermilps_imm, AVX, 0),
 [0x05] = UNARY_OP(vpermilpd_imm, AVX, 0),
 #define gen_helper_vpermdq_xmm NULL
-- 
2.36.0




[PATCH v2 17/42] i386: Destructive FP helpers for AVX

2022-04-24 Thread Paul Brook
Prepare the horizontal arithmetic vector helpers for AVX.
These currently use a dummy Reg typed variable to store the result then
assign the whole register.  This will cause 128 bit operations to corrupt
the upper half of the register, so replace it with explicit temporaries
and element assignments.

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 96 +++
 1 file changed, 70 insertions(+), 26 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 4137e6e1fa..d128af6cc8 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -1196,44 +1196,88 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, int 
index, int length)
 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
 }
 
-void glue(helper_haddps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void glue(helper_haddps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
-ZMMReg r;
-
-r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
-r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
-r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
-r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
-MOVE(*d, r);
+Reg *v = d;
+float32 r0, r1, r2, r3;
+
+r0 = float32_add(v->ZMM_S(0), v->ZMM_S(1), &env->sse_status);
+r1 = float32_add(v->ZMM_S(2), v->ZMM_S(3), &env->sse_status);
+r2 = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
+r3 = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
+d->ZMM_S(0) = r0;
+d->ZMM_S(1) = r1;
+d->ZMM_S(2) = r2;
+d->ZMM_S(3) = r3;
+#if SHIFT == 2
+r0 = float32_add(v->ZMM_S(4), v->ZMM_S(5), &env->sse_status);
+r1 = float32_add(v->ZMM_S(6), v->ZMM_S(7), &env->sse_status);
+r2 = float32_add(s->ZMM_S(4), s->ZMM_S(5), &env->sse_status);
+r3 = float32_add(s->ZMM_S(6), s->ZMM_S(7), &env->sse_status);
+d->ZMM_S(4) = r0;
+d->ZMM_S(5) = r1;
+d->ZMM_S(6) = r2;
+d->ZMM_S(7) = r3;
+#endif
 }
 
-void glue(helper_haddpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void glue(helper_haddpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
-ZMMReg r;
+Reg *v = d;
+float64 r0, r1;
 
-r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
-r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
-MOVE(*d, r);
+r0 = float64_add(v->ZMM_D(0), v->ZMM_D(1), &env->sse_status);
+r1 = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
+d->ZMM_D(0) = r0;
+d->ZMM_D(1) = r1;
+#if SHIFT == 2
+r0 = float64_add(v->ZMM_D(2), v->ZMM_D(3), &env->sse_status);
+r1 = float64_add(s->ZMM_D(2), s->ZMM_D(3), &env->sse_status);
+d->ZMM_D(2) = r0;
+d->ZMM_D(3) = r1;
+#endif
 }
 
-void glue(helper_hsubps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void glue(helper_hsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
-ZMMReg r;
-
-r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
-r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
-r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
-r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
-MOVE(*d, r);
+Reg *v = d;
+float32 r0, r1, r2, r3;
+
+r0 = float32_sub(v->ZMM_S(0), v->ZMM_S(1), &env->sse_status);
+r1 = float32_sub(v->ZMM_S(2), v->ZMM_S(3), &env->sse_status);
+r2 = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
+r3 = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
+d->ZMM_S(0) = r0;
+d->ZMM_S(1) = r1;
+d->ZMM_S(2) = r2;
+d->ZMM_S(3) = r3;
+#if SHIFT == 2
+r0 = float32_sub(v->ZMM_S(4), v->ZMM_S(5), &env->sse_status);
+r1 = float32_sub(v->ZMM_S(6), v->ZMM_S(7), &env->sse_status);
+r2 = float32_sub(s->ZMM_S(4), s->ZMM_S(5), &env->sse_status);
+r3 = float32_sub(s->ZMM_S(6), s->ZMM_S(7), &env->sse_status);
+d->ZMM_S(4) = r0;
+d->ZMM_S(5) = r1;
+d->ZMM_S(6) = r2;
+d->ZMM_S(7) = r3;
+#endif
 }
 
-void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
+void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
-ZMMReg r;
+Reg *v = d;
+float64 r0, r1;
 
-r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
-r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
-MOVE(*d, r);
+r0 = float64_sub(v->ZMM_D(0), v->ZMM_D(1), &env->sse_status);
+r1 = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
+d->ZMM_D(0) = r0;
+d->ZMM_D(1) = r1;
+#if SHIFT == 2
+r0 = float64_sub(v->ZMM_D(2), v->ZMM_D(3), &env->sse_status);
+r1 = float64_sub(s->ZMM_D(2), s->ZMM_D(3), &env->sse_status);
+d->ZMM_D(2) = r0;
+d->ZMM_D(3) = r1;
+#endif
 }
 
 void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-- 
2.36.0




[PATCH v2 30/42] i386: Implement VPERMIL

2022-04-24 Thread Paul Brook
Some potentially surprising details when comparing vpermilpd vs. vpermilps,
but overall pretty straightforward.
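The main surprise alluded to above is where each variant takes its index bits
from (per element, within one 128-bit lane); a small plain-C sketch:

#include <stdint.h>

static uint32_t vpermilps_pick(const uint32_t lane[4], uint32_t ctl)
{
    return lane[ctl & 3];            /* index in control bits 1:0 */
}

static uint64_t vpermilpd_pick(const uint64_t lane[2], uint64_t ctl)
{
    return lane[(ctl >> 1) & 1];     /* index in control bit 1, not bit 0 */
}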

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h| 82 
 target/i386/ops_sse_header.h |  4 ++
 target/i386/tcg/translate.c  |  4 ++
 3 files changed, 90 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 4115c9a257..9b92b9790a 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3113,6 +3113,88 @@ void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, 
Reg *d, Reg *s)
 #endif
 }
 
+void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+uint64_t r0, r1;
+
+r0 = v->Q((s->Q(0) >> 1) & 1);
+r1 = v->Q((s->Q(1) >> 1) & 1);
+d->Q(0) = r0;
+d->Q(1) = r1;
+#if SHIFT == 2
+r0 = v->Q(((s->Q(2) >> 1) & 1) + 2);
+r1 = v->Q(((s->Q(3) >> 1) & 1) + 2);
+d->Q(2) = r0;
+d->Q(3) = r1;
+#endif
+}
+
+void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+uint32_t r0, r1, r2, r3;
+
+r0 = v->L(s->L(0) & 3);
+r1 = v->L(s->L(1) & 3);
+r2 = v->L(s->L(2) & 3);
+r3 = v->L(s->L(3) & 3);
+d->L(0) = r0;
+d->L(1) = r1;
+d->L(2) = r2;
+d->L(3) = r3;
+#if SHIFT == 2
+r0 = v->L((s->L(4) & 3) + 4);
+r1 = v->L((s->L(5) & 3) + 4);
+r2 = v->L((s->L(6) & 3) + 4);
+r3 = v->L((s->L(7) & 3) + 4);
+d->L(4) = r0;
+d->L(5) = r1;
+d->L(6) = r2;
+d->L(7) = r3;
+#endif
+}
+
+void glue(helper_vpermilpd_imm, SUFFIX)(CPUX86State *env,
+Reg *d, Reg *s, uint32_t order)
+{
+uint64_t r0, r1;
+
+r0 = s->Q((order >> 0) & 1);
+r1 = s->Q((order >> 1) & 1);
+d->Q(0) = r0;
+d->Q(1) = r1;
+#if SHIFT == 2
+r0 = s->Q(((order >> 2) & 1) + 2);
+r1 = s->Q(((order >> 3) & 1) + 2);
+d->Q(2) = r0;
+d->Q(3) = r1;
+#endif
+}
+
+void glue(helper_vpermilps_imm, SUFFIX)(CPUX86State *env,
+Reg *d, Reg *s, uint32_t order)
+{
+uint32_t r0, r1, r2, r3;
+
+r0 = s->L((order >> 0) & 3);
+r1 = s->L((order >> 2) & 3);
+r2 = s->L((order >> 4) & 3);
+r3 = s->L((order >> 6) & 3);
+d->L(0) = r0;
+d->L(1) = r1;
+d->L(2) = r2;
+d->L(3) = r3;
+#if SHIFT == 2
+r0 = s->L(((order >> 0) & 3) + 4);
+r1 = s->L(((order >> 2) & 3) + 4);
+r2 = s->L(((order >> 4) & 3) + 4);
+r3 = s->L(((order >> 6) & 3) + 4);
+d->L(4) = r0;
+d->L(5) = r1;
+d->L(6) = r2;
+d->L(7) = r3;
+#endif
+}
+
 #if SHIFT == 2
 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 51e02cd4fa..c52169a030 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -417,6 +417,10 @@ DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, 
Reg)
 DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpermilpd_imm, SUFFIX), void, env, Reg, Reg, i32)
+DEF_HELPER_4(glue(vpermilps_imm, SUFFIX), void, env, Reg, Reg, i32)
 #if SHIFT == 2
 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 59ab1dc562..358c3ecb0b 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3251,6 +3251,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] 
= {
 [0x09] = BINARY_OP_MMX(psignw, SSSE3),
 [0x0a] = BINARY_OP_MMX(psignd, SSSE3),
 [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3),
+[0x0c] = BINARY_OP(vpermilps, AVX, 0),
+[0x0d] = BINARY_OP(vpermilpd, AVX, 0),
 [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX),
 [0x14] = BLENDV_OP(blendvps, SSE41, 0),
 [0x15] = BLENDV_OP(blendvpd, SSE41, 0),
@@ -3311,6 +3313,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] 
= {
 
 /* prefix [66] 0f 3a */
 static const struct SSEOpHelper_table7 sse_op_table7[256] = {
+[0x04] = UNARY_OP(vpermilps_imm, AVX, 0),
+[0x05] = UNARY_OP(vpermilpd_imm, AVX, 0),
 [0x08] = UNARY_OP(roundps, SSE41, 0),
 [0x09] = UNARY_OP(roundpd, SSE41, 0),
 #define gen_helper_roundss_ymm NULL
-- 
2.36.0




[PATCH v2 33/42] i386: Implement VMASKMOV

2022-04-24 Thread Paul Brook
Decoding these is a bit messy, but at least the integer and float variants
have the same semantics once decoded.

We don't try to be clever with the load forms; instead we load the whole
vector then mask out the elements we want.
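A plain-C sketch of that load path (dword case; names invented). The store
side, shown in the helpers below, only writes the selected elements:

#include <stdint.h>

static void maskmov_load_sketch(uint32_t *d, const uint32_t *mask,
                                const uint32_t *whole_vector, int n)
{
    for (int i = 0; i < n; i++) {
        /* keep elements whose mask sign bit is set, zero the rest */
        d[i] = (mask[i] >> 31) ? whole_vector[i] : 0;
    }
}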

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h| 48 
 target/i386/ops_sse_header.h |  4 +++
 target/i386/tcg/translate.c  | 34 +
 3 files changed, 86 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index edf14a25d7..ffcba3d02c 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3240,6 +3240,54 @@ void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s)
 CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
 }
 
+void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
+Reg *s, Reg *v, target_ulong a0)
+{
+int i;
+
+for (i = 0; i < (2 << SHIFT); i++) {
+if (v->L(i) >> 31) {
+cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
+}
+}
+}
+
+void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
+Reg *s, Reg *v, target_ulong a0)
+{
+int i;
+
+for (i = 0; i < (1 << SHIFT); i++) {
+if (v->Q(i) >> 63) {
+cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
+}
+}
+}
+
+void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+d->L(0) = (v->L(0) >> 31) ? s->L(0) : 0;
+d->L(1) = (v->L(1) >> 31) ? s->L(1) : 0;
+d->L(2) = (v->L(2) >> 31) ? s->L(2) : 0;
+d->L(3) = (v->L(3) >> 31) ? s->L(3) : 0;
+#if SHIFT == 2
+d->L(4) = (v->L(4) >> 31) ? s->L(4) : 0;
+d->L(5) = (v->L(5) >> 31) ? s->L(5) : 0;
+d->L(6) = (v->L(6) >> 31) ? s->L(6) : 0;
+d->L(7) = (v->L(7) >> 31) ? s->L(7) : 0;
+#endif
+}
+
+void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
+{
+d->Q(0) = (v->Q(0) >> 63) ? s->Q(0) : 0;
+d->Q(1) = (v->Q(1) >> 63) ? s->Q(1) : 0;
+#if SHIFT == 2
+d->Q(2) = (v->Q(2) >> 63) ? s->Q(2) : 0;
+d->Q(3) = (v->Q(3) >> 63) ? s->Q(3) : 0;
+#endif
+}
+
 #if SHIFT == 2
 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 8b93b8e6d6..a7a6bf6b10 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -429,6 +429,10 @@ DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, 
Reg)
 DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg)
 DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl)
+DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl)
+DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg)
 #if SHIFT == 2
 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 2fbb7bfcad..e00195d301 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3277,6 +3277,10 @@ static const struct SSEOpHelper_table6 
sse_op_table6[256] = {
 [0x29] = BINARY_OP(pcmpeqq, SSE41, SSE_OPF_MMX),
 [0x2a] = SPECIAL_OP(SSE41), /* movntqda */
 [0x2b] = BINARY_OP(packusdw, SSE41, SSE_OPF_MMX),
+[0x2c] = BINARY_OP(vpmaskmovd, AVX, 0), /* vmaskmovps */
+[0x2d] = BINARY_OP(vpmaskmovq, AVX, 0), /* vmaskmovpd */
+[0x2e] = SPECIAL_OP(AVX), /* vmaskmovps */
+[0x2f] = SPECIAL_OP(AVX), /* vmaskmovpd */
 [0x30] = UNARY_OP(pmovzxbw, SSE41, SSE_OPF_MMX),
 [0x31] = UNARY_OP(pmovzxbd, SSE41, SSE_OPF_MMX),
 [0x32] = UNARY_OP(pmovzxbq, SSE41, SSE_OPF_MMX),
@@ -3308,6 +3312,9 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] 
= {
 [0x78] = UNARY_OP(vbroadcastb, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
 /* vpbroadcastw */
 [0x79] = UNARY_OP(vbroadcastw, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+/* vpmaskmovd, vpmaskmovq */
+[0x8c] = BINARY_OP(vpmaskmovd, AVX, SSE_OPF_AVX2),
+[0x8e] = SPECIAL_OP(AVX), /* vpmaskmovd, vpmaskmovq */
 #define gen_helper_aesimc_ymm NULL
 [0xdb] = UNARY_OP(aesimc, AES, 0),
 [0xdc] = BINARY_OP(aesenc, AES, 0),
@@ -3369,6 +3376,11 @@ static const SSEFunc_0_eppp sse_op_table8[3][2] = {
 SSE_OP(vpsravq),
 SSE_OP(vpsllvq),
 };
+
+static const SSEFunc_0_eppt sse_op_table9[2][2] = {
+SSE_OP(vpmaskmovd_st),
+SSE_OP(vpmaskmovq_st),
+};
 #undef SSE_OP
 
 /* VEX prefix not allowed */
@@ -4394,6 +4406,22 @@ static void gen_sse(C

[PATCH v2 37/42] i386: Implement VBLENDV

2022-04-24 Thread Paul Brook
The AVX variants of the BLENDV instructions use a different opcode prefix
to support the additional operands. We already modified the helper functions
in anticipation of this.
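Concretely, the extra operand is carried in the /is4 immediate byte; a
one-line sketch of the decode (invented function name, the patch does this
inline with val >> 4):

static int blendv_mask_reg(int imm8)
{
    return (imm8 >> 4) & 0xf;   /* bits 7:4 name the XMM/YMM mask register */
}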

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 4072fa28d3..95ecdea8fe 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3384,6 +3384,9 @@ static const struct SSEOpHelper_table7 sse_op_table7[256] 
= {
 [0x42] = BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX),
 [0x44] = BINARY_OP(pclmulqdq, PCLMULQDQ, 0),
 [0x46] = BINARY_OP(vpermdq, AVX, SSE_OPF_AVX2), /* vperm2i128 */
+[0x4a] = BLENDV_OP(blendvps, AVX, 0),
+[0x4b] = BLENDV_OP(blendvpd, AVX, 0),
+[0x4c] = BLENDV_OP(pblendvb, AVX, SSE_OPF_MMX),
 #define gen_helper_pcmpestrm_ymm NULL
 [0x60] = CMP_OP(pcmpestrm, SSE42),
 #define gen_helper_pcmpestri_ymm NULL
@@ -5268,6 +5271,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 }
 
 /* SSE */
+if (op7.flags & SSE_OPF_BLENDV && !(s->prefix & PREFIX_VEX)) {
+/* Only VEX encodings are valid for these blendv opcodes */
+goto illegal_op;
+}
 op1_offset = ZMM_OFFSET(reg);
 if (mod == 3) {
 op2_offset = ZMM_OFFSET(rm | REX_B(s));
@@ -5316,8 +5323,15 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 op7.fn[b1].op1(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(val));
 } else {
 tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset);
-op7.fn[b1].op2(cpu_env, s->ptr0, s->ptr2, s->ptr1,
-   tcg_const_i32(val));
+if (op7.flags & SSE_OPF_BLENDV) {
+TCGv_ptr mask = tcg_temp_new_ptr();
+tcg_gen_addi_ptr(mask, cpu_env, ZMM_OFFSET(val >> 4));
+op7.fn[b1].op3(cpu_env, s->ptr0, s->ptr2, s->ptr1, mask);
+tcg_temp_free_ptr(mask);
+} else {
+op7.fn[b1].op2(cpu_env, s->ptr0, s->ptr2, s->ptr1,
+   tcg_const_i32(val));
+}
 }
 if ((op7.flags & SSE_OPF_CMP) == 0 && s->vex_l == 0) {
 gen_clear_ymmh(s, reg);
-- 
2.36.0




[PATCH v2 10/42] i386: Rewrite vector shift helper

2022-04-24 Thread Paul Brook
Rewrite the vector shift helpers in preparation for AVX support (3 operand
form and 256 bit vectors).

For now keep the existing two operand interface.

No functional changes to existing helpers.
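The behaviour those helpers preserve, per element (word case), is sketched
below; only the arithmetic shift clamps an oversized count instead of
producing zero:

#include <stdint.h>

static uint16_t psrlw_elem(uint16_t x, uint64_t count)
{
    return count > 15 ? 0 : (uint16_t)(x >> count);
}

static int16_t psraw_elem(int16_t x, uint64_t count)
{
    return (int16_t)(x >> (count > 15 ? 15 : count));
}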

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 250 ++
 1 file changed, 133 insertions(+), 117 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 23daab6b50..9297c96d04 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -63,199 +63,215 @@
 #define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
 #endif
 
-void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+#if SHIFT == 0
+#define SHIFT_HELPER_BODY(n, elem, F) do {  \
+d->elem(0) = F(s->elem(0), shift);  \
+if ((n) > 1) {  \
+d->elem(1) = F(s->elem(1), shift);  \
+}   \
+if ((n) > 2) {  \
+d->elem(2) = F(s->elem(2), shift);  \
+d->elem(3) = F(s->elem(3), shift);  \
+}   \
+if ((n) > 4) {  \
+d->elem(4) = F(s->elem(4), shift);  \
+d->elem(5) = F(s->elem(5), shift);  \
+d->elem(6) = F(s->elem(6), shift);  \
+d->elem(7) = F(s->elem(7), shift);  \
+}   \
+if ((n) > 8) {  \
+d->elem(8) = F(s->elem(8), shift);  \
+d->elem(9) = F(s->elem(9), shift);  \
+d->elem(10) = F(s->elem(10), shift);\
+d->elem(11) = F(s->elem(11), shift);\
+d->elem(12) = F(s->elem(12), shift);\
+d->elem(13) = F(s->elem(13), shift);\
+d->elem(14) = F(s->elem(14), shift);\
+d->elem(15) = F(s->elem(15), shift);\
+}   \
+} while (0)
+
+#define FPSRL(x, c) ((x) >> shift)
+#define FPSRAW(x, c) ((int16_t)(x) >> shift)
+#define FPSRAL(x, c) ((int32_t)(x) >> shift)
+#define FPSLL(x, c) ((x) << shift)
+#endif
+
+void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
 {
+Reg *s = d;
 int shift;
-
-if (s->Q(0) > 15) {
+if (c->Q(0) > 15) {
 d->Q(0) = 0;
-#if SHIFT == 1
-d->Q(1) = 0;
-#endif
+XMM_ONLY(d->Q(1) = 0;)
+YMM_ONLY(
+d->Q(2) = 0;
+d->Q(3) = 0;
+)
 } else {
-shift = s->B(0);
-d->W(0) >>= shift;
-d->W(1) >>= shift;
-d->W(2) >>= shift;
-d->W(3) >>= shift;
-#if SHIFT == 1
-d->W(4) >>= shift;
-d->W(5) >>= shift;
-d->W(6) >>= shift;
-d->W(7) >>= shift;
-#endif
+shift = c->B(0);
+SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRL);
 }
 }
 
-void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
 {
+Reg *s = d;
 int shift;
-
-if (s->Q(0) > 15) {
-shift = 15;
+if (c->Q(0) > 15) {
+d->Q(0) = 0;
+XMM_ONLY(d->Q(1) = 0;)
+YMM_ONLY(
+d->Q(2) = 0;
+d->Q(3) = 0;
+)
 } else {
-shift = s->B(0);
+shift = c->B(0);
+SHIFT_HELPER_BODY(4 << SHIFT, W, FPSLL);
 }
-d->W(0) = (int16_t)d->W(0) >> shift;
-d->W(1) = (int16_t)d->W(1) >> shift;
-d->W(2) = (int16_t)d->W(2) >> shift;
-d->W(3) = (int16_t)d->W(3) >> shift;
-#if SHIFT == 1
-d->W(4) = (int16_t)d->W(4) >> shift;
-d->W(5) = (int16_t)d->W(5) >> shift;
-d->W(6) = (int16_t)d->W(6) >> shift;
-d->W(7) = (int16_t)d->W(7) >> shift;
-#endif
 }
 
-void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
 {
+Reg *s = d;
 int shift;
-
-if (s->Q(0) > 15) {
-d->Q(0) = 0;
-#if SHIFT == 1
-d->Q(1) = 0;
-#endif
+if (c->Q(0) > 15) {
+shift = 15;
 } else {
-shift = s->B(0);
-d->W(0) <<= shift;
-d->W(1) <<= shift;
-d->W(2) <<= shift;
-d->W(3) <<= shift;
-#if SHIFT == 1
-d->W(4) <<= shift;
-d->W(5) <<= shift;
-d->W(6) <<= shift;
-d->W(7) <<= shift;
-#endif
+shift = c->B(0);
 }
+SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRAW);
 }
 
-void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void 

[PATCH v2 05/42] i386: Rework sse_op_table6/7

2022-04-24 Thread Paul Brook
Add a flags field to each row in sse_op_table6 and sse_op_table7.

Initially this is only used as a replacement for the magic
SSE41_SPECIAL pointer.  The other flags will become relevant
as the rest of the avx implementation is built out.
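For orientation, a rough legend of the SSE_OPF_* bits as they end up being
used later in the series; the bit values below are placeholders and the
one-line descriptions are inferred from the call sites, not stated in this
patch:

#define SSE_OPF_SPECIAL  (1 << 0)  /* handled outside the generic dispatch */
#define SSE_OPF_CMP      (1 << 1)  /* only sets flags, no vector destination */
#define SSE_OPF_BLENDV   (1 << 2)  /* variable blend with an extra mask operand */
#define SSE_OPF_V0       (1 << 3)  /* unary form, VEX.vvvv unused */
#define SSE_OPF_MMX      (1 << 4)  /* also has an MMX (SHIFT == 0) variant */
#define SSE_OPF_SCALAR   (1 << 5)  /* scalar or narrow memory operand */
#define SSE_OPF_3DNOW    (1 << 6)  /* 3DNow! opcode */
#define SSE_OPF_AVX2     (1 << 7)  /* needs AVX2, not just AVX */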

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 232 
 1 file changed, 132 insertions(+), 100 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 7fec582358..5335b86c01 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2977,7 +2977,6 @@ static const struct SSEOpHelper_table1 sse_op_table1[256] 
= {
 #undef SSE_SPECIAL
 
 #define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm }
-#define SSE_SPECIAL_FN ((void *)1)
 
 static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = {
 [0 + 2] = MMX_OP2(psrlw),
@@ -3061,113 +3060,134 @@ static const SSEFunc_0_epp sse_op_table5[256] = {
 [0xbf] = gen_helper_pavgb_mmx /* pavgusb */
 };
 
-struct SSEOpHelper_epp {
+struct SSEOpHelper_table6 {
 SSEFunc_0_epp op[2];
 uint32_t ext_mask;
+int flags;
 };
 
-struct SSEOpHelper_eppi {
+struct SSEOpHelper_table7 {
 SSEFunc_0_eppi op[2];
 uint32_t ext_mask;
+int flags;
 };
 
-#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 }
-#define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 }
-#define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
-#define SSE41_SPECIAL { { NULL, SSE_SPECIAL_FN }, CPUID_EXT_SSE41 }
-#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \
-CPUID_EXT_PCLMULQDQ }
-#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES }
-
-static const struct SSEOpHelper_epp sse_op_table6[256] = {
-[0x00] = SSSE3_OP(pshufb),
-[0x01] = SSSE3_OP(phaddw),
-[0x02] = SSSE3_OP(phaddd),
-[0x03] = SSSE3_OP(phaddsw),
-[0x04] = SSSE3_OP(pmaddubsw),
-[0x05] = SSSE3_OP(phsubw),
-[0x06] = SSSE3_OP(phsubd),
-[0x07] = SSSE3_OP(phsubsw),
-[0x08] = SSSE3_OP(psignb),
-[0x09] = SSSE3_OP(psignw),
-[0x0a] = SSSE3_OP(psignd),
-[0x0b] = SSSE3_OP(pmulhrsw),
-[0x10] = SSE41_OP(pblendvb),
-[0x14] = SSE41_OP(blendvps),
-[0x15] = SSE41_OP(blendvpd),
-[0x17] = SSE41_OP(ptest),
-[0x1c] = SSSE3_OP(pabsb),
-[0x1d] = SSSE3_OP(pabsw),
-[0x1e] = SSSE3_OP(pabsd),
-[0x20] = SSE41_OP(pmovsxbw),
-[0x21] = SSE41_OP(pmovsxbd),
-[0x22] = SSE41_OP(pmovsxbq),
-[0x23] = SSE41_OP(pmovsxwd),
-[0x24] = SSE41_OP(pmovsxwq),
-[0x25] = SSE41_OP(pmovsxdq),
-[0x28] = SSE41_OP(pmuldq),
-[0x29] = SSE41_OP(pcmpeqq),
-[0x2a] = SSE41_SPECIAL, /* movntqda */
-[0x2b] = SSE41_OP(packusdw),
-[0x30] = SSE41_OP(pmovzxbw),
-[0x31] = SSE41_OP(pmovzxbd),
-[0x32] = SSE41_OP(pmovzxbq),
-[0x33] = SSE41_OP(pmovzxwd),
-[0x34] = SSE41_OP(pmovzxwq),
-[0x35] = SSE41_OP(pmovzxdq),
-[0x37] = SSE42_OP(pcmpgtq),
-[0x38] = SSE41_OP(pminsb),
-[0x39] = SSE41_OP(pminsd),
-[0x3a] = SSE41_OP(pminuw),
-[0x3b] = SSE41_OP(pminud),
-[0x3c] = SSE41_OP(pmaxsb),
-[0x3d] = SSE41_OP(pmaxsd),
-[0x3e] = SSE41_OP(pmaxuw),
-[0x3f] = SSE41_OP(pmaxud),
-[0x40] = SSE41_OP(pmulld),
-[0x41] = SSE41_OP(phminposuw),
-[0xdb] = AESNI_OP(aesimc),
-[0xdc] = AESNI_OP(aesenc),
-[0xdd] = AESNI_OP(aesenclast),
-[0xde] = AESNI_OP(aesdec),
-[0xdf] = AESNI_OP(aesdeclast),
+#define gen_helper_special_xmm NULL
+
+#define OP(name, op, flags, ext, mmx_name) \
+{{mmx_name, gen_helper_ ## name ## _xmm}, CPUID_EXT_ ## ext, flags}
+#define BINARY_OP_MMX(name, ext) \
+OP(name, op2, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx)
+#define BINARY_OP(name, ext, flags) \
+OP(name, op2, flags, ext, NULL)
+#define UNARY_OP_MMX(name, ext) \
+OP(name, op1, SSE_OPF_V0 | SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx)
+#define UNARY_OP(name, ext, flags) \
+OP(name, op1, SSE_OPF_V0 | flags, ext, NULL)
+#define BLENDV_OP(name, ext, flags) OP(name, op3, SSE_OPF_BLENDV, ext, NULL)
+#define CMP_OP(name, ext) OP(name, op1, SSE_OPF_CMP | SSE_OPF_V0, ext, NULL)
+#define SPECIAL_OP(ext) OP(special, op1, SSE_OPF_SPECIAL, ext, NULL)
+
+/* prefix [66] 0f 38 */
+static const struct SSEOpHelper_table6 sse_op_table6[256] = {
+[0x00] = BINARY_OP_MMX(pshufb, SSSE3),
+[0x01] = BINARY_OP_MMX(phaddw, SSSE3),
+[0x02] = BINARY_OP_MMX(phaddd, SSSE3),
+[0x03] = BINARY_OP_MMX(phaddsw, SSSE3),
+[0x04] = BINARY_OP_MMX(pmaddubsw, SSSE3),
+[0x05] = BINARY_OP_MMX(phsubw, SSSE3),
+[0x06] = BINARY_OP_MMX(phsubd, SSSE3),
+[0x07] = BINARY_OP_MMX(phsubsw, SSSE3),
+[0x08] = BINARY_OP_MMX(psignb, SSSE3),
+[0x09] = BINARY_OP_MMX(psignw, SSSE3),
+[0x0a] = BINARY_OP_MMX(psignd, SSSE3),
+[0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3),
+[0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX),
+[0x14] = BLENDV_OP(blendvps, SSE41, 0),
+[0x15] = BLENDV_OP(blendvpd, SSE41, 0

[PATCH v2 08/42] i386: Add ZMM_OFFSET macro

2022-04-24 Thread Paul Brook
Add a convenience macro to get the address of an xmm_regs element within
CPUX86State.

This was originally going to be the basis of an implementation that broke
operations into 128 bit chunks. I scrapped that idea, so this is now a purely
cosmetic change. But I think a worthwhile one - it reduces the number of
function calls that need to be split over multiple lines.

No functional changes.

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 60 +
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 2f5cc24e0c..e9e6062b7f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2777,6 +2777,8 @@ static inline void gen_op_movq_env_0(DisasContext *s, int 
d_offset)
 tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset);
 }
 
+#define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg])
+
 typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
 typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
 typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val);
@@ -3329,14 +3331,14 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 if (mod == 3)
 goto illegal_op;
 gen_lea_modrm(env, s, modrm);
-gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
+gen_sto_env_A0(s, ZMM_OFFSET(reg));
 break;
 case 0x3f0: /* lddqu */
 CHECK_AVX_V0(s);
 if (mod == 3)
 goto illegal_op;
 gen_lea_modrm(env, s, modrm);
-gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
+gen_ldo_env_A0(s, ZMM_OFFSET(reg));
 break;
 case 0x22b: /* movntss */
 case 0x32b: /* movntsd */
@@ -3375,15 +3377,13 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 #ifdef TARGET_X86_64
 if (s->dflag == MO_64) {
 gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0);
-tcg_gen_addi_ptr(s->ptr0, cpu_env,
- offsetof(CPUX86State,xmm_regs[reg]));
+tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg));
 gen_helper_movq_mm_T0_xmm(s->ptr0, s->T0);
 } else
 #endif
 {
 gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 0);
-tcg_gen_addi_ptr(s->ptr0, cpu_env,
- offsetof(CPUX86State,xmm_regs[reg]));
+tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg));
 tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
 gen_helper_movl_mm_T0_xmm(s->ptr0, s->tmp2_i32);
 }
@@ -3410,11 +3410,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 CHECK_AVX_V0(s);
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
-gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
+gen_ldo_env_A0(s, ZMM_OFFSET(reg));
 } else {
 rm = (modrm & 7) | REX_B(s);
-gen_op_movo(s, offsetof(CPUX86State, xmm_regs[reg]),
-offsetof(CPUX86State,xmm_regs[rm]));
+gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(rm));
 }
 break;
 case 0x210: /* movss xmm, ea */
@@ -3474,7 +3473,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 CHECK_AVX_V0(s);
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
-gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
+gen_ldo_env_A0(s, ZMM_OFFSET(reg));
 } else {
 rm = (modrm & 7) | REX_B(s);
 gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)),
@@ -3519,7 +3518,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 CHECK_AVX_V0(s);
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
-gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
+gen_ldo_env_A0(s, ZMM_OFFSET(reg));
 } else {
 rm = (modrm & 7) | REX_B(s);
 gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)),
@@ -3542,8 +3541,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 goto illegal_op;
 field_length = x86_ldub_code(env, s) & 0x3F;
 bit_index = x86_ldub_code(env, s) & 0x3F;
-tcg_gen_addi_ptr(s->ptr0, cpu_env,
-offsetof(CPUX86State,xmm_regs[reg]));
+tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg));
 if (b1 == 1)
 gen_helper_extrq_i(cpu_env, s->ptr0,
tcg_

[PATCH v2 07/42] Enforce VEX encoding restrictions

2022-04-24 Thread Paul Brook
Add CHECK_AVX* macros, and use them to validate VEX encoded AVX instructions

All AVX instructions require both CPU and OS support; this is encapsulated
by HF_AVX_EN.

Some also require specific values in the VEX.L and VEX.V fields.
Some (mostly integer operations) also require AVX2
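Put together, a typical instruction case would combine the checks roughly like
this (sketch, not taken from a specific hunk):

/* a 128-bit-only form that does not use VEX.vvvv */
CHECK_AVX_V0_128(s);

/* a promoted integer op whose 256-bit (VEX.L=1) form needs AVX2 */
CHECK_AVX(s);
CHECK_AVX2_256(s);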

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 159 +---
 1 file changed, 149 insertions(+), 10 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 66ba690b7d..2f5cc24e0c 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3185,10 +3185,54 @@ static const struct SSEOpHelper_table7 
sse_op_table7[256] = {
 goto illegal_op; \
 } while (0)
 
+/*
+ * VEX encodings require AVX
+ * Allow legacy SSE encodings even if AVX not enabled
+ */
+#define CHECK_AVX(s) do { \
+if ((s->prefix & PREFIX_VEX) \
+&& !(env->hflags & HF_AVX_EN_MASK)) \
+goto illegal_op; \
+} while (0)
+
+/* If a VEX prefix is used then it must have V=1111b */
+#define CHECK_AVX_V0(s) do { \
+CHECK_AVX(s); \
+if ((s->prefix & PREFIX_VEX) && (s->vex_v != 0)) \
+goto illegal_op; \
+} while (0)
+
+/* If a VEX prefix is used then it must have L=0 */
+#define CHECK_AVX_128(s) do { \
+CHECK_AVX(s); \
+if ((s->prefix & PREFIX_VEX) && (s->vex_l != 0)) \
+goto illegal_op; \
+} while (0)
+
+/* If a VEX prefix is used then it must have V=1111b and L=0 */
+#define CHECK_AVX_V0_128(s) do { \
+CHECK_AVX(s); \
+if ((s->prefix & PREFIX_VEX) && (s->vex_v != 0 || s->vex_l != 0)) \
+goto illegal_op; \
+} while (0)
+
+/* 256-bit (ymm) variants require AVX2 */
+#define CHECK_AVX2_256(s) do { \
+if (s->vex_l && !(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2)) \
+goto illegal_op; \
+} while (0)
+
+/* Requires AVX2 and VEX encoding */
+#define CHECK_AVX2(s) do { \
+if ((s->prefix & PREFIX_VEX) == 0 \
+|| !(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2)) \
+goto illegal_op; \
+} while (0)
+
 static void gen_sse(CPUX86State *env, DisasContext *s, int b,
 target_ulong pc_start)
 {
-int b1, op1_offset, op2_offset, is_xmm, val;
+int b1, op1_offset, op2_offset, is_xmm, val, scalar_op;
 int modrm, mod, rm, reg;
 struct SSEOpHelper_table1 sse_op;
 struct SSEOpHelper_table6 op6;
@@ -3228,15 +3272,18 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
 return;
 }
-if (s->flags & HF_EM_MASK) {
-illegal_op:
-gen_illegal_opcode(s);
-return;
-}
-if (is_xmm
-&& !(s->flags & HF_OSFXSR_MASK)
-&& (b != 0x38 && b != 0x3a)) {
-goto unknown_op;
+/* VEX encoded instructions ignore EM bit. See also CHECK_AVX */
+if (!(s->prefix & PREFIX_VEX)) {
+if (s->flags & HF_EM_MASK) {
+illegal_op:
+gen_illegal_opcode(s);
+return;
+}
+if (is_xmm
+&& !(s->flags & HF_OSFXSR_MASK)
+&& (b != 0x38 && b != 0x3a)) {
+goto unknown_op;
+}
 }
 if (b == 0x0e) {
 if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) {
@@ -3278,12 +3325,14 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 case 0x1e7: /* movntdq */
 case 0x02b: /* movntps */
 case 0x12b: /* movntps */
+CHECK_AVX_V0(s);
 if (mod == 3)
 goto illegal_op;
 gen_lea_modrm(env, s, modrm);
 gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
 break;
 case 0x3f0: /* lddqu */
+CHECK_AVX_V0(s);
 if (mod == 3)
 goto illegal_op;
 gen_lea_modrm(env, s, modrm);
@@ -3291,6 +3340,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 break;
 case 0x22b: /* movntss */
 case 0x32b: /* movntsd */
+CHECK_AVX_V0_128(s);
 if (mod == 3)
 goto illegal_op;
 gen_lea_modrm(env, s, modrm);
@@ -3321,6 +3371,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 }
 break;
 case 0x16e: /* movd xmm, ea */
+CHECK_AVX_V0_128(s);
 #ifdef TARGET_X86_64
 if (s->dflag == MO_64) {
 gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0);
@@ -3356,6 +3407,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 case 0x128: /* movapd */
 case 0x16f: /* movdqa xmm, ea */
 case 0x26f: /* movdqu xmm, ea */
+CHECK_AVX_V0(s);
 if (mod != 3) {
 

[PATCH v2 03/42] Add AVX_EN hflag

2022-04-24 Thread Paul Brook
Add a new hflag bit to determine whether AVX instructions are allowed

Signed-off-by: Paul Brook 
---
 target/i386/cpu.h|  3 +++
 target/i386/helper.c | 12 
 target/i386/tcg/fpu_helper.c |  1 +
 3 files changed, 16 insertions(+)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 9661f9fbd1..65200a1917 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -169,6 +169,7 @@ typedef enum X86Seg {
 #define HF_MPX_EN_SHIFT 25 /* MPX Enabled (CR4+XCR0+BNDCFGx) */
 #define HF_MPX_IU_SHIFT 26 /* BND registers in-use */
 #define HF_UMIP_SHIFT   27 /* CR4.UMIP */
+#define HF_AVX_EN_SHIFT 28 /* AVX Enabled (CR4+XCR0) */
 
 #define HF_CPL_MASK  (3 << HF_CPL_SHIFT)
 #define HF_INHIBIT_IRQ_MASK  (1 << HF_INHIBIT_IRQ_SHIFT)
@@ -195,6 +196,7 @@ typedef enum X86Seg {
 #define HF_MPX_EN_MASK   (1 << HF_MPX_EN_SHIFT)
 #define HF_MPX_IU_MASK   (1 << HF_MPX_IU_SHIFT)
 #define HF_UMIP_MASK (1 << HF_UMIP_SHIFT)
+#define HF_AVX_EN_MASK   (1 << HF_AVX_EN_SHIFT)
 
 /* hflags2 */
 
@@ -2035,6 +2037,7 @@ void host_cpuid(uint32_t function, uint32_t count,
 
 /* helper.c */
 void x86_cpu_set_a20(X86CPU *cpu, int a20_state);
+void cpu_sync_avx_hflag(CPUX86State *env);
 
 #ifndef CONFIG_USER_ONLY
 static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs)
diff --git a/target/i386/helper.c b/target/i386/helper.c
index fa409e9c44..30083c9cff 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -29,6 +29,17 @@
 #endif
 #include "qemu/log.h"
 
+void cpu_sync_avx_hflag(CPUX86State *env)
+{
+if ((env->cr[4] & CR4_OSXSAVE_MASK)
+&& (env->xcr0 & (XSTATE_SSE_MASK | XSTATE_YMM_MASK))
+== (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) {
+env->hflags |= HF_AVX_EN_MASK;
+} else{
+env->hflags &= ~HF_AVX_EN_MASK;
+}
+}
+
 void cpu_sync_bndcs_hflags(CPUX86State *env)
 {
 uint32_t hflags = env->hflags;
@@ -209,6 +220,7 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
 env->hflags = hflags;
 
 cpu_sync_bndcs_hflags(env);
+cpu_sync_avx_hflag(env);
 }
 
 #if !defined(CONFIG_USER_ONLY)
diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c
index ebf5e73df9..b391b69635 100644
--- a/target/i386/tcg/fpu_helper.c
+++ b/target/i386/tcg/fpu_helper.c
@@ -2943,6 +2943,7 @@ void helper_xsetbv(CPUX86State *env, uint32_t ecx, 
uint64_t mask)
 
 env->xcr0 = mask;
 cpu_sync_bndcs_hflags(env);
+cpu_sync_avx_hflag(env);
 return;
 
  do_gpf:
-- 
2.36.0




[PATCH v2 06/42] i386: Add CHECK_NO_VEX

2022-04-24 Thread Paul Brook
Reject invalid VEX encodings on MMX instructions.

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 5335b86c01..66ba690b7d 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3179,6 +3179,12 @@ static const struct SSEOpHelper_table7 
sse_op_table7[256] = {
 #undef BLENDV_OP
 #undef SPECIAL_OP
 
+/* VEX prefix not allowed */
+#define CHECK_NO_VEX(s) do { \
+if (s->prefix & PREFIX_VEX) \
+goto illegal_op; \
+} while (0)
+
 static void gen_sse(CPUX86State *env, DisasContext *s, int b,
 target_ulong pc_start)
 {
@@ -3262,6 +3268,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 b |= (b1 << 8);
 switch(b) {
 case 0x0e7: /* movntq */
+CHECK_NO_VEX(s);
 if (mod == 3) {
 goto illegal_op;
 }
@@ -3297,6 +3304,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 }
 break;
 case 0x6e: /* movd mm, ea */
+CHECK_NO_VEX(s);
 #ifdef TARGET_X86_64
 if (s->dflag == MO_64) {
 gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0);
@@ -3330,6 +3338,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 }
 break;
 case 0x6f: /* movq mm, ea */
+CHECK_NO_VEX(s);
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
 gen_ldq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx));
@@ -3464,6 +3473,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 break;
 case 0x178:
 case 0x378:
+CHECK_NO_VEX(s);
 {
 int bit_index, field_length;
 
@@ -3484,6 +3494,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 }
 break;
 case 0x7e: /* movd ea, mm */
+CHECK_NO_VEX(s);
 #ifdef TARGET_X86_64
 if (s->dflag == MO_64) {
 tcg_gen_ld_i64(s->T0, cpu_env,
@@ -3524,6 +3535,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 gen_op_movq_env_0(s, offsetof(CPUX86State, 
xmm_regs[reg].ZMM_Q(1)));
 break;
 case 0x7f: /* movq ea, mm */
+CHECK_NO_VEX(s);
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
 gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx));
@@ -3607,6 +3619,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 offsetof(CPUX86State, xmm_t0.ZMM_L(1)));
 op1_offset = offsetof(CPUX86State,xmm_t0);
 } else {
+CHECK_NO_VEX(s);
 tcg_gen_movi_tl(s->T0, val);
 tcg_gen_st32_tl(s->T0, cpu_env,
 offsetof(CPUX86State, mmx_t0.MMX_L(0)));
@@ -3648,6 +3661,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 break;
 case 0x02a: /* cvtpi2ps */
 case 0x12a: /* cvtpi2pd */
+CHECK_NO_VEX(s);
 gen_helper_enter_mmx(cpu_env);
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
@@ -3693,6 +3707,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 case 0x12c: /* cvttpd2pi */
 case 0x02d: /* cvtps2pi */
 case 0x12d: /* cvtpd2pi */
+CHECK_NO_VEX(s);
 gen_helper_enter_mmx(cpu_env);
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
@@ -3766,6 +3781,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 tcg_gen_st16_tl(s->T0, cpu_env,
 
offsetof(CPUX86State,xmm_regs[reg].ZMM_W(val)));
 } else {
+CHECK_NO_VEX(s);
 val &= 3;
 tcg_gen_st16_tl(s->T0, cpu_env,
 
offsetof(CPUX86State,fpregs[reg].mmx.MMX_W(val)));
@@ -3805,6 +3821,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 }
 break;
 case 0x2d6: /* movq2dq */
+CHECK_NO_VEX(s);
 gen_helper_enter_mmx(cpu_env);
 rm = (modrm & 7);
 gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)),
@@ -3812,6 +3829,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 gen_op_movq_env_0(s, offsetof(CPUX86State, 
xmm_regs[reg].ZMM_Q(1)));
 break;
 case 0x3d6: /* movdq2q */
+CHECK_NO_VEX(s);
 gen_helper_enter_mmx(cpu_env);
 rm = (modrm & 7) | REX_B(s);
 gen_op_movq(s, offsetof(CPUX86State, fpregs[reg & 7].mmx),
@@ -3827,6 +3845,7 @@ static void

[PATCH v2 04/42] i386: Rework sse_op_table1

2022-04-24 Thread Paul Brook
Add a flags field to each row in sse_op_table1.

Initially this is only used as a replacement for the magic
SSE_SPECIAL and SSE_DUMMY pointers, the other flags will become relevant
as the rest of the AVX implementation is built out.

Signed-off-by: Paul Brook 
---
 target/i386/tcg/translate.c | 316 +---
 1 file changed, 186 insertions(+), 130 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index b7972f0ff5..7fec582358 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2788,146 +2788,196 @@ typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, 
TCGv_ptr reg_b, TCGv_i32 val);
 typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
TCGv val);
 
-#define SSE_SPECIAL ((void *)1)
-#define SSE_DUMMY ((void *)2)
+#define SSE_OPF_V0    (1 << 0) /* vex.v must be 1111b (only 2 operands) */
+#define SSE_OPF_CMP   (1 << 1) /* does not write for first operand */
+#define SSE_OPF_BLENDV(1 << 2) /* blendv* instruction */
+#define SSE_OPF_SPECIAL   (1 << 3) /* magic */
+#define SSE_OPF_3DNOW (1 << 4) /* 3DNow! instruction */
+#define SSE_OPF_MMX   (1 << 5) /* MMX/integer/AVX2 instruction */
+#define SSE_OPF_SCALAR(1 << 6) /* Has SSE scalar variants */
+#define SSE_OPF_AVX2  (1 << 7) /* AVX2 instruction */
+#define SSE_OPF_SHUF  (1 << 9) /* pshufx/shufpx */
+
+#define OP(op, flags, a, b, c, d)   \
+{flags, {a, b, c, d} }
+
+#define MMX_OP(x) OP(op2, SSE_OPF_MMX, \
+gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm, NULL, NULL)
+
+#define SSE_FOP(name) OP(op2, SSE_OPF_SCALAR, \
+gen_helper_##name##ps, gen_helper_##name##pd, \
+gen_helper_##name##ss, gen_helper_##name##sd)
+#define SSE_OP(sname, dname, op, flags) OP(op, flags, \
+gen_helper_##sname##_xmm, gen_helper_##dname##_xmm, NULL, NULL)
+
+struct SSEOpHelper_table1 {
+int flags;
+SSEFunc_0_epp op[4];
+};
 
-#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm }
-#define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \
- gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, }
+#define SSE_3DNOW { SSE_OPF_3DNOW }
+#define SSE_SPECIAL { SSE_OPF_SPECIAL }
 
-static const SSEFunc_0_epp sse_op_table1[256][4] = {
+static const struct SSEOpHelper_table1 sse_op_table1[256] = {
 /* 3DNow! extensions */
-[0x0e] = { SSE_DUMMY }, /* femms */
-[0x0f] = { SSE_DUMMY }, /* pf... */
+[0x0e] = SSE_SPECIAL, /* femms */
+[0x0f] = SSE_3DNOW, /* pf... (sse_op_table5) */
 /* pure SSE operations */
-[0x10] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* 
movups, movupd, movss, movsd */
-[0x11] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* 
movups, movupd, movss, movsd */
-[0x12] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* 
movlps, movlpd, movsldup, movddup */
-[0x13] = { SSE_SPECIAL, SSE_SPECIAL },  /* movlps, movlpd */
-[0x14] = { gen_helper_punpckldq_xmm, gen_helper_punpcklqdq_xmm },
-[0x15] = { gen_helper_punpckhdq_xmm, gen_helper_punpckhqdq_xmm },
-[0x16] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },  /* movhps, movhpd, 
movshdup */
-[0x17] = { SSE_SPECIAL, SSE_SPECIAL },  /* movhps, movhpd */
-
-[0x28] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
-[0x29] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
-[0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* 
cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */
-[0x2b] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* 
movntps, movntpd, movntss, movntsd */
-[0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* 
cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */
-[0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* 
cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */
-[0x2e] = { gen_helper_ucomiss, gen_helper_ucomisd },
-[0x2f] = { gen_helper_comiss, gen_helper_comisd },
-[0x50] = { SSE_SPECIAL, SSE_SPECIAL }, /* movmskps, movmskpd */
-[0x51] = SSE_FOP(sqrt),
-[0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL },
-[0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL },
-[0x54] = { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */
-[0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd 
*/
-[0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */
-[0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */
+[0x10] = SSE_SPECIAL, /* movups, movupd, movss, movsd */
+[0x11] = SSE_SPECIAL, /* movups, movupd, movss, movsd */
+[0x12] = SSE_SPECIAL, /* movlps, movlpd, movsldup, movddup */
+[0x13] = SSE_SPECIAL, /* movlps, movlpd */
+[0x14] = SSE_OP(punpckldq, punpcklqdq, op2, 0), /* unpcklps, unpcklpd */
+[0x15] = SSE_OP

[PATCH v2 02/42] i386: DPPS rounding fix

2022-04-24 Thread Paul Brook
The DPPS (Dot Product) instruction is defined to first sum pairs of
intermediate results, then sum those values to get the final result.
i.e. (A+B)+(C+D)

We incrementally sum the results, i.e. ((A+B)+C)+D, which can result
in incorrect rounding.

For consistency, also remove the redundant (but harmless) add operation
from DPPD

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 47 +++
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 535440f882..a5a48a20f6 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -1934,32 +1934,36 @@ SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
 
 void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
 {
-float32 iresult = float32_zero;
+float32 prod, iresult, iresult2;
 
+/*
+ * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
+ * to correctly round the intermediate results
+ */
 if (mask & (1 << 4)) {
-iresult = float32_add(iresult,
-  float32_mul(d->ZMM_S(0), s->ZMM_S(0),
-  &env->sse_status),
-  &env->sse_status);
+iresult = float32_mul(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
+} else {
+iresult = float32_zero;
 }
 if (mask & (1 << 5)) {
-iresult = float32_add(iresult,
-  float32_mul(d->ZMM_S(1), s->ZMM_S(1),
-  &env->sse_status),
-  &env->sse_status);
+prod = float32_mul(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status);
+} else {
+prod = float32_zero;
 }
+iresult = float32_add(iresult, prod, &env->sse_status);
 if (mask & (1 << 6)) {
-iresult = float32_add(iresult,
-  float32_mul(d->ZMM_S(2), s->ZMM_S(2),
-  &env->sse_status),
-  &env->sse_status);
+iresult2 = float32_mul(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status);
+} else {
+iresult2 = float32_zero;
 }
 if (mask & (1 << 7)) {
-iresult = float32_add(iresult,
-  float32_mul(d->ZMM_S(3), s->ZMM_S(3),
-  &env->sse_status),
-  &env->sse_status);
+prod = float32_mul(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status);
+} else {
+prod = float32_zero;
 }
+iresult2 = float32_add(iresult2, prod, &env->sse_status);
+iresult = float32_add(iresult, iresult2, &env->sse_status);
+
 d->ZMM_S(0) = (mask & (1 << 0)) ? iresult : float32_zero;
 d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero;
 d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero;
@@ -1968,13 +1972,12 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg 
*d, Reg *s, uint32_t mask)
 
 void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
 {
-float64 iresult = float64_zero;
+float64 iresult;
 
 if (mask & (1 << 4)) {
-iresult = float64_add(iresult,
-  float64_mul(d->ZMM_D(0), s->ZMM_D(0),
-  &env->sse_status),
-  &env->sse_status);
+iresult = float64_mul(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
+} else {
+iresult = float64_zero;
 }
 if (mask & (1 << 5)) {
 iresult = float64_add(iresult,
-- 
2.36.0




[PATCH v2 01/42] i386: pcmpestr 64-bit sign extension bug

2022-04-24 Thread Paul Brook
The abs1 function in ops_sse.h only works correctly when the result fits
in a signed int. This is fine most of the time because we're only dealing
with byte sized values.

However the pcmp_elen helper function uses abs1 to calculate the absolute value
of a cpu register. This incorrectly truncates to 32 bits, and will give
the wrong answer for the most negative value.

Fix by open coding the saturation check before taking the absolute value.

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 20 +---
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index e4d74b814a..535440f882 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2011,25 +2011,23 @@ SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
 
 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
 {
-int val;
+target_long val, limit;
 
 /* Presence of REX.W is indicated by a bit higher than 7 set */
 if (ctrl >> 8) {
-val = abs1((int64_t)env->regs[reg]);
+val = (target_long)env->regs[reg];
 } else {
-val = abs1((int32_t)env->regs[reg]);
+val = (int32_t)env->regs[reg];
 }
-
 if (ctrl & 1) {
-if (val > 8) {
-return 8;
-}
+limit = 8;
 } else {
-if (val > 16) {
-return 16;
-}
+limit = 16;
 }
-return val;
+if ((val > limit) || (val < -limit)) {
+return limit;
+}
+return abs1(val);
 }
 
 static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
-- 
2.36.0




[PATCH v2 09/42] i386: Helper macro for 256 bit AVX helpers

2022-04-24 Thread Paul Brook
Once all the code is in place, 256 bit vector helpers will be generated by
including ops_sse.h a third time with SHIFT=2.

The first bit of support for this is to define a YMM_ONLY macro for code that
only applies to 256 bit vectors.  XMM_ONLY code will be executed for both
128 and 256 bit vectors.

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h| 8 
 target/i386/ops_sse_header.h | 4 
 2 files changed, 12 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index a5a48a20f6..23daab6b50 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -24,6 +24,7 @@
 #define Reg MMXReg
 #define SIZE 8
 #define XMM_ONLY(...)
+#define YMM_ONLY(...)
 #define B(n) MMX_B(n)
 #define W(n) MMX_W(n)
 #define L(n) MMX_L(n)
@@ -37,7 +38,13 @@
 #define W(n) ZMM_W(n)
 #define L(n) ZMM_L(n)
 #define Q(n) ZMM_Q(n)
+#if SHIFT == 1
 #define SUFFIX _xmm
+#define YMM_ONLY(...)
+#else
+#define SUFFIX _ymm
+#define YMM_ONLY(...) __VA_ARGS__
+#endif
 #endif
 
 /*
@@ -2337,6 +2344,7 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State 
*env, Reg *d, Reg *s,
 
 #undef SHIFT
 #undef XMM_ONLY
+#undef YMM_ONLY
 #undef Reg
 #undef B
 #undef W
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index cef28f2aae..7e7f2cee2a 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -21,7 +21,11 @@
 #define SUFFIX _mmx
 #else
 #define Reg ZMMReg
+#if SHIFT == 1
 #define SUFFIX _xmm
+#else
+#define SUFFIX _ymm
+#endif
 #endif
 
 #define dh_alias_Reg ptr
-- 
2.36.0




Re: [PATCH 2/4] TCG support for AVX

2022-04-20 Thread Paul Brook
On Wed, 2022-04-20 at 16:19 +0200, Paolo Bonzini wrote:
> On 4/18/22 21:45, Paul Brook wrote:
> > > Massively too large for a single patch, I'm afraid. This needs
> > > to be split, probably into at least twenty patches, which each
> > > are a reviewable chunk of code that does one coherent thing.
> > Hmm, I'll see what I can do.
> > 
> > Unfortunately the table driven decoding means that going from two
> > to
> > three operands tends to be a bit all or nothing just to get the
> > thing
> > to compile.
> 
> Hi Paul, welcome back and thanks for this huge work.  It should be
> possible at least to split the patch as follows (at least that's
> what _I_ would do in order to review it):
> [snip]



Ok, that sounds like a reasonable start.

> I can do some of the work too since I was planning to do this
> anyway (but have hardly started yet).

I'll push my changes to https://github.com/pbrook/qemu . This is a
personal project, so I'll be working on it as and when.

If you have additional comments/suggestions on the approach taken then
I'd be happy to hear them.

Paul



Re: [PATCH 2/4] TCG support for AVX

2022-04-18 Thread Paul Brook
On Mon, 2022-04-18 at 20:33 +0100, Peter Maydell wrote:
> On Mon, 18 Apr 2022 at 18:48, Paul Brook  wrote:
> > 
> > Add TCG translation of guest AVX/AVX2 instructions
> > This comprises:
> > 
> 
> Massively too large for a single patch, I'm afraid. This needs
> to be split, probably into at least twenty patches, which each
> are a reviewable chunk of code that does one coherent thing.

Hmm, I'll see what I can do.

Unfortunately the table driven decoding means that going from two to
three operands tends to be a bit all or nothing just to get the thing
to compile.

Paul



[PATCH 3/4] Enable all x86-64 cpu features in user mode

2022-04-18 Thread Paul Brook
We don't have any migration concerns for usermode emulation, so we may
as well enable all available CPU features by default.

Signed-off-by: Paul Brook 
---
 linux-user/x86_64/target_elf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/x86_64/target_elf.h b/linux-user/x86_64/target_elf.h
index 7b76a90de8..3f628f8d66 100644
--- a/linux-user/x86_64/target_elf.h
+++ b/linux-user/x86_64/target_elf.h
@@ -9,6 +9,6 @@
 #define X86_64_TARGET_ELF_H
 static inline const char *cpu_get_model(uint32_t eflags)
 {
-return "qemu64";
+return "max";
 }
 #endif
-- 
2.35.2




[PATCH 1/4] Add AVX_EN hflag

2022-04-18 Thread Paul Brook
Add a new hflag bit to determine whether AVX instructions are allowed

Signed-off-by: Paul Brook 
---
 target/i386/cpu.h|  3 +++
 target/i386/helper.c | 12 
 target/i386/tcg/fpu_helper.c |  1 +
 3 files changed, 16 insertions(+)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 982c532353..0c7162e2fd 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -168,6 +168,7 @@ typedef enum X86Seg {
 #define HF_MPX_EN_SHIFT 25 /* MPX Enabled (CR4+XCR0+BNDCFGx) */
 #define HF_MPX_IU_SHIFT 26 /* BND registers in-use */
 #define HF_UMIP_SHIFT   27 /* CR4.UMIP */
+#define HF_AVX_EN_SHIFT 28 /* AVX Enabled (CR4+XCR0) */
 
 #define HF_CPL_MASK  (3 << HF_CPL_SHIFT)
 #define HF_INHIBIT_IRQ_MASK  (1 << HF_INHIBIT_IRQ_SHIFT)
@@ -194,6 +195,7 @@ typedef enum X86Seg {
 #define HF_MPX_EN_MASK   (1 << HF_MPX_EN_SHIFT)
 #define HF_MPX_IU_MASK   (1 << HF_MPX_IU_SHIFT)
 #define HF_UMIP_MASK (1 << HF_UMIP_SHIFT)
+#define HF_AVX_EN_MASK   (1 << HF_AVX_EN_SHIFT)
 
 /* hflags2 */
 
@@ -2045,6 +2047,7 @@ void host_cpuid(uint32_t function, uint32_t count,
 
 /* helper.c */
 void x86_cpu_set_a20(X86CPU *cpu, int a20_state);
+void cpu_sync_avx_hflag(CPUX86State *env);
 
 #ifndef CONFIG_USER_ONLY
 static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs)
diff --git a/target/i386/helper.c b/target/i386/helper.c
index fa409e9c44..30083c9cff 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -29,6 +29,17 @@
 #endif
 #include "qemu/log.h"
 
+void cpu_sync_avx_hflag(CPUX86State *env)
+{
+if ((env->cr[4] & CR4_OSXSAVE_MASK)
+&& (env->xcr0 & (XSTATE_SSE_MASK | XSTATE_YMM_MASK))
+== (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) {
+env->hflags |= HF_AVX_EN_MASK;
+} else{
+env->hflags &= ~HF_AVX_EN_MASK;
+}
+}
+
 void cpu_sync_bndcs_hflags(CPUX86State *env)
 {
 uint32_t hflags = env->hflags;
@@ -209,6 +220,7 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
 env->hflags = hflags;
 
 cpu_sync_bndcs_hflags(env);
+cpu_sync_avx_hflag(env);
 }
 
 #if !defined(CONFIG_USER_ONLY)
diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c
index ebf5e73df9..b391b69635 100644
--- a/target/i386/tcg/fpu_helper.c
+++ b/target/i386/tcg/fpu_helper.c
@@ -2943,6 +2943,7 @@ void helper_xsetbv(CPUX86State *env, uint32_t ecx, 
uint64_t mask)
 
 env->xcr0 = mask;
 cpu_sync_bndcs_hflags(env);
+cpu_sync_avx_hflag(env);
 return;
 
  do_gpf:
-- 
2.35.2




[PATCH 0/3] AVX guest implementation

2022-04-18 Thread Paul Brook
Patch series to implement AVX/AVX2 guest support in TCG.

All the system level code for this (cpuid, xsave, wider registers, etc)
already exists; we just need to implement the instruction translation.

The majority of the new 256-bit operations operate on each 128-bit
"lane" independently, so in theory we could use a single set of 128-bit
helpers to implement both widths piecemeal. However this would further
complicate the already over-long gen_sse function. Instead I chose to
generate a whole new set of 256 bit "ymm" helpers using the framework
already in place for 64/128 bit mm/xmm operations.
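
Roughly speaking the existing framework expands ops_sse.h once per operand
width, selected by SHIFT, so the 256 bit support is mostly a third expansion
of the same file.  This is only a sketch - the real include sites and suffix
handling are in the patches themselves:

#define SHIFT 0
#include "ops_sse.h"   /* 64 bit MMX helpers, _mmx suffix */
#define SHIFT 1
#include "ops_sse.h"   /* 128 bit SSE helpers, _xmm suffix */
#define SHIFT 2
#include "ops_sse.h"   /* new 256 bit AVX helpers, _ymm suffix */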

I've included the tests I used during development in the linux-user
testsuite, and also ran these manually inside a Debian x86-64 guest.

Apologies for the big patch, but I can't think of a good way to split
the bulk of the instruction translation.

Paul Brook (4):
  Add AVX_EN hflag
  TCG support for AVX
  Enable all x86-64 cpu features in user mode
  AVX tests

 linux-user/x86_64/target_elf.h |2 +-
 target/i386/cpu.c  |8 +-
 target/i386/cpu.h  |3 +
 target/i386/helper.c   |   12 +
 target/i386/helper.h   |2 +
 target/i386/ops_sse.h  | 2606 +-
 target/i386/ops_sse_header.h   |  364 ++-
 target/i386/tcg/fpu_helper.c   |4 +
 target/i386/tcg/translate.c| 1902 ++---
 tests/tcg/i386/Makefile.target |   10 +-
 tests/tcg/i386/README  |9 +
 tests/tcg/i386/test-avx.c  |  347 +++
 tests/tcg/i386/test-avx.py |  352 +++
 tests/tcg/i386/x86.csv | 4658 
 14 files changed, 8988 insertions(+), 1291 deletions(-)
 create mode 100644 tests/tcg/i386/test-avx.c
 create mode 100755 tests/tcg/i386/test-avx.py
 create mode 100644 tests/tcg/i386/x86.csv

-- 
2.35.2




[PATCH] linux-user: Fix inotify on aarch64

2022-01-26 Thread Paul Brook
The inotify implementation originally called the raw host syscalls.
Commit 3b3f24add0 changed this to use the glibc wrappers. However ifdefs
in syscall.c still test for presence of the raw syscalls.

This causes a problem on e.g. aarch64 hosts which never had the
inotify_init syscall - it had been obsoleted by inotify_init1 before
aarch64 was invented! However it does have a perfectly good glibc
implementation of inotify_wait.

Fix this by removing all the raw __NR_inotify_* tests, and instead check
CONFIG_INOTIFY, which already tests for the glibc functionality we use.

Also remove the now-pointless sys_inotify* wrappers.

Tested using x86-64 inotifywatch on aarch64 host, and vice-versa

Signed-off-by: Paul Brook 
---
 linux-user/fd-trans.c |  5 ++---
 linux-user/syscall.c  | 50 +--
 2 files changed, 12 insertions(+), 43 deletions(-)

diff --git a/linux-user/fd-trans.c b/linux-user/fd-trans.c
index 6941089959..30e7b49112 100644
--- a/linux-user/fd-trans.c
+++ b/linux-user/fd-trans.c
@@ -1460,9 +1460,8 @@ TargetFdTrans target_eventfd_trans = {
 .target_to_host_data = swap_data_eventfd,
 };
 
-#if (defined(TARGET_NR_inotify_init) && defined(__NR_inotify_init)) || \
-(defined(CONFIG_INOTIFY1) && defined(TARGET_NR_inotify_init1) && \
- defined(__NR_inotify_init1))
+#if defined(CONFIG_INOTIFY) && (defined(TARGET_NR_inotify_init) || \
+defined(TARGET_NR_inotify_init1))
 static abi_long host_to_target_data_inotify(void *buf, size_t len)
 {
 struct inotify_event *ev;
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 56a3e17183..17cc38fe34 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -272,9 +272,6 @@ static type name (type1 arg1,type2 arg2,type3 arg3,type4 
arg4,type5 arg5,   \
 #if defined(__NR_futex_time64)
 # define __NR_sys_futex_time64 __NR_futex_time64
 #endif
-#define __NR_sys_inotify_init __NR_inotify_init
-#define __NR_sys_inotify_add_watch __NR_inotify_add_watch
-#define __NR_sys_inotify_rm_watch __NR_inotify_rm_watch
 #define __NR_sys_statx __NR_statx
 
 #if defined(__alpha__) || defined(__x86_64__) || defined(__s390x__)
@@ -447,33 +444,6 @@ static int sys_renameat2(int oldfd, const char *old,
 
 #ifdef CONFIG_INOTIFY
 #include <sys/inotify.h>
-
-#if defined(TARGET_NR_inotify_init) && defined(__NR_inotify_init)
-static int sys_inotify_init(void)
-{
-  return (inotify_init());
-}
-#endif
-#if defined(TARGET_NR_inotify_add_watch) && defined(__NR_inotify_add_watch)
-static int sys_inotify_add_watch(int fd,const char *pathname, int32_t mask)
-{
-  return (inotify_add_watch(fd, pathname, mask));
-}
-#endif
-#if defined(TARGET_NR_inotify_rm_watch) && defined(__NR_inotify_rm_watch)
-static int sys_inotify_rm_watch(int fd, int32_t wd)
-{
-  return (inotify_rm_watch(fd, wd));
-}
-#endif
-#ifdef CONFIG_INOTIFY1
-#if defined(TARGET_NR_inotify_init1) && defined(__NR_inotify_init1)
-static int sys_inotify_init1(int flags)
-{
-  return (inotify_init1(flags));
-}
-#endif
-#endif
 #else
 /* Userspace can usually survive runtime without inotify */
 #undef TARGET_NR_inotify_init
@@ -12263,35 +12233,35 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 case TARGET_NR_futex_time64:
 return do_futex_time64(cpu, arg1, arg2, arg3, arg4, arg5, arg6);
 #endif
-#if defined(TARGET_NR_inotify_init) && defined(__NR_inotify_init)
+#ifdef CONFIG_INOTIFY
+#if defined(TARGET_NR_inotify_init)
 case TARGET_NR_inotify_init:
-ret = get_errno(sys_inotify_init());
+ret = get_errno(inotify_init());
 if (ret >= 0) {
 fd_trans_register(ret, &target_inotify_trans);
 }
 return ret;
 #endif
-#ifdef CONFIG_INOTIFY1
-#if defined(TARGET_NR_inotify_init1) && defined(__NR_inotify_init1)
+#if defined(TARGET_NR_inotify_init1) && defined(CONFIG_INOTIFY1)
 case TARGET_NR_inotify_init1:
-ret = get_errno(sys_inotify_init1(target_to_host_bitmask(arg1,
+ret = get_errno(inotify_init1(target_to_host_bitmask(arg1,
   fcntl_flags_tbl)));
 if (ret >= 0) {
 fd_trans_register(ret, &target_inotify_trans);
 }
 return ret;
 #endif
-#endif
-#if defined(TARGET_NR_inotify_add_watch) && defined(__NR_inotify_add_watch)
+#if defined(TARGET_NR_inotify_add_watch)
 case TARGET_NR_inotify_add_watch:
 p = lock_user_string(arg2);
-ret = get_errno(sys_inotify_add_watch(arg1, path(p), arg3));
+ret = get_errno(inotify_add_watch(arg1, path(p), arg3));
 unlock_user(p, arg2, 0);
 return ret;
 #endif
-#if defined(TARGET_NR_inotify_rm_watch) && defined(__NR_inotify_rm_watch)
+#if defined(TARGET_NR_inotify_rm_watch)
 case TARGET_NR_inotify_rm_watch:
-return get_errno(sys_inotify_rm_watch(arg1, arg2));
+return get_errno(inotify_rm_watch(arg1, arg2));
+#endif
 #endif
 
 #if defined(TARGET_NR_mq_open) && defined(__NR_mq_open)
-- 
2.34.1




Re: [Qemu-devel] [PATCH] softfloat: rebase to version 2a

2013-05-02 Thread Paul Brook
 The license of SoftFloat-2b is claimed to be GPLv2 incompatible by
 the FSF due to an indemnification clause.  The previous release,
 SoftFloat-2a, did not contain this clause.  The only changes between
 these two versions as far as QEMU is concerned is the license change
 and a global modification of the comment structure.  This patch rebases
 our softfloat code to SoftFloat-2a in order to have a GPLv2 compatible
 license.

Acked-by: Paul Brook p...@codesourcery.com



Re: [Qemu-devel] [PATCH v2 00/11] Fix versatile_pci (now without breaking linux)

2013-03-28 Thread Paul Brook
 This patch series fixes a number of serious bugs in our emulation of
 the PCI controller found on VersatilePB and the early Realview boards:
  * our interrupt mapping was totally wrong
  * the I/O window wasn't mapped on VersatilePB

FWIW the documentation available at the time I implemented the VersatilePB did 
not include the IO region. The PCI interrupt routing still seems to be missing 
from the docs.

Acked-by: Paul Brook p...@codesourcery.com
[Ignoring any issues with the backwards compatibility hacks]



Re: [Qemu-devel] [PATCH 4/4] target-arm: always set endian bits in big-endian mode

2013-03-04 Thread Paul Brook
 On 03/01/2013 09:58 PM, Paul Brook wrote:
  +#ifdef TARGET_WORDS_BIGENDIAN
  +if (arm_feature(env, ARM_FEATURE_V6)
  +|| arm_feature(env, ARM_FEATURE_V7)) {
  +/* IE and EE bits stay set for big-endian */
  +env->cp15.c1_sys |= (1 << 31) | (1 << 25);
  +}
  +#endif
  
  This is wrong for all the CPUs QEMU currently supports. SCTLR.IE is
  defined to be zero.
 
 Again I'd like to have more information. Why is it wrong to set IE when
 we are in big-endian?

The ARM architecture defines two big-endian modes.  In BE8 mode only data
accesses are big-endian, code fetches are still little-endian.  In BE32 mode
both code and data are big-endian.  In theory a fourth mode (big-endian code,
little-endian data) exists, though I've never seen that used.

All the v7 cores QEMU currently supports[1] only implement BE8 mode.  The IE
bit is reserved and must be zero.  Usermode emulation implements both, but the
privileged cp15 registers can safely be ignored there.
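
If the goal is just to default the static endianness configuration, then
something like this (untested sketch, reusing the cp15.c1_sys field from the
patch above) would match the BE8-only hardware:

#ifdef TARGET_WORDS_BIGENDIAN
    if (arm_feature(env, ARM_FEATURE_V6)
        || arm_feature(env, ARM_FEATURE_V7)) {
        /* SCTLR.EE (bit 25) sets the endianness used on exception entry;
           SCTLR.IE (bit 31) is reserved, zero on these cores.  */
        env->cp15.c1_sys |= 1 << 25;
    }
#endif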

Paul

[1] Except maybe the M profile cores, but they use a different system model 
anyway.



Re: [Qemu-devel] [PATCH 3/4] target-arm: Fix VFP register byte order in GDB remote

2013-03-04 Thread Paul Brook
  The bytes with the register are transmitted in target byte order.
  
   /* Aliases for Q regs.  */
   nregs += 16;
  if (reg < nregs) {
  
  -stfq_le_p(buf, env->vfp.regs[(reg - 32) * 2]);
  -stfq_le_p(buf + 8, env->vfp.regs[(reg - 32) * 2 + 1]);
  +stfq_p(buf, env->vfp.regs[(reg - 32) * 2]);
  +stfq_p(buf + 8, env->vfp.regs[(reg - 32) * 2 + 1]);
  
  This is wrong. You're still using little-endian ordering of words.
 
 Can you explain a little bit further? If I'm in big-endian mode, stfq_p()
 will be stfq_be_p(), right?

Because we're actually storing two halves of a 128-bit value.   You still 
store the least significant half first.

Paul



Re: [Qemu-devel] [PATCH 3/4] target-arm: Fix VFP register byte order in GDB remote

2013-03-01 Thread Paul Brook
 From GDB Remote Serial Protocol doc:
 
 The bytes with the register are transmitted in target byte order.

  /* Aliases for Q regs.  */
  nregs += 16;
 if (reg < nregs) {
 
 -stfq_le_p(buf, env->vfp.regs[(reg - 32) * 2]);
 -stfq_le_p(buf + 8, env->vfp.regs[(reg - 32) * 2 + 1]);
 +stfq_p(buf, env->vfp.regs[(reg - 32) * 2]);
 +stfq_p(buf + 8, env->vfp.regs[(reg - 32) * 2 + 1]);

This is wrong. You're still using little-endian ordering of words.

Paul



Re: [Qemu-devel] [PATCH 4/4] target-arm: always set endian bits in big-endian mode

2013-03-01 Thread Paul Brook
 +#ifdef TARGET_WORDS_BIGENDIAN
 +if (arm_feature(env, ARM_FEATURE_V6)
 +|| arm_feature(env, ARM_FEATURE_V7)) {
 +/* IE and EE bits stay set for big-endian */
 +env->cp15.c1_sys |= (1 << 31) | (1 << 25);
 +}
 +#endif

This is wrong for all the CPUs QEMU currently supports. SCTLR.IE is defined to 
be zero.

Paul



Re: [Qemu-devel] [ARM] Cortex-R4F and VFP3-D16

2013-02-27 Thread Paul Brook
  Probably what you'll want is to have a separate feature bit for 32
  dregs which is set by default for vfpv3, and then use that in
  VFP_DREG rather than the vfpv3 feature bit.
 
 Right, it might be easier than I thought. Maybe add an
 ARM_FEATURE_VFP3_D16 and do:

 #define VFP_DREG(reg, insn, bigbit, smallbit) do { \
 
 if (arm_feature(env, ARM_FEATURE_VFP3) \
 
  && !arm_feature(env, ARM_FEATURE_VFP3_D16)) {   \

There's no need to check both flags.
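
i.e. make the feature bit mean "has the full 32 D registers", set it for the
cores that actually have them, and the macro only needs a single test.  Sketch
only (the flag name below is illustrative, not something that exists today):

#define VFP_DREG(reg, insn, bigbit, smallbit) do { \
    if (arm_feature(env, ARM_FEATURE_VFP_D32)) { \
        reg = (((insn) >> (bigbit)) & 0x0f) \
              | (((insn) >> ((smallbit) - 4)) & 0x10); \
    } else { \
        if (insn & (1 << (smallbit))) { \
            goto illegal_op; \
        } \
        reg = ((insn) >> (bigbit)) & 0x0f; \
    } \
} while (0)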

I've got a patch to implement this as a side-effect of a different feature; 
I'll look at pushing it out.

Paul



Re: [Qemu-devel] [PATCH 4/6] Handle CPU interrupts by inline checking of a flag

2013-02-22 Thread Paul Brook
 @@ -100,6 +102,7 @@ struct CPUState {
  bool stop;
  bool stopped;
  volatile sig_atomic_t exit_request;
 +volatile sig_atomic_t tcg_exit_req;

Do we really need another variable/check?  It seems like this should be at 
least partially redundant with the existing icount code. I have a similar 
patch to that effect.

Paul



Re: [Qemu-devel] [PATCH 24028/24028] Evaluate breakpoint condition on target.

2013-02-21 Thread Paul Brook
In addition to the comments others made about patch formatting, etc:

 +/* conditional breakpoint evaluation on target*/
 +pstrcat(buf, sizeof(buf), ";ConditionalBreakpoints+");

I'm pretty sure this is a lie for most targets, given later on we have:

 +#if defined(TARGET_ARM)
 +cpu_get_reg_var_func = cpu_get_reg_var_arm;
 +#else
 +cpu_get_reg_var_func = 0;
 +#endif

 +for (i = 0 ; i < bp_cond_len ; i++) {
 +if (!isxdigit(*p) || !isxdigit(*(p + 1))) {
 +bp_cond_len = 0 ;
 +g_free(bp_cond_expr);
 +bp_cond_expr = NULL;
 +perror("Error in breakpoint condition");

perror is the wrong way to report a malformed gdb command.
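
(The usual gdbstub response to a malformed packet is an error reply sent back
over the protocol, e.g. put_packet(s, "E22"), or an empty reply for something
unsupported; printing on the host console doesn't help whoever is driving gdb.)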

 +#if TARGET_LONG_SIZE == 4
 +typedef float target_double;
 +#else /* TARGET_LONG_SIZE == 8 */
 +typedef double target_double;
 +#endif

This clearly has nothing to do with the target double precision floating point 
type.

 +int qemu_rw_debug_flag;

This appears to be a write-only variable.

 +#define BP_AGENT_MAX_COND_SIZE 1024

By my reading this isn't the maximum size, it's the maximum stack depth.

 +void cpu_get_reg_var_arm(TCGv var, int reg)
 +{
 +tcg_gen_mov_i32(var, cpu_R[reg]);
 +}

Looks like it will break horribly when the user requests anything other than 
r0-r15.  And r15 is probably also wrong.

 +bswap16(val);

Clearly wrong.

 +fprintf(stderr,
 +GDB agent: const 64 is not supported for 32 bit

This is not a good way to report user errors.  Several other occurances.

 +static target_long bp_agent_get_arg(const uint8_t *cond_exp,
...
 +case 4:
 +default:

I'd be amazed if this default case is correct.

 +/*for case error , ex.buffer overloading -
 +  need to set labels anyway in order to avoid segmentation fault  */

Sounds like you're failing to check for errors somewhere else.




Re: [Qemu-devel] [PATCH V2 2/6] hw/mdio: Generalize etraxfs MDIO bitbanging emulation (fwd)

2013-01-25 Thread Paul Brook
 To be able to create generic GPIO devices or other devices that have GPIO
 like pins (e.g MDIO), and hook those up to external buses through common
 frameworks, we need agreement on how to model tristate pins.
 A tristate pin model, or at least agreement on how to model these with
 multiple qemu_irqs.
 
 hmm, feels like we've opened a can of worms...

Probably. I'm not going to insist you use/implement generic GPIO for MDIO, but 
I still think separation between the PHY register interface and the bitbang 
code is good (i.e. same as bitbang_i2c).
 
 Anyway, how would such a qemu_tristate_pin be modelled?
[point 1 moved later so my answers read in a sensible order]

 2. Every connection point provides an output/value and an output_enable.

I think we'd be better off providing a single state, i.e. an output of 0, 1 or Z.  
Possibly additional Z0/Z1 states to represent high-impedance with pull-up/down 
resistors.
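
Concretely, something like this (type and names invented purely for
illustration):

typedef enum {
    PIN_LOW,   /* actively driven 0 */
    PIN_HIGH,  /* actively driven 1 */
    PIN_Z,     /* high impedance, no pull */
    PIN_Z0,    /* high impedance, pull-down */
    PIN_Z1,    /* high impedance, pull-up */
} PinDriveState;

Each connection point reports one of these, and whatever sits in the middle
(the bitbang device for simple point-to-point links, or a virtual wire object
for multi-drop buses) resolves the final line level from them.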

 3. There is a mean for reading the pin value, which is computed based on
 all connection points outputs and output_enables (can be cached).

For MDIO being able to read the value is sufficient.  However in general we 
don't want to have to poll it.  We want to be told when it changes.

 4. The pin value can be invalid (multiple drivers or no drivers), 0 or 1.

I can't think of any cases where this is important.  In most cases it's 
undefined, in the rest it causes physical damage.

 1. It's not point-to-point, has an arbitrary nr of connection points.

QoM currently only does asymmetric 1-1 connections between objects.  However I 
don't think this is a fatal problem.  We can still retain an asymmetric API 
(effectively equivalent to male and female physical connectors), adding 
virtual wire objects where they don't match up.  It should be possible to 
implement this as a backward compatible extension to qemu_irq[1].  In most 
cases the additional wire should not be needed.

For simple output-input (i.e. all existing code) we just need to ignore Z 
states.  Preferably before they get to the input device.

For simple bidirectional point-point lines (which should include bitbang-i2c 
and bitbang-mdio) the bitbang object controls the value when subject to a Z 
output.

For arbitrary pin connections they all connect to a set of ports on a virtual 
wire device.  It takes care of arbitrating line state and sending 
notifications to the connected devices.

There are a couple of technical issues:

Firstly qemu_irq is currently stateless[2].  Giving it state is fine in 
principle, but means a lot of load/save code needs fixing.  In practice we can 
probably avoid this, but there are some nice benefits from keeping state in 
qemu_irq.

Secondly, the [parent of the] qemu_irq object needs to be able to signal value 
changes to the object on the other side of the link. Currently QoM allows a 
property to be linked to an object, but provides no way for the object to 
identify/communicate with the property/device linked to it.


Paul

[1] I've no particular attachment to the name qemu_irq.  But I really don't 
want to have to make anything other than purely mechanical changes to all its 
users.
[2] More precisely it has no state that changes over its lifetime.



Re: [Qemu-devel] [PATCH V2 6/6] hw/mdio: Use bitbang core for smc91c111 network device

2013-01-25 Thread Paul Brook
 @@ -44,6 +45,10 @@ typedef struct {
  uint8_t int_level;
  uint8_t int_mask;
  MemoryRegion mmio;
 +
 +/* MDIO bus and the attached phy */
 +struct qemu_mdio mdio_bus;
 +struct qemu_phy phy;
  } smc91c111_state;
 
  static const VMStateDescription vmstate_smc91c111 = {
 @@ -71,6 +76,8 @@ static const VMStateDescription vmstate_smc91c111 = {
  VMSTATE_BUFFER_UNSAFE(data, smc91c111_state, 0, NUM_PACKETS *
 2048), VMSTATE_UINT8(int_level, smc91c111_state),
  VMSTATE_UINT8(int_mask, smc91c111_state),
 +VMSTATE_MDIO(mdio_bus, smc91c111_state),
 +VMSTATE_MDIO_PHY(phy, smc91c111_state),
  VMSTATE_END_OF_LIST()
  }
  };

 @@ -754,6 +768,9 @@ static int smc91c111_init1(SysBusDevice *dev)
  s->nic = qemu_new_nic(&net_smc91c111_info, &s->conf,
object_get_typename(OBJECT(dev)), dev->qdev.id,
 s); qemu_format_nic_info_str(&s->nic->nc, s->conf.macaddr.a);
 +
 +tdk_init(&s->phy);
 +mdio_attach(&s->mdio_bus, &s->phy, 0);
  /* ??? Save/restore.  */
  return 0;
  }

There's no reason for smc91c111_state to contain the PHY state.  For devices 
with an off-chip PHY we have no way of knowing which phy is used, or what 
state is required.

The PHY should be a device in its own right, and know how to save/restore 
itself.  smc91c111_init1 should create the PHY, attach it to the MDIO bus, 
then forget about it.

Paul



Re: [Qemu-devel] [PATCH v2 19/20] arm: add Faraday FTKBC010 support for A369

2013-01-25 Thread Paul Brook
 From: Kuo-Jung Su dant...@faraday-tech.com
 
 Faraday keyboard/mouse controller (FTKBC010) is compliant with the
 IBM PS/2 interface.

Your description doesn't appear to match the code at all.  Surely if this were 
true then you should be using the existing PS2 keyboard emulation.

Paul



Re: [Qemu-devel] [PATCH v2 20/20] arm: add generic ROM model for Faraday SoC platforms

2013-01-25 Thread Paul Brook
 Since the NAND and SPI flash memories do not support random access,
 so most of the systems which use such memory as main storages
 usually has some bootstrap code stored inside the embedded ROM of
 its SoC, and the bootstrap code is responsible for SDRAM initialization
 and then load the specific software(i.e. u-boot/linux) into SDRAM,
 and finally jumps into the loaded primary software.

No.

For a start the block device you're using is for parallel flash devices, which 
are directly mapped.  This contradicts your description which talks about 
serial flash.

Please look at how other boards work.  There are already mechanisms for 
creating rom areas, or preloading images into ram.

Paul



Re: [Qemu-devel] [PATCH v2 03/20] arm: add Faraday FTAPBBRG020 APB DMA support

2013-01-25 Thread Paul Brook
 The FTAPBBRG020 supports the DMA functions for the AHB-to-AHB,
 AHB-to-APB, APB-to-AHB, and APB-to-APB transactions.

All the timer code in this file looks suspect.  As a general rule everything 
should be event driven and complete immediately (or at least schedule a BH for 
immediate action if recursion is a concern), not relying on periodic timer 
interrupts.

 +qemu_mod_timer(s->qtimer,
 +qemu_get_clock_ns(vm_clock) + 1);

For all practical purposes this is going to happen immediately, so you should 
not be using a timer.

 +qemu_mod_timer(s->qtimer,
 +qemu_get_clock_ns(vm_clock) + (get_ticks_per_sec() >> 2));

Why 0.25 seconds?  Usually this sort of try-again-soon behavior means you've 
missed a trigger event somewhere else.
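
What I'd expect instead is a bottom half kicked from the register write (and
from whatever completes a transfer), roughly like this - untested, and the
state/function names are made up:

static void ftapbbrg_dma_bh(void *opaque)
{
    Ftapbbrg020State *s = opaque;
    /* run the enabled channels until they block or complete,
       then update status and raise the IRQ */
    ftapbbrg_run(s);
}

/* at init time */
s->bh = qemu_bh_new(ftapbbrg_dma_bh, s);

/* in the register write handler, instead of qemu_mod_timer() */
qemu_bh_schedule(s->bh);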

 +if (!cpu_physical_memory_is_io(c->src)) {
 +src_map = src_ptr = cpu_physical_memory_map(c->src, &src_len, 0);
 +}
 +if (!cpu_physical_memory_is_io(c->dst)) {
 +dst_map = dst_ptr = cpu_physical_memory_map(c->dst, &dst_len, 1);
 +}

cpu_physical_memory_map might not map the whole region you requested.  This 
will cause badness in the subsequent code.
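
At a minimum the code needs to check how much was actually mapped and cope
with a short mapping.  Something like this (sketch; "want" stands in for
however the channel tracks its transfer size):

hwaddr want = bytes_this_transfer;
hwaddr src_len = want;
src_map = cpu_physical_memory_map(c->src, &src_len, 0);
if (!src_map || src_len < want) {
    /* fall back to cpu_physical_memory_read()/write(), or only process
       src_len bytes now and map the remainder separately */
}
/* ... and remember to cpu_physical_memory_unmap() with the length that
   was actually accessed */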


I suspect a lot of this code can and should be shared with your other DMA 
controller, and probably several of the existing DMA controllers.

Paul



Re: [Qemu-devel] [PATCH v2 05/20] arm: add Faraday FTGMAC100 1Gbps ethernet support

2013-01-25 Thread Paul Brook
 In order to reduce the processing load of the host CPU, the FTGMAC100
 implements TCP, UDP, and IP V4 checksum generation and validation, and
 supports VLAN tagging.

I see no evidence of these features in the code.

 +static void ftgmac100_read_desc(hwaddr addr, void *desc)
 +{
 +int i;
 +uint32_t *p = desc;
 +
 +cpu_physical_memory_read(addr, desc, 16);
 +
 +for (i = 0; i  16; i += 4) {
 +*p = le32_to_cpu(*p);
 +}
 +}

You're relying on the compiler choosing a particular bitfield and structure 
layout. Don't do that, especially when one of the fields is a void*.  This has 
clearly never been tested on a 64-bit host, and void *desc is just plain lazy.
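
The robust pattern is to read the raw 32-bit words and unpack them explicitly
into host-sized fields.  Sketch (the descriptor type and field names here are
illustrative, not taken from your patch):

static void ftgmac100_read_desc(hwaddr addr, Ftgmac100Desc *desc)
{
    uint32_t d[4];

    cpu_physical_memory_read(addr, d, sizeof(d));
    desc->des0 = le32_to_cpu(d[0]);
    desc->des1 = le32_to_cpu(d[1]);
    desc->des2 = le32_to_cpu(d[2]);
    /* guest physical buffer address - keep it as an integer,
       never as a host pointer */
    desc->buf  = le32_to_cpu(d[3]);
}

That behaves the same on 32-bit and 64-bit hosts and makes the guest memory
layout explicit.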

 +buf = s->txbuff.buf + s->txbuff.len;
 +cpu_physical_memory_read(txd.buf, (uint8_t *)buf, txd.len);

Buffer overflow.  In at least two different ways.

 +if (!(s->maccr & MACCR_HT_MULTI_EN)) {
 +printf("[qemu] ftgmac100_receive: mcst filtered\n");
 +return -1;

Looks like stray debug code.  Several other occurrences.

 +case REG_TXPD:
 +case REG_HPTXPD:
 +qemu_mod_timer(s->qtimer, qemu_get_clock_ns(vm_clock) + 1);

Using a timer here is wrong.  Either you should transmit immediately, or you 
should wait for something else to happen.  Delaying by 1ns is never the right 
answer.

Paul



Re: [Qemu-devel] [PATCH v2 06/20] arm: add Faraday FTMAC110 10/100Mbps ethernet support

2013-01-25 Thread Paul Brook
 The FTMAC110 is a high quality 10/100 Ethernet controller 

Which looks largely the same as the other ethernet controller you added in the 
previous patch.

Paul




Re: [Qemu-devel] [PATCH v2 10/20] arm: add Faraday FTSDC010 MMC/SD controller support

2013-01-25 Thread Paul Brook
 +if (!(s->dcr & DCR_WR) && (s->datacnt > 0)) {
 +ret = sd_read_data(s->card)
 +| sd_read_data(s->card) << 8
 +| sd_read_data(s->card) << 16
 +| sd_read_data(s->card) << 24;
 +s->datacnt -= 4;
 +if (s->datacnt <= 0) {
 +s->status |= STR_DAT_END;
 +}

This will fail if datacnt is not a multiple of 4.
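
Something along these lines (sketch, reusing the names from the snippet above)
copes with a trailing partial word:

int i;
uint32_t ret = 0;

for (i = 0; i < 4 && s->datacnt > 0; i++) {
    ret |= (uint32_t)sd_read_data(s->card) << (i * 8);
    s->datacnt--;
}
if (s->datacnt <= 0) {
    s->status |= STR_DAT_END;
}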

Paul



Re: [Qemu-devel] [PATCH v2 14/20] arm: add Faraday FTRTC011 RTC timer support

2013-01-25 Thread Paul Brook
 +qemu_mod_timer(s->qtimer,
 +qemu_get_clock_ns(vm_clock) + get_ticks_per_sec());

This will not work reliably.  You can not rely on timers triggering promptly.  
Plus you're losing the time taken to execute the callback every tick.

Additionally you can calculate values on demand, and only trigger a timer tick 
when an interrupt is actually enabled.  You don't need high precision, so use 
timer_ms rather than timer_ns.
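
i.e. record a base value when the guest programs the counter and derive the
current count from the clock on each read - untested sketch, with the state
field names invented:

static uint32_t ftrtc011_seconds(FtRtc011State *s)
{
    int64_t now_ms = qemu_get_clock_ms(vm_clock);
    return s->base_sec + (now_ms - s->base_ms) / 1000;
}

A timer then only needs to be armed for the next alarm/periodic interrupt
deadline, and only while that interrupt is actually enabled.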

Paul



Re: [Qemu-devel] [PATCH V2 2/6] hw/mdio: Generalize etraxfs MDIO bitbanging emulation (fwd)

2013-01-24 Thread Paul Brook
   It also worries me that there isn't a clean separation between the MDIO
   bus and the bitbang interface.  IMO the bitbang interface should be a
   separate device, and if we're wiring up bitbang interfaces then it
   really should be via standard GPIO pins (aka qemu_irq).
  
  Only the bitbang state machine is in the mdio layer. It says nothing
  about where those signals come from, gpio or otherwise. Not all cases
  will actually be GPIOs. For instance, the smc91c111 has dedicated pins
  for MDIO operations which are not GPIOs, even though the driver has to
  manage the bitbanging.

There's no such thing as a "dedicated pin" managed by software.  That's 
exactly what a GPIO pin is.  It may be that particular pins are usually used 
for a particular purpose, but I don't think that is sufficient reason to 
create a whole new API.  The way to solve that is to give the pins appropriate 
names.  Don't be distracted by the fact that the smc91c111 is two devices (MAC 
and PHY) on the same chip.

  That said, I'm not opposed to changing the model if that is the design
  direction. However, I hope that the series won't be blocked on this
  point. This series moves and enhances existing code. A move to qemu_irq
  should be done as a follow-on patch.
 
 Maybe we should do it like the i2c framework? It does very similar
 things as mdio would need (with a nice split). It addresses Pauls comments
 (I think) and also the split between slaves and the bus. It also makes it
 possible to select PHY model from board code.

Yes.  Though on closer inspection the bitbang I2C module introduces 
bitbang_i2c_set, which I'd prefer to avoid.  This isn't quite as easy as it 
should be because we don't have a nice solution for tristate pins (currently 
modelled as a cross-wired output and input pair).

Paul



Re: [Qemu-devel] [PATCH V2 2/6] hw/mdio: Generalize etraxfs MDIO bitbanging emulation

2013-01-23 Thread Paul Brook
 +#ifdef USE_THIS_DEAD_CODE
 +void mdio_detach(struct qemu_mdio *bus, struct qemu_phy *phy, unsigned int
 addr) +{
 +bus->devs[addr & 0x1f] = NULL;
 +}
 +#endif

This is clearly wrong.


It also worries me that there isn't a clean separation between the MDIO bus 
and the bitbang interface.  IMO the bitbang interface should be a separate 
device, and if we're wiring up bitbang interfaces then it really should be via 
standard GPIO pins (aka qemu_irq). 

Paul



Re: [Qemu-devel] [PATCH] Annotate questionable fallthroughs

2013-01-21 Thread Paul Brook
  diff --git a/disas/cris.c b/disas/cris.c
  +/* XXX: questionable fallthrough */
 
 Inherited from binutils; if you want to clean this up, suggest to do it
 there.

Except that upstream binutils is GPLv3, so this code is effectively orphaned.

Paul



Re: [Qemu-devel] [PATCH] Annotate questionable fallthroughs

2013-01-20 Thread Paul Brook
 I don't think there's much point adding tons of XXX comments
 when a bunch of these aren't actually wrong code. If you want to fix
 this I think a better approach would be more focused patches aimed
 at adding 'break;' or /* fallthrough */ based on actual human
 examination of the surrounding code.

I agree.   I encourage annotation of intentional fall through, but blindly 
pasting the output of an automated tool is liable to cause more harm than 
good.

IMO running code analysis tools is easy.  It's only when you take the time to 
manually inspect and fix the code that this really becomes valuable.

Paul



Re: [Qemu-devel] [PATCH] target-arm: add Faraday ARMv5TE processors support

2013-01-18 Thread Paul Brook
 * ARMv5TE series (FA606TE, FA626TE, FA616TE, FA726TE)
 
 All the single core RISC listed above are included in this patch.
 And there are two Faraday CP15 extensions (AUX and I/D-Scratchpad)
 have been implemented as NOP.

Is a NOP appropriate?  Should you at least read the value back?

 * Confidentiality Notice 
 This electronic message and any attachments may contain
 confidential and legally privileged information or
 information which is otherwise protected from disclosure.
 If you are not the intended recipient,please do not disclose
 the contents, either in whole or in part, to anyone,and
 immediately delete the message and any attachments from
 your computer system and destroy all hard copies.
 Thank you for your cooperation.
 ***

This sort of disclaimer is completely inappropriate for public mailing lists, 
and I'm unwilling to touch anything subject to these restrictions.
As instructed I have deleted all your other email unread.

Paul



Re: [Qemu-devel] [PATCH v1 3/4] hw: Deduce the default machine from the specified CPU model

2012-08-28 Thread Paul Brook
  This changes the driver behavior to choose the default machine
  model based on the CPU being used.  Defaulting the machine this
  way makes it easier to use QEMU as an ISS by just specifying
  the -cpu option since a default machine that is suitable for
  emulating the full ISA can be chosen.
  
  For example, currently on ARM the ARM Integrator/CP board is
  chosen as the default machine when specifying just a CPU.
  However, this doesn't work well when passing -cpu cortex-m3
  since on ARMv7-M processors the NVIC is a part of the architecture
  and is needed to support instructions like SVC.
 
 Personally I'd rather we didn't support a default machine at
 all, at least for ARM. It does matter what board you run on,
 so you need to specify.

A possible compromise is to only accept -cpu if -M is also specified.
 
 Just to pick an obvious example, you can't stick a core
 which supports VFPv4 (the A15 is the only one we have) into
 the integratorcp

Yes you can.

Your OS probably doesn't support it, and you might have trouble persuading the 
OS vendor to support something that doesn't physically exist, but those are 
completely separate problems.

 We could reasonably add patches which made boards error
 out if you tried to use them with unsupported CPUs, I guess.

That suffers from a large fuzzy region containing interesting combinations 
that could/do work, but will probably never be created in silicon.

If done properly the QOM conversion should give you this for free.

Paul



Re: [Qemu-devel] [PATCH v1 3/4] hw: Deduce the default machine from the specified CPU model

2012-08-28 Thread Paul Brook
  Just to pick an obvious example, you can't stick a core
  which supports VFPv4 (the A15 is the only one we have) into
  the integratorcp
  
  Yes you can.
 
 No you can't. integratorcp.c doesn't create the parts of the CPU
 which live in QEMU's 'a15mpcore_priv' device, so the resulting
 mess is liable to just fall over. If anybody reports bugs in
 QEMU in such a configuration I will tell them to go away and
 use a supported configuration instead.

The A15 core itself will work just fine.  The core is completely independent 
of the interrupt controller.  Unlike the M profile cores where the NVIC is 
inherently part of the CPU exception handling mechanism.  

Paul



Re: [Qemu-devel] [PATCH 0/3] Drop default SD card creation

2012-08-16 Thread Paul Brook
 On 16 August 2012 15:11, Markus Armbruster arm...@redhat.com wrote:
  Peter Maydell peter.mayd...@linaro.org writes:
  As suggested in the recent discussion on Markus' patchset to suppress
  unused default drives, this patchset cleans up the omap and pxa2xx
  
  SD card controllers to behave like the other controllers:
   * the init function looks for the next IF_SD drive
   * if there isn't one, we start up as a controller with no card
   
 present
  
  Isn't this an incompatible change?  Before, you get an SD card reader
  backed by an empty BDS default.  You can load/unload cards in the
  monitor.  After, you get an SD card reader that isn't backed by a BDS by
  default.  Device models prepared for that can treat it as permanently
  empty.
 
 Hmm, yes, but most of our SD controllers already act that way.
 We should probably fix them all...
 
 So what's the block layer equivalent of drive_get_next() that always
 returns us something we can get a bdrv from?

I think this may be the wrong way to fix this.  SD cards don't really have 
removable media, in the same way that a SCSI HDD is generally not removable 
media - you hotplug the whole drive.

Don't we really want a proper QOM device for the SD card, with hotplug 
support?

Paul



Re: [Qemu-devel] [PATCH 0/3] Drop default SD card creation

2012-08-16 Thread Paul Brook
 On 16 August 2012 16:17, Markus Armbruster arm...@redhat.com wrote:
  Paul Brook p...@codesourcery.com writes:
   I think this may be the wrong way to fix this.  SD cards don't really
   have removable media, in the same way that a SCSI HDD is generally
   not removable media - you hotplug the whole drive.
  
  If an SD card device doesn't support media change, then the device model
  should:
  
  1. Insist on non-null, non-empty BDS on initialization (this ensures we
  got media)
 
 This seems to be trying to draw a distinction that I don't understand.
 The SD card *is* the media, it's the physical object you stuff in and
 out of the slot on the side of your device.

It's the difference between "not present" and "present but empty".

In the case of an SD card the media (i.e. flash) is generally not separable 
from the SD device - I don't remember if the SD spec even supports removable 
media.  The same is true for most hard disks - the disk platters are an 
integral part of the drive.  In these cases the present but empty state does 
not exist.

c.f. cdrom drives where the concept of an empty device is clearly very 
different to an absent device.
 
 I guess that that means that change SD card should ideally be modelled
 as destroy the sd.c device object and create a new one and reconnect
 it to the controller but we don't really model things quite in the
 right way to permit that, so we fake it up at the moment by allowing
 the underlying BDS to change its idea of media. This works except
 that if the initial state is no card present we have a NULL BDS rather
 than one which is non-NULL but has no media at the moment.
 
 (I think Paul is suggesting that we should fix our model to
 move closer to this idea rather than faking things...)

I think we have two options:

A) Model the SD slot and card explicitly as separate objects.  Effectively the 
same way we have a scsi bus with scsi drives connected to it.  Cards can be 
hotplugged.  A card has a block device that is not optional, and not 
removable.

I don't know how well our UI handles this.  It may well require user-visible 
changes.

B) Continue to effectively model just the SD slot, with the card being 
implicit.  The slot should always create/find a [removable] block device.  An 
empty block device is modelled as an absent card.  A slot without a block 
device is IMO a bug.

This can create awkwardness because there's no good way to expose
card-specific properties (we don't currently implement any interesting ones).
These should really be per-card, i.e. they may change when you change the
contents.  However the only thing we have to attach them to is the long-lived
slot object.  e.g. in some cases the data may be for either an SD or an SDHC
card; we currently make a guess.  The only place to attach a user override is
the SD slot, and that must be determined at machine creation, not when you
associate data with the block device.
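
To make (A) concrete, a rough sketch (invented names, not real QOM API) - the
point being that per-card properties hang off the card object, not the slot:

#include <stdbool.h>
#include <stddef.h>

/* Untested sketch, invented names - not actual QEMU API. */
typedef struct SDCard {
    void *blk;           /* block backend: mandatory, not removable */
    bool high_capacity;  /* per-card property (SD vs SDHC guess lives here) */
} SDCard;

typedef struct SDSlot {
    SDCard *card;        /* NULL means no card inserted */
} SDSlot;

/* "Card change" becomes hotplug: unplug one card object, plug in another. */
static void sd_slot_plug(SDSlot *slot, SDCard *card)
{
    slot->card = card;
}

static void sd_slot_unplug(SDSlot *slot)
{
    slot->card = NULL;
}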

Paul



Re: [Qemu-devel] [PATCH 0/3] Drop default SD card creation

2012-08-16 Thread Paul Brook
 One way is to treat the SD card as a hot-pluggable device.  A card
 reader device model provides a connector for the SD card device model.
 The SD card device model is backed by a block backend, with
 non-removable medium.  Card change is device hot plug.
... 
 Note that we could model floppies and CD-ROMs that way, too.

That's a good point.  e.g. for a CD-ROM I'm pretty sure there's a bit somewhere 
that tells you whether it's a pressed CD or a CD-R.  Attaching this 
information to a cdrom-disk device (hotplugged into a cdrom-drive) seems to 
make sense.

Paul



Re: [Qemu-devel] [PATCH 00/23] Suppress unused default drives

2012-08-09 Thread Paul Brook
  *can* use it for something entirely else, if=sd notwithstanding:
  (qemu) device_add lsi
  (qemu) device_add scsi-cd,drive=sd0
 
 If/when we get a PCI SD card controller model, would all the PCI
 using machines need to be added to take the 'no default sd card'
 setting out again, or does it get overridden anyway if you say
 and I'd like an sd controller?

For SD cards we shouldn't need this to start with.  Why are we creating SD 
cards when there's no host controller to connect them to?  Surely we should be 
able to figure that out automatically.  This is especially important for board 
variants with multiple SD interfaces.

Is this all a hangover from before we had proper -drive options?

Paul



[Qemu-devel] [PATCH] Fix ALSA configure check

2012-07-31 Thread Paul Brook
Recent gcc notices that the ALSA configure check uses an uninitialized
variable, causing spurious failures.  Adjust the testcase to avoid this.

Signed-off-by: Paul Brook p...@codesourcery.com
---
 configure |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index c65b5f6..9152798 100755
--- a/configure
+++ b/configure
@@ -1890,7 +1890,7 @@ for drv in $audio_drv_list; do
 case $drv in
 alsa)
 audio_drv_probe $drv alsa/asoundlib.h -lasound \
-snd_pcm_t **handle; return snd_pcm_close(*handle);
+snd_pcm_t *handle = NULL; return snd_pcm_close(handle);
 libs_softmmu=-lasound $libs_softmmu
 ;;
 
-- 
1.7.10.4




[Qemu-devel] [PATCH] target-arm: Fix CP15 based WFI

2012-07-01 Thread Paul Brook
The coprocessor register rework broke cp15 based WFI instructions.
We incorrectly fall through to the normal register write case, which
adds a spurious forced block termination.  We've already done
a special version of this (DISAS_WFI), so return immediately.

Signed-off-by: Paul Brook p...@codesourcery.com
---
 target-arm/translate.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index a2a0ecd..f39b9ca 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -6236,7 +6236,7 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
             }
             gen_set_pc_im(s->pc);
             s->is_jmp = DISAS_WFI;
-            break;
+            return 0;
         default:
             break;
         }
-- 
1.7.10




Re: [Qemu-devel] [RFC PATCH 0/4] virtio-rng and RngBackend infrastructure (v2)

2012-07-01 Thread Paul Brook
 This series depends on my QOM -object series that I just posted.
 
 In Amit's thread on virtio-rng, danpb mentioned that we really ought to
 have a proper RNG backend infrastructure and of course he's correct on
 that.
 
 Now that we have QOM, I wanted to demonstrate how we can use QOM to
 construct a complete backend without adding any new infrastructure.
 
 I've now implemented a urandom and egd backend and tested them.  I think
 the first three patches are ready to go.

I never really understood why this exists in the first place.  It's a simple 
readonly character device.  IMHO you should be using virtio-serial.  This is 
virtio-console vs. virtio-serial all over again.
The only thing close to a reason I've heard is that the guest OS is 
incompetent and can't source random data from a serial device.

Even accepting the pointless guest device, I see absolutely no reason to have 
special infrastructure for this within qemu.  Character devices do everything 
you need.  Creating another read-stream-of-data API is needless duplication 
and only going to reintroduce bugs we already fixed in the character device 
layer.
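
To illustrate what I mean by a read stream: the whole thing boils down to
something like this (host-side sketch, nothing QEMU specific):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: sourcing entropy is just reading bytes from a character device. */
int main(void)
{
    unsigned char buf[32];
    int fd = open("/dev/urandom", O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    ssize_t n = read(fd, buf, sizeof(buf));
    printf("read %zd random bytes\n", n);
    close(fd);
    return 0;
}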

Paul



Re: [Qemu-devel] [RFC PATCH 1/1] linux-user: Probe the guest base for shared objects when needed

2012-06-27 Thread Paul Brook
  openSUSE uses a version patched so that IIUC 3G are reserved.
  Just today this failed on a system where swap got disabled and the
  mmap() thus failed.
 
 Err... why?  We map with MAP_NORESERVE, so swap shouldn't matter...

I can't say if it's the same cause, but we fail with ulimit -v 4046848.

Incidentally, it seems strange that we only reserve 0xf700 bytes, not 
the full 4G.

Paul



Re: [Qemu-devel] [RFC PATCH 1/1] linux-user: Probe the guest base for shared objects when needed

2012-06-27 Thread Paul Brook
 On 28.06.2012, at 02:06, Paul Brook wrote:
  openSUSE uses a version patched so that IIUC 3G are reserved.
  Just today this failed on a system where swap got disabled and the
  mmap() thus failed.
  
  Err... why?  We map with MAP_NORESERVE, so swap shouldn't matter...
  
  I can't say if it's the same cause, but we fail with ulimit -v 4046848.
  
  Incidentally, it seems a strange that we only reserve 0xf700 bytes,
  not the full 4G.
 
 Uh, I think that was because of the vdso shared page that is allocated on
 top of -R.

That can't be right.  The whole point of -R is that it defines all the guest 
accessible virtual address space.  The surrounding space is liable to be used 
by something else, and we must not make any assumptions about it.

Further inspection shows that guest_validate_base contains some extremely 
bogus code.

If the guest needs something at the top of its address space then we need to 
offset address zero within the block, and ensure accesses wrap appropriately.
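
For reference, the reservation itself is the easy part - something like this
untested sketch (64-bit host assumed); the hard part is deciding what has to
live inside the reserved block and where guest address zero sits within it:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>

/* Untested sketch: reserve the guest VA range up front with PROT_NONE and
 * MAP_NORESERVE, then decide where guest address 0 goes within the block
 * (not necessarily at the start, if the guest needs pages near the top of
 * its address space).  */
int main(void)
{
    uint64_t reserved_va = 0x100000000ULL;   /* a full 4G of guest space */
    void *block = mmap(NULL, reserved_va, PROT_NONE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

    if (block == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("reserved %llu bytes at %p\n",
           (unsigned long long)reserved_va, block);
    return 0;
}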

Paul



Re: [Qemu-devel] [RFC PATCH 1/1] linux-user: Probe the guest base for shared objects when needed

2012-06-27 Thread Paul Brook
 'guest_validate_base' is currently called for three reasons: (1) in main.c
 when using -B, (2) in main.c when using -R after mapping the reserved va
 region, and (3) and when probing for a guest base in probe_guest_base.
 
 For case (1) I suppose things are pretty much the same -- we just need to
 map the extra region when needed (e.g. for the ARM kernel helpers).

Yes.
 
 For case (2) maybe we can do a probing similar to what I mentioned here
 [1], but taking into account what you stated above and ensuring that the
 probing finds a single region for the request va region size and any
 needed extra stuff.

Something like that, yes. I suspect there are better ways to implement it 
though.  In principle your patch is making (2) a variant of (3). Instead of 
probing for the segments covered by the image we probe for the reserved 
regions (e.g. for ARM [0-reserved_va, 0x - 0x]).  A good 
implementation should automagically DTRT for both 32-bit and 64-bit hosts.

 Case (3) is mostly the same as (2) but we are probing for a guest base with
 a region size deduced from looking at the image we are loading.  I suppose
 it is still OK to map two regions here.  The single region only applies to
 -R?

I'd say (3) is more similar to (1).  There's no fundamental reason why -R has 
to allocate a single block.  In all cases we should be checking the same thing 
- are the addresses we need available on the host?  Having different code 
paths calling guest_validate_base, etc. for different reasons makes me think 
we're doing it wrong :-)

Paul



Re: [Qemu-devel] [RFC] QOMification of AXI stream

2012-06-08 Thread Paul Brook
 Im looking to QOMifying and refactoring the AXI stream interfaces
 between the AXI ethernet and AXI DMA modules. I could use some
 guidance on how to do this as I can think of about 6 different
 solutions. Sources are hw/xilinx_axienet.c and hw/xilinx_axidma.c.
 
...

 So what im proposing is AXI stream is implemented as a unidirectional
 point to point bus. The xilinx ethernet system would consist of two of
 these buses one for tx, one for rx.

I thought the idea was that with QOM the bus/device model would go away.
The DMA controller implements an AXIDMA interface, and the device has an AXIDMA 
link that's connected to that interface.

Of course we then hit the usual problem with QOM that we can only link to 
objects, and it's impossible to expose multiple interfaces of the same type. 
The DMA controller probably needs a proxy object for each DMA channel.

Paul



Re: [Qemu-devel] [RFC] QOMification of AXI stream

2012-06-08 Thread Paul Brook
 On 8 June 2012 10:13, Paul Brook p...@codesourcery.com wrote:
  Of course we then hit the usual problem with QOM that we can only link to
  objects, and it's impossible to expose multiple interfaces of the same
  type.
 
 I'm pretty sure Anthony claimed this was entirely possible --
 presumably that's how Pins are going to work.

Really?  Every time I've talked to him I've got the opposite impression.  Part 
of the response has been that interrupt pins are the only case where this 
actually occurs, so it's not worth fixing properly.  I disagree with this 
assessment.

Given we do need to expose multiple instances of the same interface, I see a 
few different options:

- Create a proxy object for each receiver which multiplexes onto a different 
interface on the main object (sketch below).  For interrupt pins this 
basically means making the qemu_irq object part of the device tree, and having 
the actual device implement qemu_irq_handler (see hw/irq.h).  The equivalent 
of qemu_irq (i.e. irq.c/h) needs to be created for every duplicated interface. 
It's worth noting that qemu_irq is about as simple as it gets: a single 
unidirectional call.

- Make some form of handle an explicit part of the API.  IMO this is a really 
bad idea, and a step backwards.  In the qemu_irq case it means that the device 
raising the interrupt needs to know how the interrupt controller enumerates 
its input pins, and which one it's connected to.  Instead of making 
connections via nice clean links we have a link and some other device-specific 
information.  It's worse than the old callback+opaque pointer pair because the 
user [machine description] has to provide that device-specific additional 
value.

- Link to properties, not objects.  This probably ends up similar to the first 
option, except with a framework and consistent implementation across different 
interfaces.
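
For the first option, the shape is basically the existing qemu_irq idea
repeated per duplicated interface; a rough sketch with invented names:

/* Sketch, invented names: one small proxy object per input pin, all
 * multiplexing onto a single handler on the real device - essentially
 * the qemu_irq pattern.  */
typedef void (*pin_handler)(void *opaque, int n, int level);

typedef struct Pin {
    pin_handler handler;   /* implemented once, by the owning device */
    void *opaque;          /* the owning device */
    int n;                 /* which instance of the interface this is */
} Pin;

static void pin_set(Pin *pin, int level)
{
    pin->handler(pin->opaque, pin->n, level);
}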

Paul



Re: [Qemu-devel] [RFC] QOMification of AXI stream

2012-06-08 Thread Paul Brook
  So what im proposing is AXI stream is implemented as a unidirectional
  point to point bus. The xilinx ethernet system would consist of two of
  these buses one for tx, one for rx.
  
  I thought the idea was that with QOM the bus/device model would go away.
  The DMA controller implements an AXIDMA interface, and the device has a
  AXIDMA link that's connected to that interface.
  
  Of course we then hit the usual problem with QOM that we can only link to
  objects, and it's impossible to expose multiple interfaces of the same
  type.
 
 No, QOM supports multiple inheritance of interfaces so you absolutely can
 inherit from multiple different interfaces.

But you can't have multiple instances of the same interface.  And the 
interfaces must be stateless.  Hence you need the proxy object.

Paul



Re: [Qemu-devel] [RFC] QOMification of AXI stream

2012-06-08 Thread Paul Brook
  Of course we then hit the usual problem with QOM that we can only link
  to objects, and it's impossible to expose multiple interfaces of the
  same type.
  
  I'm pretty sure Anthony claimed this was entirely possible --
  presumably that's how Pins are going to work.
  
  Really?  Every time I've talked to him I've got the opposite impression. 
  Part of the response has been that interrupt pins are the only case
  where this actually occurs, so It's not worth fixing properly.
 
 I think it depends on your definition of properly.
 
 There's really only three concepts in QOM that matter for this discussion:
 1) objects 2) children and 3) links.
 
 There is absolutely no difference between a Pin object and a SerialState
 object. They both are first-class objects are far as QOM and concerned. 
 Both can have links to other objects.
 
 The most common way for other objects to create objects is via children.  A
 device could have a bunch of Pin child objects with that being the sole
 communication mechanism with the outside world.

And those pin objects would presumably communicate back to the device via some 
as-yet unimplemented PinMultiplex interface link?  Or are you expecting the 
Pin objects to have an API call that allows the device to register an 
arbitrary QEMUBH (or equivalent)?

 A device could also have a 'PCISocket' child object (which inherits from
 PCIDevice) in order to expose a PCI interface to the world.
 
 For most bus-based devices, I think the above is poor design.  But that's
 my opinion from a modeling PoV, QOM doesn't have an opinion from an
 infrastructure PoV.

So what is a good design?  Are you hoping most of your interfaces are 
stateless, so they can be implemented directly on the device object?

Paul



Re: [Qemu-devel] [PATCH V4 0/5] Ehnahced SSI bus support + M25P80 SPI flash + Xilinx SPI controller

2012-06-06 Thread Paul Brook
 On 5th April, when we first RFC'd our SPI layer support, you said to Peter:
 
 ==
 I don't believe there is any difference between SSI and SPI.  It's the
 exact same thing - the same way that many devices support a two-wire
 interface that is actually just I2C with a different name.
 
 The behavior of the CS pin varies between devices.  It sounds like you need
 a bit of extra logic not present in the current ssi code.  You should fix
 that, not invent a whole new bus.
 ==
 
 He's gone and done exactly that, indeed generalised it with the
 proposed changes to SSI.

No.  There are two changes.  Modelling the CS line in the SPI bus, and having 
SSI be a multipoint bus rather than point-point.

Paul



Re: [Qemu-devel] [PATCH V4 0/5] Ehnahced SSI bus support + M25P80 SPI flash + Xilinx SPI controller

2012-06-06 Thread Paul Brook
  I'm still not convinced modelling this as a multipoint bus is a good
  idea.  If nothing else you've failed to model the case where multiple
  slaves are selected simultanously.
 
 The bus can easily be changed such that multiple devices are
 selectable at once to get your desired multi device behaviour. AFAICT
 though nothing in QEMU behaves like this ATM.

By my reading your xilinx device *should* behave like this.
 
   Given the chip selects are actual wires, not part of
  the bus itself, I think multiple point-point busses are a better fit.
  
  For the stellaris device we still have the synthetic mux device and
  intermediate bus.
 
 Yes, because in your stellaris architecture, the SSI controller
 (pl022) is point to point so that exactly matches the hardware.
 
 In the microblaze controller in this series, the controller has
 inbuilt muxing with one-hot CS behavior. To implement with point to
 point, I would have to dynamically create a number of sub-busses
 (driven by a qdev property). I would also have to have a device within
 a device to model the internal mux which increases my code volume
 significantly. Also you end up with this little piece of ugliness in
 your machine model and device model:

I don't see why you would need a separate mux device.

One of my issues is that you've made this a device property.  A SPI device has 
no concept of address.  This really is a property of the controller.

 The multi-slave bus is a direct superset on point-to-point. There is
 nothing stopping anyone from using it as p2p. Its just things are very
 ugly for SPI controllers with integrated muxes to treat everything as
 point to point.

IMHO the resulting device tree is better with multiple point-point links.  I'm 
hoping the hardcoded board descriptions (i.e. everything using 
ssi_create_slave) will go away sooner rather than later.  Having two m25p80 
devices that are indistinguishable apart from one minor property seems 
undesirable.

Paul



Re: [Qemu-devel] [PATCH V4 0/5] Ehnahced SSI bus support + M25P80 SPI flash + Xilinx SPI controller

2012-06-04 Thread Paul Brook
 Patch 1 Enhances SSI bus support to properly support multiple attached
 devices. An api is provided for SSI/SPI masters to select a particular
 device attached to the bus.
 
 Patch 2 is a device model for the m25p80 style SPI flash chip.
 
 Patch 3 is  the Xilinx XPS SPI contoller. Its a sysbus device that
 instantiates a ssi bus, and interfaces the two (as per the controllers
 functionality)
 
 Patch 4 instantiates the XPS SPI controller in the petalogix ML605
 reference platform and connects two m25p80s to it.
 
 Patch 5 updates the stellaris machine model to use the multi slave SSI
 support

I'm still not convinced modelling this as a multipoint bus is a good idea.  If 
nothing else you've failed to model the case where multiple slaves are 
selected simultaneously.  Given the chip selects are actual wires, not part of 
the bus itself, I think multiple point-point busses are a better fit.

For the stellaris device we still have the synthetic mux device and 
intermediate bus.
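
Concretely, the difference is between per-slave select wires and a single
selected-slave index; a rough sketch (invented names) of the former:

#include <stdbool.h>

/* Sketch, invented names: chip selects modelled as one wire per slave.
 * Any combination can be asserted, including several at once - something
 * a single "currently selected slave" index cannot express.  */
enum { NUM_SLAVES = 4 };

typedef struct SPISlaveSketch {
    void (*set_cs)(struct SPISlaveSketch *s, bool level);
    bool cs;
} SPISlaveSketch;

static void spi_update_cs(SPISlaveSketch *slaves, unsigned cs_bits)
{
    for (int i = 0; i < NUM_SLAVES; i++) {
        bool level = (cs_bits >> i) & 1;   /* one wire per slave */
        slaves[i].cs = level;
        if (slaves[i].set_cs) {
            slaves[i].set_cs(&slaves[i], level);
        }
    }
}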


Paul



Re: [Qemu-devel] [PATCH qom-next 57/59] cpu: Introduce mandatory tlb_flush callback

2012-05-31 Thread Paul Brook
  +void cpu_tlb_flush(CPUState *cpu, bool flush_global)
  +{
  +    CPUClass *cc = CPU_GET_CLASS(cpu);
  +
  +    g_assert(cc->tlb_flush != NULL);
  +
  +    cc->tlb_flush(cpu, flush_global);
  +}
  
  This needs to be able to call tlb_flush() itself
  rather than having to have every single subclass of CPUState
  implement an identical tlb_flush method. You could do this
  if there was a CPU_GET_ENV()...
 
 Which is exactly the point: CPUState does not know about the
 target-specific env. And CPU_GET_ENV() is just plain wrong
 conceptually because it adds yet another cpu.h dependency.

Maybe so, but having every single target implement its own copy of the exact 
same target-independent wrapper seems even more wrong.

 There's a separation between old code using env and new, clean code:
 Just like Anthony doesn't want old concepts rewritten with the new type
 (cf. object_realize() discussion) I don't want the old cpu.h #define
 mess leaking into code that I'm redesigning specifically to get rid of
 that target-*/cpu.h dependency in favor of a single qemu/cpu.h.
 qom/cpu.c is by definition not compiled per target so it cannot contain
 any target-specific code.

At minimum it should be clearly documented[1] that this is a transitional 
hack, and how it should be removed.  There have already been two posts in this 
thread suggesting this is a feature, implying that this operation is somehow 
target specific.  I think the opposite is true:  This is a target agnostic 
detail of the TCG implementation, and implementing architecturally defined 
MMU/TLB behavior here is actively wrong.
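
To show the shape I mean (hypothetical names, not the real QOM types): one
common default rather than N identical per-target methods:

/* Hypothetical sketch: give the base class a working default so targets
 * only override tlb_flush when they genuinely differ, instead of N copies
 * of an identical wrapper.  */
typedef struct CPU CPU;

typedef struct CPUClassSketch {
    void (*tlb_flush)(CPU *cpu, int flush_global);  /* optional override */
} CPUClassSketch;

static void tcg_tlb_flush_default(CPU *cpu, int flush_global)
{
    /* the target-independent TCG TLB flush would go here */
    (void)cpu;
    (void)flush_global;
}

static void cpu_tlb_flush_sketch(CPU *cpu, CPUClassSketch *cc, int flush_global)
{
    if (cc->tlb_flush) {
        cc->tlb_flush(cpu, flush_global);
    } else {
        tcg_tlb_flush_default(cpu, flush_global);
    }
}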

Paul

[1] In the code, not the commit message.  Commit logs are not documentation.  
Commit logs are transient information valid only when the patch is applied.  
After that point they become archeological evidence, and you should not expect 
subsequent developers to be aware of them.



[Qemu-devel] [PATCH 0/2] SPI SDcard fixes v2

2012-04-30 Thread Paul Brook
Recent testing showed the SPI mode SD card emulation (ssi-sd.c) doesn't
actually work if the guest tries to use features from the SD Physical
Layer Specification v2 (this includes SDHC).

This series replaces my previous single patch.

Paul Brook (2):
  Fix SPI SD card command responses
  hw/sd.c: Implement CMD58

 hw/sd.c |  140 +--
 hw/sd.h |   17 
 hw/ssi-sd.c |   83 +++
 3 files changed, 138 insertions(+), 102 deletions(-)

-- 
1.7.10




[Qemu-devel] [PATCH 2/2] hw/sd.c: Implement CMD58

2012-04-30 Thread Paul Brook
Implement CMD58.  This command is only valid in SPI mode, and is required when
we implement CMD8.  Most of the code is already there; we just need to
trigger it.

Signed-off-by: Paul Brook p...@codesourcery.com
---
 hw/sd.c |   15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/hw/sd.c b/hw/sd.c
index 220562e..952d5d8 100644
--- a/hw/sd.c
+++ b/hw/sd.c
@@ -141,7 +141,7 @@ static const sd_cmd_type_t sd_cmd_type[64] = {
 sd_ac,   sd_ac,   sd_none, sd_none, sd_none, sd_none, sd_ac,   sd_none,
 sd_none, sd_none, sd_bc,   sd_none, sd_none, sd_none, sd_none, sd_none,
 sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_ac,
-sd_adtc, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none,
+sd_adtc, sd_none, sd_bcr,  sd_none,   sd_none, sd_none, sd_none, sd_none,
 };
 
 static const sd_cmd_type_t sd_acmd_type[64] = {
@@ -1223,6 +1223,19 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
             break;
         }
         break;
+    case 58:    /* CMD58: READ_OCR */
+        if (!sd->spi) {
+            goto bad_cmd;
+        }
+        switch (sd->state) {
+        case sd_idle_state:
+        case sd_transfer_state:
+            return sd_r3;
+
+        default:
+            break;
+        }
+        break;

     default:
     bad_cmd:
-- 
1.7.10




[Qemu-devel] [PATCH 1/2] Fix SPI SD card command responses

2012-04-30 Thread Paul Brook
When in SPI mode, we give a bogus response to CMD8 (part of the SD physical
spec v2).  This command should return both the status byte and the
register value.

The current code returns long status words from sd.c, then parses & translates
those to SPI status bytes in ssi-sd.c.  For CMD8 (and CMD58 to follow)
this gets messy, with both parts requiring command-specific knowledge.
We already have magic SPI-mode behavior in sd.c, so we may as well just
generate the correct response there.

Signed-off-by: Paul Brook p...@codesourcery.com
---
 hw/sd.c |  125 ---
 hw/sd.h |   17 
 hw/ssi-sd.c |   83 +++
 3 files changed, 124 insertions(+), 101 deletions(-)

diff --git a/hw/sd.c b/hw/sd.c
index 07eb263..220562e 100644
--- a/hw/sd.c
+++ b/hw/sd.c
@@ -52,6 +52,7 @@ typedef enum {
 sd_r7,/* Operating voltage */
 sd_r1b = -1,
 sd_illegal = -2,
+sd_r1_long = -3, /* Two byte status in SPI mode.  */
 } sd_rsp_type_t;
 
 struct SDState {
@@ -342,24 +343,93 @@ static int sd_req_crc_validate(SDRequest *req)
     return sd_crc7(buffer, 5) != req->crc; /* TODO */
 }

-static void sd_response_r1_make(SDState *sd, uint8_t *response)
+
+/* Make SPI status word from full card status.  Most commands only use
+   the high byte.  */
+static uint16_t sd_get_spi_status(SDState *sd, uint32_t cardstatus)
+{
+    uint16_t status = 0;
+
+    if (((cardstatus >> 9) & 0xf) < 4)
+        status |= SPI_SDR_IDLE;
+    if (cardstatus & ERASE_RESET)
+        status |= SPI_SDR_ERASE_RESET;
+    if (cardstatus & ILLEGAL_COMMAND)
+        status |= SPI_SDR_ILLEGAL_COMMAND;
+    if (cardstatus & COM_CRC_ERROR)
+        status |= SPI_SDR_COM_CRC_ERROR;
+    if (cardstatus & ERASE_SEQ_ERROR)
+        status |= SPI_SDR_ERASE_SEQ_ERROR;
+    if (cardstatus & ADDRESS_ERROR)
+        status |= SPI_SDR_ADDRESS_ERROR;
+    if (cardstatus & CARD_IS_LOCKED)
+        status |= SPI_SDR_LOCKED;
+    if (cardstatus & (LOCK_UNLOCK_FAILED | WP_ERASE_SKIP))
+        status |= SPI_SDR_WP_ERASE;
+    if (cardstatus & SD_ERROR)
+        status |= SPI_SDR_ERROR;
+    if (cardstatus & CC_ERROR)
+        status |= SPI_SDR_CC_ERROR;
+    if (cardstatus & CARD_ECC_FAILED)
+        status |= SPI_SDR_ECC_FAILED;
+    if (cardstatus & WP_VIOLATION)
+        status |= SPI_SDR_WP_VIOLATION;
+    if (cardstatus & ERASE_PARAM)
+        status |= SPI_SDR_ERASE_PARAM;
+    if (cardstatus & (OUT_OF_RANGE | CID_CSD_OVERWRITE))
+        status |= SPI_SDR_OUT_OF_RANGE;
+    /* ??? Don't know what Parameter Error really means, so
+       assume it's set if the second byte is nonzero.  */
+    if (status & 0xff)
+        status |= SPI_SDR_PARAMETER_ERROR;
+
+    return status;
+}
+
+static int sd_response_r1_make(SDState *sd, uint8_t *response)
 {
     uint32_t status = sd->card_status;
     /* Clear the clear on read status bits */
     sd->card_status &= ~CARD_STATUS_C;

-    response[0] = (status >> 24) & 0xff;
-    response[1] = (status >> 16) & 0xff;
-    response[2] = (status >> 8) & 0xff;
-    response[3] = (status >> 0) & 0xff;
+    if (sd->spi) {
+        response[0] = sd_get_spi_status(sd, status) >> 8;
+        return 1;
+    } else {
+        response[0] = (status >> 24) & 0xff;
+        response[1] = (status >> 16) & 0xff;
+        response[2] = (status >> 8) & 0xff;
+        response[3] = (status >> 0) & 0xff;
+        return 4;
+    }
+}
+
+/* Only used in SPI mode.  */
+static int sd_response_r1_long_make(SDState *sd, uint8_t *response)
+{
+    uint32_t status = sd->card_status;
+    /* Clear the clear on read status bits */
+    sd->card_status &= ~CARD_STATUS_C;
+    status = sd_get_spi_status(sd, status);
+    response[0] = status >> 8;
+    response[1] = status & 0xff;
+    return 2;
 }

-static void sd_response_r3_make(SDState *sd, uint8_t *response)
+static int sd_response_r3_make(SDState *sd, uint8_t *response)
 {
-    response[0] = (sd->ocr >> 24) & 0xff;
-    response[1] = (sd->ocr >> 16) & 0xff;
-    response[2] = (sd->ocr >> 8) & 0xff;
-    response[3] = (sd->ocr >> 0) & 0xff;
+    int len = 4;
+
+    if (sd->spi) {
+        len = 5;
+        *(response++) = sd_get_spi_status(sd, sd->card_status) >> 8;
+    }
+    *(response++) = (sd->ocr >> 24) & 0xff;
+    *(response++) = (sd->ocr >> 16) & 0xff;
+    *(response++) = (sd->ocr >> 8) & 0xff;
+    *(response++) = (sd->ocr >> 0) & 0xff;
+
+    return len;
 }

 static void sd_response_r6_make(SDState *sd, uint8_t *response)
@@ -379,12 +449,20 @@ static void sd_response_r6_make(SDState *sd, uint8_t *response)
     response[3] = status & 0xff;
 }

-static void sd_response_r7_make(SDState *sd, uint8_t *response)
+static int sd_response_r7_make(SDState *sd, uint8_t *response)
 {
-    response[0] = (sd->vhs >> 24) & 0xff;
-    response[1] = (sd->vhs >> 16) & 0xff;
-    response[2] = (sd->vhs >>  8) & 0xff;
-    response[3] = (sd->vhs >>  0) & 0xff;
+    int len = 4;
+
+    if (sd->spi) {
+        len = 5;
+        *(response++) = sd_get_spi_status(sd, sd->card_status) >> 8;
+    }
+*(response

Re: [Qemu-devel] [PATCH] Fix SPI SD emulation

2012-04-30 Thread Paul Brook
  If this command could be issued in transfer state maybe in addition to
  IDLE_STATE you also need to set other bits (ADDRESS_ERROR,
  COM_CRC_ERROR, ILLEGAL_COMMAND, ERASE_SEQ_ERROR) in MSB of R3 response?
  
  In theory, yes.  I was thinking of a follow-up patch to move the spi
  status byte generation into sd.c.  Maybe I should do that first.
 
 Do you mean the one in ssi-sd.c? That would be nice I think, a bit less
 confusing.

I posted v2 of the patch earlier today:

http://lists.nongnu.org/archive/html/qemu-devel/2012-04/msg04214.html

Paul



Re: [Qemu-devel] [PATCH V3 11/13] SD card: introduce spi property for SD card objects

2012-04-30 Thread Paul Brook
 And drop passing is_spi argument to SDCardClass::init function.
 spi property could be set while SD card is in IDLE state. It defaults to
 false.

Why?  This isn't something that should be under user or board control.  The SD 
card object is an implementation detail.  It's something that's part of the 
host controller, i.e. either an SD host controller or an SPI bus device.

Do you have an example of why you would need to defer this decision?

If you want to separate instantiation of the SD card from the controller (e.g. 
to implement SDIO devices) then you need an SD bus, plus an SD card device.  
Something like we do for SPI-connected cards.

Paul



Re: [Qemu-devel] [PATCH 10/14] target-arm: Move feature register setup to per-CPU init fns

2012-04-30 Thread Paul Brook
 Move feature register value setup to per-CPU init functions.

 +    env->cp15.c0_c1[0] = cpu->id_pfr0;
 +    env->cp15.c0_c1[1] = cpu->id_pfr1;
 +    env->cp15.c0_c1[2] = cpu->id_dfr0;
 +    env->cp15.c0_c1[3] = cpu->id_afr0;
 +    env->cp15.c0_c1[4] = cpu->id_mmfr0;
 +    env->cp15.c0_c1[5] = cpu->id_mmfr1;
 +    env->cp15.c0_c1[6] = cpu->id_mmfr2;
 +    env->cp15.c0_c1[7] = cpu->id_mmfr3;
 +    env->cp15.c0_c2[0] = cpu->id_isar0;
 +    env->cp15.c0_c2[1] = cpu->id_isar1;
 +    env->cp15.c0_c2[2] = cpu->id_isar2;
 +    env->cp15.c0_c2[3] = cpu->id_isar3;
 +    env->cp15.c0_c2[4] = cpu->id_isar4;
 +    env->cp15.c0_c2[5] = cpu->id_isar5;

Why are we copying these values? All these registers are readonly, so the 
duplication seems wrong.  Shouldn't we be using cpu->whatever 
everywhere?

I feel like I've asked this before, but don't remember seeing an answer.


Also, I'd prefer that id_isar5 were explicitly initialized, rather than relying 
on it being implicitly zero.  Bugs in an earlier patch series show how easy it 
is to accidentally miss a register.  IMO it's worth distinguishing a defined 
register that happens to be zero from a register this core doesn't have.  
Overall I'm not convinced that the new open-coded initialization is better 
than the tables it replaces.

Paul




Re: [Qemu-devel] [PATCH] Fix SPI SD emulation

2012-04-29 Thread Paul Brook
  -sd_adtc, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none,
  +sd_adtc, sd_none, sd_bc,   sd_bc,   sd_none, sd_none,
 
 sd_bcr? not that it really matters though

Err, probably.

  +case 58:/* CMD58: READ_OCR */
  +if (!sd-spi) {
  +goto bad_cmd;
  +}
  +switch (sd-state) {
  +case sd_idle_state:
  +case sd_transfer_state:
  +return sd_r3;
 
 If this command could be issued in transfer state maybe in addition to
 IDLE_STATE you also need to set other bits (ADDRESS_ERROR,
 COM_CRC_ERROR, ILLEGAL_COMMAND, ERASE_SEQ_ERROR) in MSB of R3 response?

In theory, yes.  I was thinking of a follow-up patch to move the spi status 
byte generation into sd.c.  Maybe I should do that first.

Paul



Re: [Qemu-devel] [PATCH v6 5/5] FreeSCALE i.MX31 support: KZM-ARM11-01 evaluation board

2012-04-25 Thread Paul Brook
 On 23 April 2012 23:21, Peter Chubb peter.ch...@nicta.com.au wrote:
  Peter Are these two devices really on the same IRQ?
  
  Yes.  A single interrupt line comes from the FPGA into the AVIC.
  Inside the FPGA the interrupts for the UARTs, SD card and NAND flash
  are connected to that single interrupt line.
  The non-touchscreen FPGA UART isn't mentioned in the KZM manual, but
  is available on the board as a debug port.
  
  To avoid confusion I think I'll just get rid of it.
 
 Up to you. A comment would be fine if you'd rather keep the device.

No it's not.  You must never connect multiple devices to the same IRQ.  You 
need an explicit mux in between.
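
If several sources genuinely share one line, the combining logic should be
modelled explicitly, e.g. (rough sketch, invented names):

#include <stdbool.h>

/* Sketch, invented names: an explicit OR gate sitting between the N
 * interrupt sources inside the FPGA and the single line into the AVIC.  */
enum { NUM_INPUTS = 4 };

typedef struct IRQOrGate {
    bool level[NUM_INPUTS];
    void (*set_output)(void *opaque, bool level);   /* the AVIC input pin */
    void *opaque;
} IRQOrGate;

static void irq_or_gate_set(IRQOrGate *gate, int n, bool level)
{
    bool out = false;

    gate->level[n] = level;
    for (int i = 0; i < NUM_INPUTS; i++) {
        out = out || gate->level[i];
    }
    gate->set_output(gate->opaque, out);
}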

Paul



Re: [Qemu-devel] [PATCH v3 1/4] SSI: Built in multiple device support

2012-04-25 Thread Paul Brook
 Im happy to spend the 10 mins updating stellaris.c accordingly, but is
 someone sitting on a binary package and brief instructions or some
 such to regression test it? Do you of this machine have some sort of
 kernel image handy?

I've attached a tarball with some test binaries.  They're built from the 
example libraries shipped with this board.

The first exercises the display, amongst other things.

The second exercises the SD card.  A simple SD card image is also included 
(remember to gunzip it first).  Run ls and cat readme.txt over the serial 
port to make it do something verifiable.

Run them with:

 ./qemu-system-arm -M lm3s6965evb -kernel qs_ek-lm3s6965.bin
 ./qemu-system-arm -M lm3s6965evb -serial stdio -kernel sd_card.bin -sd 
sdcard.img

I don't have any software handy that exercises both simultaneously.


It's probably worth mentioning that we don't currently implement all the CS 
lines accurately for this board.

Most pins on this device are dual-function.  They can be configured either as 
regular GPIO, or driven from a periperal (aka alternate function, e.g. the 
SSI controller).  Config is cone via the GPIO controllers.  There are 7 GPIO 
contollers (A-G) with 8 pins each.  On reset all pins are configured as 
floating GPIO, and we let D0 float high.
The frame start/chip select line from the SPI controller goes via GPIO A3. 
This is connected to the display controller (ssd0323) CS pin.
The SD card CS pin is connected to GPIO D0.

When communicating with the display controller the SSI pins will be configured 
normally.  When communicating with the SD card we configure A3 as a GPIO pin, 
set high (inactive), and pull D0 low to select the SD card.

The current implementation ignores the SSI select pin (A3), and assumes the 
display controller is selected whenever the SD card (D0) is not.  We do not 
implement the alternate function select in the GPIO controller.

It's a bit of a strange setup, but I guess probably not that unusual.
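
In other words the current simplification amounts to roughly this (sketch,
invented names):

#include <stdbool.h>

/* Sketch of the current simplification: the SD card chip select comes from
 * GPIO D0 (active low), and whenever the SD card is not selected we assume
 * the SSD0323 display controller is - GPIO A3, the real display CS, is
 * ignored.  */
typedef enum {
    SEL_DISPLAY,
    SEL_SDCARD,
} StellarisSSISelect;

static StellarisSSISelect stellaris_ssi_select(bool gpio_d0_level)
{
    return gpio_d0_level ? SEL_DISPLAY : SEL_SDCARD;
}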

Paul


