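There are some potentially surprising differences between vpermilpd and vpermilps (for example, the register-controlled vpermilpd takes its selector from bit 1 of each control quadword, whereas vpermilps uses bits 1:0 of each control doubleword), but overall the implementation is pretty straightforward.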
Signed-off-by: Paul Brook <p...@nowt.org> --- target/i386/ops_sse.h | 82 ++++++++++++++++++++++++++++++++++++ target/i386/ops_sse_header.h | 4 ++ target/i386/tcg/translate.c | 4 ++ 3 files changed, 90 insertions(+) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 4115c9a257..9b92b9790a 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3113,6 +3113,88 @@ void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #endif } +void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + uint64_t r0, r1; + /* Bit 1 of s->Q(i) picks the v qword (within its pair) for d->Q(i). */ + r0 = v->Q((s->Q(0) >> 1) & 1); + r1 = v->Q((s->Q(1) >> 1) & 1); + d->Q(0) = r0; + d->Q(1) = r1; +#if SHIFT == 2 + r0 = v->Q(((s->Q(2) >> 1) & 1) + 2); + r1 = v->Q(((s->Q(3) >> 1) & 1) + 2); + d->Q(2) = r0; + d->Q(3) = r1; +#endif +} + +void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + uint32_t r0, r1, r2, r3; + /* Bits 1:0 of s->L(i) pick the v dword (in its quad) for d->L(i). */ + r0 = v->L(s->L(0) & 3); + r1 = v->L(s->L(1) & 3); + r2 = v->L(s->L(2) & 3); + r3 = v->L(s->L(3) & 3); + d->L(0) = r0; + d->L(1) = r1; + d->L(2) = r2; + d->L(3) = r3; +#if SHIFT == 2 + r0 = v->L((s->L(4) & 3) + 4); + r1 = v->L((s->L(5) & 3) + 4); + r2 = v->L((s->L(6) & 3) + 4); + r3 = v->L((s->L(7) & 3) + 4); + d->L(4) = r0; + d->L(5) = r1; + d->L(6) = r2; + d->L(7) = r3; +#endif +} + +void glue(helper_vpermilpd_imm, SUFFIX)(CPUX86State *env, + Reg *d, Reg *s, uint32_t order) +{ + uint64_t r0, r1; + /* Bit i of order picks the s qword (within its pair) for d->Q(i). */ + r0 = s->Q((order >> 0) & 1); + r1 = s->Q((order >> 1) & 1); + d->Q(0) = r0; + d->Q(1) = r1; +#if SHIFT == 2 + r0 = s->Q(((order >> 2) & 1) + 2); + r1 = s->Q(((order >> 3) & 1) + 2); + d->Q(2) = r0; + d->Q(3) = r1; +#endif +} + +void glue(helper_vpermilps_imm, SUFFIX)(CPUX86State *env, + Reg *d, Reg *s, uint32_t order) +{ + uint32_t r0, r1, r2, r3; + /* 2-bit field i of order picks the s dword; L(4..7) reuse the fields. */ + r0 = s->L((order >> 0) & 3); + r1 = s->L((order >> 2) & 3); + r2 = s->L((order >> 4) & 3); + r3 = s->L((order >> 6) & 3); + d->L(0) = r0; + d->L(1) = r1; + d->L(2) = r2; + d->L(3) = r3; +#if SHIFT == 2 + r0 = 
s->L(((order >> 0) & 3) + 4); + r1 = s->L(((order >> 2) & 3) + 4); + r2 = s->L(((order >> 4) & 3) + 4); + r3 = s->L(((order >> 6) & 3) + 4); + d->L(4) = r0; + d->L(5) = r1; + d->L(6) = r2; + d->L(7) = r3; +#endif +} + #if SHIFT == 2 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 51e02cd4fa..c52169a030 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -417,6 +417,10 @@ DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpermilpd_imm, SUFFIX), void, env, Reg, Reg, i32) +DEF_HELPER_4(glue(vpermilps_imm, SUFFIX), void, env, Reg, Reg, i32) #if SHIFT == 2 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_1(vzeroall, void, env) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 59ab1dc562..358c3ecb0b 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3251,6 +3251,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { [0x09] = BINARY_OP_MMX(psignw, SSSE3), [0x0a] = BINARY_OP_MMX(psignd, SSSE3), [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3), + [0x0c] = BINARY_OP(vpermilps, AVX, 0), + [0x0d] = BINARY_OP(vpermilpd, AVX, 0), [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), [0x14] = BLENDV_OP(blendvps, SSE41, 0), [0x15] = BLENDV_OP(blendvpd, SSE41, 0), @@ -3311,6 +3313,8 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = { /* prefix [66] 0f 3a */ static const struct SSEOpHelper_table7 sse_op_table7[256] = { + [0x04] = UNARY_OP(vpermilps_imm, AVX, 0), + [0x05] = UNARY_OP(vpermilpd_imm, AVX, 0), [0x08] = 
UNARY_OP(roundps, SSE41, 0), [0x09] = UNARY_OP(roundpd, SSE41, 0), #define gen_helper_roundss_ymm NULL -- 2.36.0