As pmovmskb is used by strlen et al, this is the third-highest-overhead SSE operation, at 0.8%.
Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- target/i386/ops_sse.h | 26 ----------- target/i386/ops_sse_header.h | 1 - target/i386/tcg/translate.c | 86 +++++++++++++++++++++++++++++++----- 3 files changed, 74 insertions(+), 39 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index b21f315f37..9f9801be63 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -1098,32 +1098,6 @@ uint32_t helper_movmskpd(CPUX86State *env, Reg *s) #endif -uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s) -{ - uint32_t val; - - val = 0; - val |= (s->B(0) >> 7); - val |= (s->B(1) >> 6) & 0x02; - val |= (s->B(2) >> 5) & 0x04; - val |= (s->B(3) >> 4) & 0x08; - val |= (s->B(4) >> 3) & 0x10; - val |= (s->B(5) >> 2) & 0x20; - val |= (s->B(6) >> 1) & 0x40; - val |= (s->B(7)) & 0x80; -#if SHIFT == 1 - val |= (s->B(8) << 1) & 0x0100; - val |= (s->B(9) << 2) & 0x0200; - val |= (s->B(10) << 3) & 0x0400; - val |= (s->B(11) << 4) & 0x0800; - val |= (s->B(12) << 5) & 0x1000; - val |= (s->B(13) << 6) & 0x2000; - val |= (s->B(14) << 7) & 0x4000; - val |= (s->B(15) << 8) & 0x8000; -#endif - return val; -} - void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { Reg r; diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 542701720e..d6bb10342c 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -201,7 +201,6 @@ DEF_HELPER_2(movmskps, i32, env, Reg) DEF_HELPER_2(movmskpd, i32, env, Reg) #endif -DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, env, Reg) DEF_HELPER_3(glue(packsswb, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(packuswb, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(packssdw, SUFFIX), void, env, Reg, Reg) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index d25d914d63..5829c702d6 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2776,6 +2776,77 @@ static inline void gen_op_movq_env_0(DisasContext *s, 
int d_offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset); } +static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_andi_i64(d, s, 0x8080808080808080ull); + + /* + * After each shift+or pair: + * 0: a.......b.......c.......d.......e.......f.......g.......h....... + * 7: ab......bc......cd......de......ef......fg......gh......h....... + * 14: abcd....bcde....cdef....defg....efgh....fgh.....gh......h....... + * 28: abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h....... + * The result is left in the high bits of the word. + */ + tcg_gen_shli_i64(t, d, 7); + tcg_gen_or_i64(d, d, t); + tcg_gen_shli_i64(t, d, 14); + tcg_gen_or_i64(d, d, t); + tcg_gen_shli_i64(t, d, 28); + tcg_gen_or_i64(d, d, t); +} + +static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80); + + /* See above */ + tcg_gen_and_vec(vece, d, s, m); + tcg_gen_shli_vec(vece, t, d, 7); + tcg_gen_or_vec(vece, d, d, t); + tcg_gen_shli_vec(vece, t, d, 14); + tcg_gen_or_vec(vece, d, d, t); + if (vece == MO_64) { + tcg_gen_shli_vec(vece, t, d, 28); + tcg_gen_or_vec(vece, d, d, t); + } +} + +static void gen_gvec_pmovmskb(TCGv out, int s_reg, bool is_xmm) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; + static const GVecGen2 g = { + .fni8 = gen_pmovmskb_i64, + .fniv = gen_pmovmskb_vec, + .opt_opc = vecop_list, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 + }; + + int s_ofs = (is_xmm + ? offsetof(CPUX86State, xmm_regs[s_reg].ZMM_X(0)) + : offsetof(CPUX86State, fpregs[s_reg].mmx)); + int d_ofs = (is_xmm + ? offsetof(CPUX86State, xmm_t0.ZMM_X(0)) + : offsetof(CPUX86State, mmx_t0)); + int vec_len = is_xmm ? 
16 : 8; + + tcg_gen_gvec_2(d_ofs, s_ofs, vec_len, vec_len, &g); + + if (is_xmm) { + TCGv t = tcg_temp_new(); + tcg_gen_ld8u_tl(t, cpu_env, d_ofs + offsetof(XMMReg, XMM_B(15))); + tcg_gen_ld8u_tl(out, cpu_env, d_ofs + offsetof(XMMReg, XMM_B(7))); + tcg_gen_deposit_tl(out, out, t, 8, TARGET_LONG_BITS - 8); + tcg_temp_free(t); + } else { + tcg_gen_ld8u_tl(out, cpu_env, d_ofs + offsetof(MMXReg, MMX_B(7))); + } +} + typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val); @@ -3742,21 +3813,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0xd7: /* pmovmskb */ case 0x1d7: - if (mod != 3) + if (mod != 3) { goto illegal_op; - if (b1) { - rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State, xmm_regs[rm])); - gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0); - } else { - rm = (modrm & 7); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State, fpregs[rm].mmx)); - gen_helper_pmovmskb_mmx(s->tmp2_i32, cpu_env, s->ptr0); } + rm = (modrm & 7) | (is_xmm ? REX_B(s) : 0); reg = ((modrm >> 3) & 7) | REX_R(s); - tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); + gen_gvec_pmovmskb(cpu_regs[reg], rm, is_xmm); break; case 0x138: -- 2.34.1