On 08.03.22 02:53, Richard Henderson wrote: > From: David Miller <dmiller...@gmail.com> > > Prior to vector enhancements 2, the shift count was supposed to be equal > for each byte lest the result be unpredictable, which allowed us to assume > that the shift count was the same, and optimize accordingly. > > With vector enhancements 2, the shift count is allowed to be different > for each byte, and we must cope with that. > > Signed-off-by: David Miller <dmiller...@gmail.com> > Message-Id: <20220307020327.3003-4-dmiller...@gmail.com> > [rth: Split out of larger patch; simplify shift/merge code.] > Signed-off-by: Richard Henderson <richard.hender...@linaro.org> > --- > target/s390x/helper.h | 3 ++ > target/s390x/tcg/vec_int_helper.c | 58 ++++++++++++++++++++++ > target/s390x/tcg/translate_vx.c.inc | 77 ++++++++++++----------------- > target/s390x/tcg/insn-data.def | 12 ++--- > 4 files changed, 99 insertions(+), 51 deletions(-) > > diff --git a/target/s390x/helper.h b/target/s390x/helper.h > index 7412130883..bf33d86f74 100644 > --- a/target/s390x/helper.h > +++ b/target/s390x/helper.h > @@ -203,8 +203,11 @@ DEF_HELPER_FLAGS_3(gvec_vpopct16, TCG_CALL_NO_RWG, void, > ptr, cptr, i32) > DEF_HELPER_FLAGS_4(gvec_verim8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) > DEF_HELPER_FLAGS_4(gvec_verim16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) > DEF_HELPER_FLAGS_4(gvec_vsl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32) > +DEF_HELPER_FLAGS_4(gvec_vsl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) > DEF_HELPER_FLAGS_4(gvec_vsra, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32) > +DEF_HELPER_FLAGS_4(gvec_vsra_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, > i32) > DEF_HELPER_FLAGS_4(gvec_vsrl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32) > +DEF_HELPER_FLAGS_4(gvec_vsrl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, > i32) > DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) > DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) > DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32) > diff --git a/target/s390x/tcg/vec_int_helper.c > b/target/s390x/tcg/vec_int_helper.c > index 5561b3ed90..a881d5d267 100644 > --- a/target/s390x/tcg/vec_int_helper.c > +++ b/target/s390x/tcg/vec_int_helper.c > @@ -540,18 +540,76 @@ void HELPER(gvec_vsl)(void *v1, const void *v2, > uint64_t count, > s390_vec_shl(v1, v2, count); > } > > +void HELPER(gvec_vsl_ve2)(void *v1, const void *v2, const void *v3, > + uint32_t desc) > +{ > + S390Vector tmp; > + uint32_t sh, e0, e1 = 0;
int i; > + > + for (int i = 15; i >= 0; --i, e1 = e0 << 24) { I'd only do "e1 = e0" here and do the shift for the rol32 ... > + e0 = s390_vec_read_element8(v2, i); > + sh = s390_vec_read_element8(v3, i) & 7; > + > + s390_vec_write_element8(&tmp, i, rol32(e0 | e1, sh)); ... here s390_vec_write_element8(&tmp, i, rol32(e0 | e1 << 24, sh)); > + } > + > + *(S390Vector *)v1 = tmp; > +} > + > void HELPER(gvec_vsra)(void *v1, const void *v2, uint64_t count, > uint32_t desc) > { > s390_vec_sar(v1, v2, count); > } > > +void HELPER(gvec_vsra_ve2)(void *v1, const void *v2, const void *v3, > + uint32_t desc) > +{ > + S390Vector tmp; > + uint32_t sh, e0, e1; > + int i = 0; > + > + e0 = s390_vec_read_element8(v2, 0); > + e1 = -(e0 >> 7) << 8; > + > + for (;;) { > + sh = s390_vec_read_element8(v3, i) & 7; > + > + s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh); > + > + if (++i >= 16) { > + break; > + } > + > + e1 = e0 << 8; > + e0 = s390_vec_read_element8(v2, i); > + } Can't we use the following that resembles the other helpers or am I missing something? S390Vector tmp; uint32_t sh, e0, e1 = 0; /* Byte 0 is special only. */ e0 = (int32_t)(int8_t)s390_vec_read_element8(v2, i); sh = s390_vec_read_element8(v3, i) & 7; s390_vec_write_element8(&tmp, i, e0 >> sh); e1 = e0; for (int i = 1; i < 16; ++i, e1 = e0) { e0 = s390_vec_read_element8(v2, i); sh = s390_vec_read_element8(v3, i) & 7; s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh); } *(S390Vector *)v1 = tmp; > + > + *(S390Vector *)v1 = tmp; > +} > + > void HELPER(gvec_vsrl)(void *v1, const void *v2, uint64_t count, > uint32_t desc) > { > s390_vec_shr(v1, v2, count); > } > > +void HELPER(gvec_vsrl_ve2)(void *v1, const void *v2, const void *v3, > + uint32_t desc) > +{ > + S390Vector tmp; > + uint32_t sh, e0, e1 = 0; > + > + for (int i = 0; i < 16; ++i, e1 = e0 << 8) { Dito, I'd do the shift below ... > + e0 = s390_vec_read_element8(v2, i); > + sh = s390_vec_read_element8(v3, i) & 7; > + > + s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh); s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh); > + } > + > + *(S390Vector *)v1 = tmp; > +} > + > #define DEF_VSCBI(BITS) > \ > void HELPER(gvec_vscbi##BITS)(void *v1, const void *v2, const void *v3, > \ > uint32_t desc) > \ > diff --git a/target/s390x/tcg/translate_vx.c.inc > b/target/s390x/tcg/translate_vx.c.inc > index d514e8b218..967f6213d8 100644 > --- a/target/s390x/tcg/translate_vx.c.inc > +++ b/target/s390x/tcg/translate_vx.c.inc > @@ -2018,21 +2018,42 @@ static DisasJumpType op_ves(DisasContext *s, DisasOps > *o) > return DISAS_NEXT; > } > > +static DisasJumpType gen_vsh_bit_byte(DisasContext *s, DisasOps *o, > + gen_helper_gvec_2i *gen, > + gen_helper_gvec_3 *gen_ve2) > +{ > + bool byte = s->insn->data; Nit: I'd have called this "by_byte". -- Thanks, David / dhildenb