Implement the MVE VADDLV insn; this is similar to VADDV, except that it accumulates 32-bit elements into a 64-bit accumulator stored in a pair of general-purpose registers.
Signed-off-by: Peter Maydell <peter.mayd...@linaro.org>
---
 target/arm/helper-mve.h | 3 ++
 target/arm/mve.decode | 6 +++-
 target/arm/mve_helper.c | 19 ++++++++++++
 target/arm/translate-mve.c | 63 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index d414b6309d5..cf5ba860f2f 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -356,6 +356,9 @@ DEF_HELPER_FLAGS_3(mve_vaddvuh, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvsw, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)

+DEF_HELPER_FLAGS_3(mve_vaddlv_s, TCG_CALL_NO_WG, i64, env, ptr, i64)
+DEF_HELPER_FLAGS_3(mve_vaddlv_u, TCG_CALL_NO_WG, i64, env, ptr, i64)
+
 DEF_HELPER_FLAGS_3(mve_vmovi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vandi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vorri, TCG_CALL_NO_WG, void, env, ptr, i64)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index 914b108c379..595d97568eb 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -307,7 +307,11 @@ VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
 VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar

 # Vector add across vector
-VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
+{
+  VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
+  VADDLV 111 u:1 1110 1 ... 1001 ... 0 1111 00 a:1 0 qm:3 0 \
+         rdahi=%rdahi rdalo=%rdalo
+}

 # Predicate operations
 %mask_22_13 22:1 13:3
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index 9d4a07c1c0c..37af94bd9ea 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -1189,6 +1189,25 @@ DO_VADDV(vaddvub, 1, uint8_t)
 DO_VADDV(vaddvuh, 2, uint16_t)
 DO_VADDV(vaddvuw, 4, uint32_t)

+#define DO_VADDLV(OP, TYPE, LTYPE) \
+    uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+                                    uint64_t ra) \
+    { \
+        uint16_t mask = mve_element_mask(env); \
+        unsigned e; \
+        TYPE *m = vm; \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
+            if (mask & 1) { \
+                ra += (LTYPE)m[H4(e)]; \
+            } \
+        } \
+        mve_advance_vpt(env); \
+        return ra; \
+    } \
+
+DO_VADDLV(vaddlv_s, int32_t, int64_t)
+DO_VADDLV(vaddlv_u, uint32_t, uint64_t)
+
 /* Shifts by immediate */
 #define DO_2SHIFT(OP, ESIZE, TYPE, FN) \
     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index 460dff260fe..a2a45036a0b 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -790,6 +790,69 @@ static bool trans_VADDV(DisasContext *s, arg_VADDV *a)
     return true;
 }

+static bool trans_VADDLV(DisasContext *s, arg_VADDLV *a)
+{
+    /*
+     * Vector Add Long Across Vector: accumulate the 32-bit
+     * elements of the vector into a 64-bit result stored in
+     * a pair of general-purpose registers.
+     * No need to check Qm's bank: it is only 3 bits in decode.
+     */
+    TCGv_ptr qm;
+    TCGv_i64 rda;
+    TCGv_i32 rdalo, rdahi;
+
+    if (!dc_isar_feature(aa32_mve, s)) {
+        return false;
+    }
+    /*
+     * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related
+     * encoding; rdalo always has bit 0 clear so cannot be 13 or 15.
+     */
+    if (a->rdahi == 13 || a->rdahi == 15) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * This insn is subject to beat-wise execution. Partial execution
+     * of an A=0 (no-accumulate) insn which does not execute the first
+     * beat must start with the current value of RdaHi:RdaLo, not zero.
+     */
+    if (a->a || mve_skip_first_beat(s)) {
+        /* Accumulate input from RdaHi:RdaLo */
+        rda = tcg_temp_new_i64();
+        rdalo = load_reg(s, a->rdalo);
+        rdahi = load_reg(s, a->rdahi);
+        tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+        tcg_temp_free_i32(rdalo);
+        tcg_temp_free_i32(rdahi);
+    } else {
+        /* Accumulate starting at zero */
+        rda = tcg_const_i64(0);
+    }
+
+    qm = mve_qreg_ptr(a->qm);
+    if (a->u) {
+        gen_helper_mve_vaddlv_u(rda, cpu_env, qm, rda);
+    } else {
+        gen_helper_mve_vaddlv_s(rda, cpu_env, qm, rda);
+    }
+    tcg_temp_free_ptr(qm);
+
+    rdalo = tcg_temp_new_i32();
+    rdahi = tcg_temp_new_i32();
+    tcg_gen_extrl_i64_i32(rdalo, rda);
+    tcg_gen_extrh_i64_i32(rdahi, rda);
+    store_reg(s, a->rdalo, rdalo);
+    store_reg(s, a->rdahi, rdahi);
+    tcg_temp_free_i64(rda);
+    mve_update_eci(s);
+    return true;
+}
+
 static bool do_1imm(DisasContext *s, arg_1imm *a, MVEGenOneOpImmFn *fn)
 {
     TCGv_ptr qd;
--
2.20.1