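QMAC.0x and QMACS.0x multiply the selected signed Q15 halfword lane from rs (one of the four 16-bit lanes, chosen by the 2-bit lane field) by rt<15:0> and accumulate the Q31 product into the Octeon HI/LO accumulator state. The Q15 x Q15 product is doubled to form the Q31 value; the single unrepresentable case, 0x8000 x 0x8000 (-1.0 x -1.0), is clamped to INT32_MAX. QMACS records that clamp in its sticky saturation flag, while QMAC does not track it.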
QMAC updates the full 64-bit HI/LO accumulator. QMACS saturates the 32-bit Q31 result in LO and keeps HI<0> as the sticky saturation flag. Signed-off-by: James Hilliard <[email protected]> --- target/mips/helper.h | 2 ++ target/mips/tcg/octeon.decode | 5 ++++ target/mips/tcg/octeon_translate.c | 16 +++++++++++ target/mips/tcg/op_helper.c | 59 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+) diff --git a/target/mips/helper.h b/target/mips/helper.h index 08fda55ae1..e93bc37903 100644 --- a/target/mips/helper.h +++ b/target/mips/helper.h @@ -27,6 +27,8 @@ DEF_HELPER_FLAGS_4(rotx, TCG_CALL_NO_RWG_SE, tl, tl, i32, i32, i32) DEF_HELPER_3(octeon_vmulu, i64, env, i64, i64) DEF_HELPER_3(octeon_vmm0, i64, env, i64, i64) DEF_HELPER_3(octeon_v3mulu, i64, env, i64, i64) +DEF_HELPER_4(octeon_qmac, void, env, i64, i64, i32) +DEF_HELPER_4(octeon_qmacs, void, env, i64, i64, i32) /* microMIPS functions */ DEF_HELPER_4(lwm, void, env, tl, tl, i32) diff --git a/target/mips/tcg/octeon.decode b/target/mips/tcg/octeon.decode index 9c1fe8f4f1..5edcd95884 100644 --- a/target/mips/tcg/octeon.decode +++ b/target/mips/tcg/octeon.decode @@ -28,9 +28,12 @@ BBIT 11 set:1 . 10 rs:5 ..... offset:s16 p=%bbit_p # SEQI rt, rs, immediate # SNE rd, rs, rt # SNEI rt, rs, immediate +# QMAC.0x rs, rt +# QMACS.0x rs, rt @r3 ...... rs:5 rt:5 rd:5 ..... ...... &cmpi rs rt imm +&qmac rs rt lane %bitfield_p 0:1 6:5 @bitfield ...... rs:5 rt:5 lenm1:5 ..... ..... . p=%bitfield_p @@ -43,6 +46,8 @@ SEQ 011100 ..... ..... ..... 00000 101010 @r3 SNE 011100 ..... ..... ..... 
00000 101011 @r3 SEQI 011100 rs:5 rt:5 imm:s10 101110 &cmpi SNEI 011100 rs:5 rt:5 imm:s10 101111 &cmpi +QMACS 011100 rs:5 rt:5 00000 000 lane:2 010010 &qmac +QMAC 011100 rs:5 rt:5 00000 100 lane:2 010010 &qmac &r2 rs rt MTM0 011100 rs:5 rt:5 00000 00000 001000 &r2 MTP0 011100 rs:5 rt:5 00000 00000 001001 &r2 diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c index 2d836afddb..b41bc1f81e 100644 --- a/target/mips/tcg/octeon_translate.c +++ b/target/mips/tcg/octeon_translate.c @@ -14,6 +14,8 @@ #include "decode-octeon.c.inc" typedef void gen_helper_octeon_vmul(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64); +typedef void gen_helper_octeon_qmac_fn(TCGv_ptr, TCGv_i64, TCGv_i64, + TCGv_i32); static bool trans_BBIT(DisasContext *ctx, arg_BBIT *a) { @@ -156,6 +158,18 @@ static bool trans_SNEI(DisasContext *ctx, arg_SNEI *a) return do_seqi_snei(ctx, a, TCG_COND_NE); } +static bool trans_qmac(DisasContext *ctx, arg_qmac *a, + gen_helper_octeon_qmac_fn *helper) +{ + TCGv_i64 rs = tcg_temp_new_i64(); + TCGv_i64 rt = tcg_temp_new_i64(); + + gen_load_gpr(rs, a->rs); + gen_load_gpr(rt, a->rt); + helper(tcg_env, rs, rt, tcg_constant_i32(a->lane)); + return true; +} + static bool trans_lx(DisasContext *ctx, arg_lx *a, MemOp mop) { gen_lx(ctx, a->rd, a->base, a->index, mop); @@ -299,6 +313,8 @@ static bool trans_vmul(DisasContext *ctx, arg_decode_ext_octeon1 *a, TRANS(SAA, trans_saa, MO_UL); TRANS(SAAD, trans_saa, MO_UQ); +TRANS(QMAC, trans_qmac, gen_helper_octeon_qmac); +TRANS(QMACS, trans_qmac, gen_helper_octeon_qmacs); TRANS(LBX, trans_lx, MO_SB); TRANS(LBUX, trans_lx, MO_UB); TRANS(LHX, trans_lx, MO_SW); diff --git a/target/mips/tcg/op_helper.c b/target/mips/tcg/op_helper.c index 740c181d27..0a892e31a8 100644 --- a/target/mips/tcg/op_helper.c +++ b/target/mips/tcg/op_helper.c @@ -144,6 +144,65 @@ target_ulong helper_rotx(target_ulong rs, uint32_t shift, uint32_t shiftx, return (int64_t)(int32_t)(uint32_t)tmp5; } +static int32_t 
octeon_mul_q15_q15(int16_t a, int16_t b, bool *overflow) +{ + if (a == INT16_MIN && b == INT16_MIN) { + *overflow = true; + return INT32_MAX; + } + return (int32_t)a * b * 2; +} + +static int32_t octeon_sat32_acc_q31(int32_t acc, int32_t value, + bool *overflow) +{ + int64_t sum = (int64_t)acc + value; + + if (sum > INT32_MAX) { + *overflow = true; + return INT32_MAX; + } + if (sum < INT32_MIN) { + *overflow = true; + return INT32_MIN; + } + return sum; +} + +static int16_t octeon_qmac_lane(uint64_t rs, uint32_t lane) +{ + return (int16_t)(uint16_t)extract64(rs, lane * 16, 16); +} + +void helper_octeon_qmac(CPUMIPSState *env, uint64_t rs, uint64_t rt, + uint32_t lane) +{ + bool overflow = false; + int32_t product; + int64_t acc; + + product = octeon_mul_q15_q15((int16_t)(uint16_t)rt, + octeon_qmac_lane(rs, lane), &overflow); + acc = deposit64(env->active_tc.LO[0], 32, 32, env->active_tc.HI[0]); + acc += product; + + env->active_tc.LO[0] = (int64_t)(int32_t)acc; + env->active_tc.HI[0] = (int64_t)(int32_t)((uint64_t)acc >> 32); +} + +void helper_octeon_qmacs(CPUMIPSState *env, uint64_t rs, uint64_t rt, + uint32_t lane) +{ + bool overflow = env->active_tc.HI[0] & 1; + int32_t product; + + product = octeon_mul_q15_q15((int16_t)(uint16_t)rt, + octeon_qmac_lane(rs, lane), &overflow); + env->active_tc.LO[0] = octeon_sat32_acc_q31( + (int32_t)(uint32_t)env->active_tc.LO[0], product, &overflow); + env->active_tc.HI[0] = overflow; +} + static void octeon_add_limb(uint64_t *sum, int limb_count, uint64_t value, int limb) { -- 2.54.0
