QMAC.0x and QMACS.0x multiply the selected signed Q15 halfword lane from rs by rt<15:0> and accumulate the Q31 product into the Octeon HI/LO accumulator state.
QMAC updates the full 64-bit HI/LO accumulator. QMACS saturates the 32-bit Q31 result in LO and keeps HI<0> as the sticky saturation flag. Reviewed-by: Richard Henderson <[email protected]> Signed-off-by: James Hilliard <[email protected]> Signed-off-by: Richard Henderson <[email protected]> --- Changes v5 -> v6: - New patch. Changes v7 -> v8: - Use Richard Henderson's v7.5 inline TCG translator. --- target/mips/tcg/octeon.decode | 4 +++ target/mips/tcg/octeon_translate.c | 60 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/target/mips/tcg/octeon.decode b/target/mips/tcg/octeon.decode index 4d0ad05834..2d02b4e0bc 100644 --- a/target/mips/tcg/octeon.decode +++ b/target/mips/tcg/octeon.decode @@ -28,6 +28,8 @@ BBIT 11 set:1 . 10 rs:5 ..... offset:s16 p=%bbit_p # SEQI rt, rs, immediate # SNE rd, rs, rt # SNEI rt, rs, immediate +# QMAC.0x rs, rt +# QMACS.0x rs, rt @r3 ...... rs:5 rt:5 rd:5 ..... ...... &cmpi rs rt imm @@ -43,6 +45,8 @@ SEQ 011100 ..... ..... ..... 00000 101010 @r3 SNE 011100 ..... ..... ..... 00000 101011 @r3 SEQI 011100 rs:5 rt:5 imm:s10 101110 &cmpi SNEI 011100 rs:5 rt:5 imm:s10 101111 &cmpi +QMACS 011100 rs:5 rt:5 00000 000 lane:2 010010 +QMAC 011100 rs:5 rt:5 00000 100 lane:2 010010 &r2 rs rt @r2 ...... rs:5 rt:5 ..... ..... ...... &r2 diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c index 04e51538f9..e068fd8698 100644 --- a/target/mips/tcg/octeon_translate.c +++ b/target/mips/tcg/octeon_translate.c @@ -356,3 +356,63 @@ static bool trans_V3MULU(DisasContext *ctx, arg_V3MULU *a) gen_store_gpr(tmp, a->rd); return true; } + +static bool trans_QMAC(DisasContext *ctx, arg_QMAC *a) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + + gen_load_gpr(t0, a->rt); + gen_load_gpr(t1, a->rs); + + /* t0 = rt<0> * rs<lane> * 2 */ + tcg_gen_ext16s_i64(t0, t0); + tcg_gen_sextract_i64(t1, t1, a->lane * 16, 16); + tcg_gen_mul_i64(t0, t0, t1); + tcg_gen_add_i64(t0, t0, t0); + + /* Saturate -0x8000 * -0x8000 * 2 = 0x80000000 -> 0x7fffffff */ + tcg_gen_smin_i64(t0, t0, tcg_constant_i64(INT32_MAX)); + + /* HI:LO += t0 */ + tcg_gen_concat32_i64(t1, cpu_LO[0], cpu_HI[0]); + tcg_gen_add_i64(t0, t0, t1); + tcg_gen_sextract_i64(cpu_LO[0], t0, 0, 32); + tcg_gen_sextract_i64(cpu_HI[0], t0, 32, 32); + return true; +} + +static bool trans_QMACS(DisasContext *ctx, arg_QMACS *a) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + + gen_load_gpr(t0, a->rt); + gen_load_gpr(t1, a->rs); + + /* t0 = rt<0> * rs<lane> * 2 */ + tcg_gen_ext16s_i64(t0, t0); + tcg_gen_sextract_i64(t1, t1, a->lane * 16, 16); + tcg_gen_mul_i64(t0, t0, t1); + tcg_gen_add_i64(t0, t0, t0); + + /* + * Saturate -0x8000 * -0x8000 * 2 = 0x80000000 -> 0x7fffffff. + * Accumulate overflow in HI[0]. + */ + tcg_gen_smin_i64(t1, t0, tcg_constant_i64(INT32_MAX)); + tcg_gen_setcond_i64(TCG_COND_NE, t0, t0, t1); + tcg_gen_or_i64(cpu_HI[0], cpu_HI[0], t0); + + /* + * LO = sat32(LO + t0) + * Accumulate overflow in HI[0]. + */ + tcg_gen_ext32s_i64(t0, cpu_LO[0]); + tcg_gen_add_i64(t0, t0, t1); + tcg_gen_smin_i64(cpu_LO[0], t0, tcg_constant_i64(INT32_MAX)); + tcg_gen_smax_i64(cpu_LO[0], cpu_LO[0], tcg_constant_i64(INT32_MIN)); + tcg_gen_setcond_i64(TCG_COND_NE, t0, t0, cpu_LO[0]); + tcg_gen_or_i64(cpu_HI[0], cpu_HI[0], t0); + return true; +} -- 2.54.0
