VMULU multiplies the active Octeon multiplier state by rs, adds rt and queued partial products, returns the low result, and advances P[0]/P[1] with carry limbs.
Add helper and translator support for the two-limb accumulator operation. Signed-off-by: James Hilliard <[email protected]> --- Changes v2 -> v3: - Split VMULU out of the combined Octeon arithmetic and memory instruction patch. (requested by Richard Henderson) - Use uadd64_overflow() for multiplier limb carry accumulation. (suggested by Richard Henderson) --- target/mips/helper.h | 1 + target/mips/tcg/octeon.decode | 1 + target/mips/tcg/octeon_translate.c | 17 +++++++++++++++++ target/mips/tcg/op_helper.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+) diff --git a/target/mips/helper.h b/target/mips/helper.h index e2b83a1d19..f1e78ae329 100644 --- a/target/mips/helper.h +++ b/target/mips/helper.h @@ -24,6 +24,7 @@ DEF_HELPER_FLAGS_1(dbitswap, TCG_CALL_NO_RWG_SE, tl, tl) DEF_HELPER_3(crc32, tl, tl, tl, i32) DEF_HELPER_3(crc32c, tl, tl, tl, i32) DEF_HELPER_FLAGS_4(rotx, TCG_CALL_NO_RWG_SE, tl, tl, i32, i32, i32) +DEF_HELPER_3(octeon_vmulu, i64, env, i64, i64) /* microMIPS functions */ DEF_HELPER_4(lwm, void, env, tl, tl, i32) diff --git a/target/mips/tcg/octeon.decode b/target/mips/tcg/octeon.decode index 682473b011..75834afc6c 100644 --- a/target/mips/tcg/octeon.decode +++ b/target/mips/tcg/octeon.decode @@ -50,6 +50,7 @@ MTP1 011100 rs:5 rt:5 00000 00000 001010 &r2 MTP2 011100 rs:5 rt:5 00000 00000 001011 &r2 MTM1 011100 rs:5 rt:5 00000 00000 001100 &r2 MTM2 011100 rs:5 rt:5 00000 00000 001101 &r2 +VMULU 011100 ..... ..... ..... 00000 001111 @r3 &saa base rt @saa ...... base:5 rt:5 ................ &saa diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c index fc0131ccc5..d39f904e73 100644 --- a/target/mips/tcg/octeon_translate.c +++ b/target/mips/tcg/octeon_translate.c @@ -13,6 +13,8 @@ /* Include the auto-generated decoder. */ #include "decode-octeon.c.inc" +typedef void gen_helper_lmi(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64); + static bool trans_BBIT(DisasContext *ctx, arg_BBIT *a) { TCGv_i64 p; @@ -265,6 +267,20 @@ static bool trans_mtp(DisasContext *ctx, arg_r2 *a, unsigned int index) return true; } +static bool trans_vmul(DisasContext *ctx, arg_decode_ext_octeon1 *a, + gen_helper_lmi *helper) +{ + TCGv_i64 rs = tcg_temp_new_i64(); + TCGv_i64 rt = tcg_temp_new_i64(); + TCGv_i64 rd = tcg_temp_new_i64(); + + gen_load_gpr(rs, a->rs); + gen_load_gpr(rt, a->rt); + helper(rd, tcg_env, rs, rt); + gen_store_gpr(rd, a->rd); + return true; +} + TRANS(SAA, trans_saa, MO_UL); TRANS(SAAD, trans_saa, MO_UQ); TRANS(LBX, trans_lx, MO_SB); @@ -280,3 +296,4 @@ TRANS(MTM2, trans_mtm, 2); TRANS(MTP0, trans_mtp, 0); TRANS(MTP1, trans_mtp, 1); TRANS(MTP2, trans_mtp, 2); +TRANS(VMULU, trans_vmul, gen_helper_octeon_vmulu); diff --git a/target/mips/tcg/op_helper.c b/target/mips/tcg/op_helper.c index 4502ae2b5b..ab3fb06a16 100644 --- a/target/mips/tcg/op_helper.c +++ b/target/mips/tcg/op_helper.c @@ -144,6 +144,38 @@ target_ulong helper_rotx(target_ulong rs, uint32_t shift, uint32_t shiftx, return (int64_t)(int32_t)(uint32_t)tmp5; } +static void octeon_add_limb(uint64_t *sum, int limb_count, + uint64_t value, int limb) +{ + while (limb < limb_count && + uadd64_overflow(sum[limb], value, &sum[limb])) { + value = 1; + limb++; + } +} + +uint64_t helper_octeon_vmulu(CPUMIPSState *env, uint64_t rs, uint64_t rt) +{ + uint64_t lo, hi; + uint64_t sum[3] = {}; + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[0], rs); + sum[0] = lo; + sum[1] = hi; + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[1], rs); + octeon_add_limb(sum, 3, lo, 1); + octeon_add_limb(sum, 3, hi, 2); + + octeon_add_limb(sum, 3, rt, 0); + octeon_add_limb(sum, 3, env->active_tc.octeon.P[0], 0); + octeon_add_limb(sum, 3, env->active_tc.octeon.P[1], 1); + + env->active_tc.octeon.P[0] = sum[1]; + env->active_tc.octeon.P[1] = sum[2]; + return sum[0]; +} + /* these crc32 functions are based on target/loongarch/tcg/op_helper.c */ target_ulong helper_crc32(target_ulong val, target_ulong m, uint32_t sz) { -- 2.54.0
