V3MULU extends VMULU across the full Octeon3 multiplier state: it multiplies rs by all six multiplier limbs (MPL0-MPL5), then adds rt and the queued partial products held in P0-P5.
Return the low result while shifting the remaining accumulated limbs back into P[0] through P[5]. Signed-off-by: James Hilliard <[email protected]> --- Changes v2 -> v3: - Split V3MULU out of the combined Octeon arithmetic and memory instruction patch. (requested by Richard Henderson) Changes v3 -> v4: - Keep the Octeon3 MPL3-MPL5/P3-P5 high lanes used by the two-source MTM/MTP forms and Cavium SDK/runtime code. --- target/mips/helper.h | 1 + target/mips/tcg/octeon.decode | 1 + target/mips/tcg/octeon_translate.c | 1 + target/mips/tcg/op_helper.c | 46 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+) diff --git a/target/mips/helper.h b/target/mips/helper.h index 46ccad95c3..08fda55ae1 100644 --- a/target/mips/helper.h +++ b/target/mips/helper.h @@ -26,6 +26,7 @@ DEF_HELPER_3(crc32c, tl, tl, tl, i32) DEF_HELPER_FLAGS_4(rotx, TCG_CALL_NO_RWG_SE, tl, tl, i32, i32, i32) DEF_HELPER_3(octeon_vmulu, i64, env, i64, i64) DEF_HELPER_3(octeon_vmm0, i64, env, i64, i64) +DEF_HELPER_3(octeon_v3mulu, i64, env, i64, i64) /* microMIPS functions */ DEF_HELPER_4(lwm, void, env, tl, tl, i32) diff --git a/target/mips/tcg/octeon.decode b/target/mips/tcg/octeon.decode index c1c3506d20..2cdb7e4e76 100644 --- a/target/mips/tcg/octeon.decode +++ b/target/mips/tcg/octeon.decode @@ -52,6 +52,7 @@ MTM1 011100 rs:5 rt:5 00000 00000 001100 &r2 MTM2 011100 rs:5 rt:5 00000 00000 001101 &r2 VMULU 011100 ..... ..... ..... 00000 001111 @r3 VMM0 011100 ..... ..... ..... 00000 010000 @r3 +V3MULU 011100 ..... ..... ..... 00000 010001 @r3 &saa base rt @saa ...... base:5 rt:5 ................ 
&saa diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c index 304aa58065..775f758369 100644 --- a/target/mips/tcg/octeon_translate.c +++ b/target/mips/tcg/octeon_translate.c @@ -318,3 +318,4 @@ TRANS(MTP1, trans_mtp, 1); TRANS(MTP2, trans_mtp, 2); TRANS(VMULU, trans_vmul, gen_helper_octeon_vmulu); TRANS(VMM0, trans_vmul, gen_helper_octeon_vmm0); +TRANS(V3MULU, trans_vmul, gen_helper_octeon_v3mulu); diff --git a/target/mips/tcg/op_helper.c b/target/mips/tcg/op_helper.c index 45e208ca43..740c181d27 100644 --- a/target/mips/tcg/op_helper.c +++ b/target/mips/tcg/op_helper.c @@ -196,6 +196,52 @@ uint64_t helper_octeon_vmm0(CPUMIPSState *env, uint64_t rs, uint64_t rt) return lo; } +uint64_t helper_octeon_v3mulu(CPUMIPSState *env, uint64_t rs, uint64_t rt) +{ + uint64_t lo, hi; + uint64_t sum[OCTEON_MULTIPLIER_REGS + 1] = {}; + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[0], rs); + sum[0] = lo; + sum[1] = hi; + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[1], rs); + octeon_add_limb(sum, ARRAY_SIZE(sum), lo, 1); + octeon_add_limb(sum, ARRAY_SIZE(sum), hi, 2); + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[2], rs); + octeon_add_limb(sum, ARRAY_SIZE(sum), lo, 2); + octeon_add_limb(sum, ARRAY_SIZE(sum), hi, 3); + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[3], rs); + octeon_add_limb(sum, ARRAY_SIZE(sum), lo, 3); + octeon_add_limb(sum, ARRAY_SIZE(sum), hi, 4); + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[4], rs); + octeon_add_limb(sum, ARRAY_SIZE(sum), lo, 4); + octeon_add_limb(sum, ARRAY_SIZE(sum), hi, 5); + + mulu64(&lo, &hi, env->active_tc.octeon.MPL[5], rs); + octeon_add_limb(sum, ARRAY_SIZE(sum), lo, 5); + octeon_add_limb(sum, ARRAY_SIZE(sum), hi, 6); + + octeon_add_limb(sum, ARRAY_SIZE(sum), rt, 0); + octeon_add_limb(sum, ARRAY_SIZE(sum), env->active_tc.octeon.P[0], 0); + octeon_add_limb(sum, ARRAY_SIZE(sum), env->active_tc.octeon.P[1], 1); + octeon_add_limb(sum, ARRAY_SIZE(sum), env->active_tc.octeon.P[2], 2); + octeon_add_limb(sum, 
ARRAY_SIZE(sum), env->active_tc.octeon.P[3], 3); + octeon_add_limb(sum, ARRAY_SIZE(sum), env->active_tc.octeon.P[4], 4); + octeon_add_limb(sum, ARRAY_SIZE(sum), env->active_tc.octeon.P[5], 5); + + env->active_tc.octeon.P[0] = sum[1]; + env->active_tc.octeon.P[1] = sum[2]; + env->active_tc.octeon.P[2] = sum[3]; + env->active_tc.octeon.P[3] = sum[4]; + env->active_tc.octeon.P[4] = sum[5]; + env->active_tc.octeon.P[5] = sum[6]; + return sum[0]; +} + /* these crc32 functions are based on target/loongarch/tcg/op_helper.c */ target_ulong helper_crc32(target_ulong val, target_ulong m, uint32_t sz) { -- 2.54.0
