QMAC.0x and QMACS.0x multiply the selected signed Q15 halfword lane
from rs by rt<15:0> and accumulate the Q31 product into the Octeon
HI/LO accumulator state.

QMAC updates the full 64-bit HI/LO accumulator. QMACS saturates the
32-bit Q31 result in LO and keeps HI<0> as the sticky saturation flag.

Signed-off-by: James Hilliard <[email protected]>
---
 target/mips/helper.h               |  2 ++
 target/mips/tcg/octeon.decode      |  5 ++++
 target/mips/tcg/octeon_translate.c | 16 +++++++++++
 target/mips/tcg/op_helper.c        | 59 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 82 insertions(+)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index 08fda55ae1..e93bc37903 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -27,6 +27,8 @@ DEF_HELPER_FLAGS_4(rotx, TCG_CALL_NO_RWG_SE, tl, tl, i32, i32, i32)
 DEF_HELPER_3(octeon_vmulu, i64, env, i64, i64)
 DEF_HELPER_3(octeon_vmm0, i64, env, i64, i64)
 DEF_HELPER_3(octeon_v3mulu, i64, env, i64, i64)
+DEF_HELPER_4(octeon_qmac, void, env, i64, i64, i32)
+DEF_HELPER_4(octeon_qmacs, void, env, i64, i64, i32)
 
 /* microMIPS functions */
 DEF_HELPER_4(lwm, void, env, tl, tl, i32)
diff --git a/target/mips/tcg/octeon.decode b/target/mips/tcg/octeon.decode
index 9c1fe8f4f1..5edcd95884 100644
--- a/target/mips/tcg/octeon.decode
+++ b/target/mips/tcg/octeon.decode
@@ -28,9 +28,12 @@ BBIT         11 set:1 . 10 rs:5 ..... offset:s16 p=%bbit_p
 # SEQI rt, rs, immediate
 # SNE rd, rs, rt
 # SNEI rt, rs, immediate
+# QMAC.0x rs, rt
+# QMACS.0x rs, rt
 
 @r3          ...... rs:5 rt:5 rd:5 ..... ......
 &cmpi        rs rt imm
+&qmac        rs rt lane
 %bitfield_p  0:1 6:5
 @bitfield    ...... rs:5 rt:5 lenm1:5 ..... ..... . p=%bitfield_p
 
@@ -43,6 +46,8 @@ SEQ          011100 ..... ..... ..... 00000 101010 @r3
 SNE          011100 ..... ..... ..... 00000 101011 @r3
 SEQI         011100 rs:5 rt:5 imm:s10 101110 &cmpi
 SNEI         011100 rs:5 rt:5 imm:s10 101111 &cmpi
+QMACS        011100 rs:5 rt:5 00000 000 lane:2 010010 &qmac
+QMAC         011100 rs:5 rt:5 00000 100 lane:2 010010 &qmac
 &r2          rs rt
 MTM0         011100 rs:5 rt:5 00000 00000 001000 &r2
 MTP0         011100 rs:5 rt:5 00000 00000 001001 &r2
diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c
index 2d836afddb..b41bc1f81e 100644
--- a/target/mips/tcg/octeon_translate.c
+++ b/target/mips/tcg/octeon_translate.c
@@ -14,6 +14,8 @@
 #include "decode-octeon.c.inc"
 
 typedef void gen_helper_octeon_vmul(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64);
+typedef void gen_helper_octeon_qmac_fn(TCGv_ptr, TCGv_i64, TCGv_i64,
+                                       TCGv_i32); /* env, rs, rt, lane */
 
 static bool trans_BBIT(DisasContext *ctx, arg_BBIT *a)
 {
@@ -156,6 +158,18 @@ static bool trans_SNEI(DisasContext *ctx, arg_SNEI *a)
     return do_seqi_snei(ctx, a, TCG_COND_NE);
 }
 
+static bool trans_qmac(DisasContext *ctx, arg_qmac *a,
+                       gen_helper_octeon_qmac_fn *helper)
+{ /* Shared by QMAC and QMACS: load rs and rt, then call the given helper. */
+    TCGv_i64 rs = tcg_temp_new_i64();
+    TCGv_i64 rt = tcg_temp_new_i64();
+
+    gen_load_gpr(rs, a->rs);
+    gen_load_gpr(rt, a->rt);
+    helper(tcg_env, rs, rt, tcg_constant_i32(a->lane)); /* lane: constant */
+    return true;
+}
+
 static bool trans_lx(DisasContext *ctx, arg_lx *a, MemOp mop)
 {
     gen_lx(ctx, a->rd, a->base, a->index, mop);
@@ -299,6 +313,8 @@ static bool trans_vmul(DisasContext *ctx, arg_decode_ext_octeon1 *a,
 
 TRANS(SAA,  trans_saa, MO_UL);
 TRANS(SAAD, trans_saa, MO_UQ);
+TRANS(QMAC,  trans_qmac, gen_helper_octeon_qmac);
+TRANS(QMACS, trans_qmac, gen_helper_octeon_qmacs);
 TRANS(LBX,  trans_lx, MO_SB);
 TRANS(LBUX, trans_lx, MO_UB);
 TRANS(LHX,  trans_lx, MO_SW);
diff --git a/target/mips/tcg/op_helper.c b/target/mips/tcg/op_helper.c
index 740c181d27..0a892e31a8 100644
--- a/target/mips/tcg/op_helper.c
+++ b/target/mips/tcg/op_helper.c
@@ -144,6 +144,65 @@ target_ulong helper_rotx(target_ulong rs, uint32_t shift, uint32_t shiftx,
     return (int64_t)(int32_t)(uint32_t)tmp5;
 }
 
+static int32_t octeon_mul_q15_q15(int16_t a, int16_t b, bool *overflow)
+{ /* Returns a * b * 2; the single case that overflows int32_t is clamped. */
+    if (a == INT16_MIN && b == INT16_MIN) { /* 2^31 does not fit in int32_t */
+        *overflow = true; /* only ever set here, never cleared */
+        return INT32_MAX;
+    }
+    return (int32_t)a * b * 2; /* no other operand pair overflows */
+}
+
+static int32_t octeon_sat32_acc_q31(int32_t acc, int32_t value,
+                                    bool *overflow)
+{ /* Returns acc + value clamped to int32_t; sets *overflow when clamped. */
+    int64_t sum = (int64_t)acc + value; /* widened so the add is exact */
+
+    if (sum > INT32_MAX) {
+        *overflow = true;
+        return INT32_MAX;
+    }
+    if (sum < INT32_MIN) {
+        *overflow = true;
+        return INT32_MIN;
+    }
+    return sum; /* in range: *overflow is left as the caller passed it */
+}
+
+static int16_t octeon_qmac_lane(uint64_t rs, uint32_t lane)
+{ /* Returns the 16-bit field of rs starting at bit lane * 16, as signed. */
+    return (int16_t)(uint16_t)extract64(rs, lane * 16, 16);
+}
+
+void helper_octeon_qmac(CPUMIPSState *env, uint64_t rs, uint64_t rt,
+                        uint32_t lane)
+{ /* QMAC: add the product of rt<15:0> and the rs lane to HI[0]:LO[0]. */
+    bool overflow = false; /* written by the multiply, never read here */
+    int32_t product; /* NOTE(review): -1 * -1 is still clamped to INT32_MAX */
+    int64_t acc; /* low 32 bits of HI[0] placed above the low 32 of LO[0] */
+
+    product = octeon_mul_q15_q15((int16_t)(uint16_t)rt,
+                                 octeon_qmac_lane(rs, lane), &overflow);
+    acc = deposit64(env->active_tc.LO[0], 32, 32, env->active_tc.HI[0]);
+    acc += product; /* product is sign-extended to 64 bits for the add */
+
+    env->active_tc.LO[0] = (int64_t)(int32_t)acc; /* low word, sign-extended */
+    env->active_tc.HI[0] = (int64_t)(int32_t)((uint64_t)acc >> 32); /* high */
+}
+
+void helper_octeon_qmacs(CPUMIPSState *env, uint64_t rs, uint64_t rt,
+                         uint32_t lane)
+{ /* QMACS: saturating 32-bit accumulate into LO[0]; flag kept in HI[0]. */
+    bool overflow = env->active_tc.HI[0] & 1; /* start from the sticky bit */
+    int32_t product;
+
+    product = octeon_mul_q15_q15((int16_t)(uint16_t)rt,
+                                 octeon_qmac_lane(rs, lane), &overflow);
+    env->active_tc.LO[0] = octeon_sat32_acc_q31(
+        (int32_t)(uint32_t)env->active_tc.LO[0], product, &overflow);
+    env->active_tc.HI[0] = overflow; /* whole of HI[0] becomes 0 or 1 */
+}
+
 static void octeon_add_limb(uint64_t *sum, int limb_count,
                             uint64_t value, int limb)
 {

-- 
2.54.0


Reply via email to