From: Mateja Marjanovic <mateja.marjano...@rt-rk.com>

Optimize the MSA instructions ILVEV.<B|H|W|D> by operating
directly on TCG registers instead of calling helpers.

In the following table, the first column shows the performance
before this patch. The second shows the performance after
converting from helpers to TCG, without using the
tcg_gen_deposit function. The third uses the tcg_gen_deposit
function together with a uint64_t constant bit mask, and the
fourth uses the tcg_gen_deposit function together with a mask
that is a TCG constant. The fourth variant is the one
implemented in this patch.

Performance was measured by executing each instruction
10 million times on a computer with an
Intel Core i7-3770 CPU @ 3.40GHz×8.

===================================================================
|| instruction ||      1     ||     2    ||     3    ||     4    ||
===================================================================
||   ilvev.b   || 107.592 ms || 2.432 ms || 2.381 ms || 2.599 ms ||
||   ilvev.h   ||  83.422 ms || 2.352 ms || 2.623 ms || 2.532 ms ||
||   ilvev.w   || 109.300 ms || 2.342 ms || 2.329 ms || 2.266 ms ||
||   ilvev.d   ||  30.915 ms || 1.926 ms || 2.002 ms || 1.976 ms ||
===================================================================
 1 - before
 2 - no-deposit-no-mask-as-tcg-constant
 3 - with-deposit-no-mask-as-tcg-constant
 4 - with-deposit-with-mask-as-tcg-constant (final)

The deposit function is used only in ILVEV.W.

No-deposit version of the ILVEV.W implementation:

/*
 * ILVEV.W built from and/shift/or only (no tcg_gen_deposit_i64).
 * For each of the two 64-bit halves of the vector register: keep the
 * low 32 bits of wt in place, move the low 32 bits of ws into the
 * upper 32 bits, and store the combination into wd.
 */
static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd,
                               uint32_t ws, uint32_t wt)
{
    const uint64_t low_word = 0x00000000ffffffffULL;
    TCGv_i64 lo = tcg_temp_new_i64();
    TCGv_i64 hi = tcg_temp_new_i64();
    int half;

    for (half = 0; half < 2; half++) {
        /* Both sources are read before wd is written, so wd may alias. */
        tcg_gen_andi_i64(lo, msa_wr_d[wt * 2 + half], low_word);
        tcg_gen_andi_i64(hi, msa_wr_d[ws * 2 + half], low_word);
        tcg_gen_shli_i64(hi, hi, 32);
        tcg_gen_or_i64(msa_wr_d[wd * 2 + half], lo, hi);
    }

    tcg_temp_free_i64(lo);
    tcg_temp_free_i64(hi);
}

Reviewed-by: Richard Henderson <richard.hender...@linaro.org>
Suggested-by: Aleksandar Markovic <amarko...@wavecomp.com>
Suggested-by: Philippe Mathieu-Daudé <phi...@redhat.com>
Suggested-by: Richard Henderson <richard.hender...@linaro.org>
Signed-off-by: Mateja Marjanovic <mateja.marjano...@rt-rk.com>
---
 target/mips/helper.h     |  1 -
 target/mips/msa_helper.c |  9 -----
 target/mips/translate.c  | 87 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 86 insertions(+), 11 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index d162836..2f23b0d 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -864,7 +864,6 @@ DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 9e52a31..a500c59 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1197,15 +1197,6 @@ MSA_FN_DF(ilvl_df)
     } while (0)
 MSA_FN_DF(ilvr_df)
 #undef MSA_DO
-
-#define MSA_DO(DF)                      \
-    do {                                \
-        pwx->DF[2*i]   = pwt->DF[2*i];  \
-        pwx->DF[2*i+1] = pws->DF[2*i];  \
-    } while (0)
-MSA_FN_DF(ilvev_df)
-#undef MSA_DO
-
 #undef MSA_LOOP_COND
 
 #define MSA_LOOP_COND(DF) \
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 99bd441..930ef3a 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28075,6 +28075,76 @@ static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
     tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
 }
 
+
+/*
+ * [MSA] ILVEV.<B|H> wd, ws, wt
+ *
+ *   Vector Interleave Even (<byte|halfword> data elements)
+ *   Per 64-bit half: wd = (wt & mask) | ((ws & mask) << shift).
+ */
+static inline void gen_ilvev_bh(CPUMIPSState *env, uint32_t wd,
+                                uint32_t ws, uint32_t wt,
+                                uint64_t mask, uint32_t shift)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    TCGv_i64 mask_tcg = tcg_const_i64(mask);
+
+    tcg_gen_and_i64(t1, msa_wr_d[wt * 2], mask_tcg); /* doubleword 0 */
+    tcg_gen_and_i64(t2, msa_wr_d[ws * 2], mask_tcg);
+    tcg_gen_shli_i64(t2, t2, shift);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2);
+
+    tcg_gen_and_i64(t1, msa_wr_d[wt * 2 + 1], mask_tcg); /* doubleword 1 */
+    tcg_gen_and_i64(t2, msa_wr_d[ws * 2 + 1], mask_tcg);
+    tcg_gen_shli_i64(t2, t2, shift);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t2);
+
+    tcg_temp_free_i64(mask_tcg);
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    gen_ilvev_bh(env, wd, ws, wt, 0x00ff00ff00ff00ffULL, 8); /* 8-bit lanes */
+}
+
+static inline void gen_ilvev_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    gen_ilvev_bh(env, wd, ws, wt, 0x0000ffff0000ffffULL, 16); /* 16-bit lanes */
+}
+
+/*
+ * [MSA] ILVEV.W wd, ws, wt
+ *
+ *   Vector Interleave Even (word data elements)
+ *   Per 64-bit half: wt with bits 63..32 replaced by bits 31..0 of ws.
+ */
+static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    tcg_gen_deposit_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2],
+                        msa_wr_d[ws * 2], 32, 32);
+    tcg_gen_deposit_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[wt * 2 + 1],
+                        msa_wr_d[ws * 2 + 1], 32, 32);
+}
+
+/*
+ * [MSA] ILVEV.D wd, ws, wt
+ *
+ *   Vector Interleave Even (Doubleword data elements)
+ *   wd[1] = ws[0] is emitted first so ws[0] is still intact if wd == ws.
+ */
+static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -28231,7 +28301,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
         gen_helper_msa_mod_s_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVEV_df:
-        gen_helper_msa_ilvev_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvev_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvev_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvev_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvev_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
     case OPC_BINSR_df:
         gen_helper_msa_binsr_df(cpu_env, tdf, twd, tws, twt);
-- 
2.7.4


Reply via email to