Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

2019-03-19 Thread Richard Henderson
On 3/19/19 4:28 AM, Mateja Marjanovic wrote:
> +tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +tcg_gen_shri_i64(t1, t1, 32);
> +tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);

Deposit, again.


r~



Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

2019-03-19 Thread Aleksandar Markovic
> From: Mateja Marjanovic 
> Sent: Tuesday, March 19, 2019 12:28 PM
> To: qemu-devel@nongnu.org
> Cc: aurel...@aurel32.net; Aleksandar Markovic; Aleksandar Rikalo
> Subject: [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions
> 
> From: Mateja Marjanovic 
> 
> Optimize set of MSA instructions ILVOD, using directly\

Use full instructions names.

> tcg registers and performing logic on them instead of
> using helpers.
> 
>  instr||   before||   after
> ==
>  ilvod.b  ||  117.50 ms  ||  24.99 ms
>  ilvod.h  ||   93.16 ms  ||  24.27 ms
>  ilvod.w  ||  119.90 ms  ||  24.81 ms
>  ilvod.d  ||   43.01 ms  ||  20.69 ms
> 

How are these numbers obtained? You should even insert the
source code of the test program you are using, complete with
the compilation command line and the execution command line,
and basic data on the host.

How do you interpret the strange pattern of numbers in
"before" column?

> Signed-off-by: Mateja Marjanovic 
> ---
>  target/mips/helper.h |   1 -
>  target/mips/msa_helper.c |   7 ---
>  target/mips/translate.c  | 111 
> ++-
>  3 files changed, 110 insertions(+), 9 deletions(-)
> 
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index a6d687e..d162836 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index c74e3cd..9e52a31 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1206,13 +1206,6 @@ MSA_FN_DF(ilvr_df)
>  MSA_FN_DF(ilvev_df)
>  #undef MSA_DO
> 
> -#define MSA_DO(DF)  \
> -do {\
> -pwx->DF[2*i]   = pwt->DF[2*i+1];\
> -pwx->DF[2*i+1] = pws->DF[2*i+1];\
> -} while (0)
> -MSA_FN_DF(ilvod_df)
> -#undef MSA_DO
>  #undef MSA_LOOP_COND
> 
>  #define MSA_LOOP_COND(DF) \
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index 5848f7a..10c5c55 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28884,6 +28884,100 @@ static void gen_msa_bit(CPUMIPSState *env, 
> DisasContext *ctx)
>  tcg_temp_free_i32(tws);
>  }
> 
> +/*
> + * [MSA] ILVOD.B wd, ws, wt
> + *
> + *   Vector Interleave Odd (byte data elements)
> + *
> + */
> +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
> +   uint32_t ws, uint32_t wt)
> +{
> +TCGv_i64 t0 = tcg_temp_new_i64();
> +TCGv_i64 t1 = tcg_temp_new_i64();
> +const uint64_t mask = 0xff00ff00ff00ff00ULL;
> +
> +tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +tcg_gen_shri_i64(t1, t1, 8);
> +tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
> +

It is very odd (and confusing for future readers) that you
use t1 first in this function. If variables are named t0, t1,
etc., than their first uses in the function in question should
be in the same order.

(the same comment applies to other similar cases in this
series)

> +tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +tcg_gen_shri_i64(t1, t1, 8);
> +tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
> +
> +tcg_temp_free_i64(t0);
> +tcg_temp_free_i64(t1);
> +}
> +
> +/*
> + * [MSA] ILVOD.H wd, ws, wt
> + *
> + *   Vector Interleave Odd (halfword data elements)
> + *
> + */
> +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
> +   uint32_t ws, uint32_t wt)
> +{
> +TCGv_i64 t0 = tcg_temp_new_i64();
> +TCGv_i64 t1 = tcg_temp_new_i64();
> +const uint64_t mask = 0xULL;
> +
> +tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +tcg_gen_shri_i64(t1, t1, 16);
> +tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
> +
> +tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +tcg_gen_shri_i64(t1, t1, 16);
> +tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
> +
> +tcg_temp_free_i64(t0);
> +tcg_temp_free_i64(t1);
> +}
> +
> +/*
> + * [MSA] ILVOD.W wd, ws, wt
> + *
> + *   Vector Interleave Odd (word data elements)
> + *
> + */
> +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
> +   uint32_t ws, uint32_t wt)
> +{
> +TCGv_i64 t0 = tcg_temp_new_i64();
> +TCGv_i64 t1 = tc

[Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

2019-03-19 Thread Mateja Marjanovic
From: Mateja Marjanovic 

Optimize set of MSA instructions ILVOD, using directly
tcg registers and performing logic on them instead of
using helpers.

 instr||   before||   after
==
 ilvod.b  ||  117.50 ms  ||  24.99 ms
 ilvod.h  ||   93.16 ms  ||  24.27 ms
 ilvod.w  ||  119.90 ms  ||  24.81 ms
 ilvod.d  ||   43.01 ms  ||  20.69 ms

Signed-off-by: Mateja Marjanovic 
---
 target/mips/helper.h |   1 -
 target/mips/msa_helper.c |   7 ---
 target/mips/translate.c  | 111 ++-
 3 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index a6d687e..d162836 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index c74e3cd..9e52a31 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1206,13 +1206,6 @@ MSA_FN_DF(ilvr_df)
 MSA_FN_DF(ilvev_df)
 #undef MSA_DO
 
-#define MSA_DO(DF)  \
-do {\
-pwx->DF[2*i]   = pwt->DF[2*i+1];\
-pwx->DF[2*i+1] = pws->DF[2*i+1];\
-} while (0)
-MSA_FN_DF(ilvod_df)
-#undef MSA_DO
 #undef MSA_LOOP_COND
 
 #define MSA_LOOP_COND(DF) \
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 5848f7a..10c5c55 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28884,6 +28884,100 @@ static void gen_msa_bit(CPUMIPSState *env, 
DisasContext *ctx)
 tcg_temp_free_i32(tws);
 }
 
+/*
+ * [MSA] ILVOD.B wd, ws, wt
+ *
+ *   Vector Interleave Odd (byte data elements)
+ *
+ */
+static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
+   uint32_t ws, uint32_t wt)
+{
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+const uint64_t mask = 0xff00ff00ff00ff00ULL;
+
+tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+tcg_gen_shri_i64(t1, t1, 8);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
+
+tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
+tcg_gen_shri_i64(t1, t1, 8);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
+
+tcg_temp_free_i64(t0);
+tcg_temp_free_i64(t1);
+}
+
+/*
+ * [MSA] ILVOD.H wd, ws, wt
+ *
+ *   Vector Interleave Odd (halfword data elements)
+ *
+ */
+static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
+   uint32_t ws, uint32_t wt)
+{
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+const uint64_t mask = 0xULL;
+
+tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+tcg_gen_shri_i64(t1, t1, 16);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
+
+tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
+tcg_gen_shri_i64(t1, t1, 16);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
+
+tcg_temp_free_i64(t0);
+tcg_temp_free_i64(t1);
+}
+
+/*
+ * [MSA] ILVOD.W wd, ws, wt
+ *
+ *   Vector Interleave Odd (word data elements)
+ *
+ */
+static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
+   uint32_t ws, uint32_t wt)
+{
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+const uint64_t mask = 0xULL;
+
+tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
+tcg_gen_shri_i64(t1, t1, 32);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0);
+
+tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
+tcg_gen_shri_i64(t1, t1, 32);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0);
+
+tcg_temp_free_i64(t0);
+tcg_temp_free_i64(t1);
+}
+
+/*
+ * [MSA] ILVOD.D wd, ws, wt
+ *
+ *   Vector Interleave Odd (double data elements)
+ *
+ */
+static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
+   uint32_t ws, uint32_t wt)
+{
+tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
+tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)(MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29055,7 +29149,22 @@ static void gen_msa_3r(CPUMIPSState *env, D

Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

2019-03-18 Thread Mateja Marjanovic

On 15.3.19. 17:46, Aleksandar Markovic wrote:

From: Mateja Marjanovic 
Subject: [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

From: Mateja Marjanovic 

Optimize set of MSA instructions ILVOD, using directly
tcg registers and performing logic on them insted of
using helpers.

insted -> instead

Fix in v2.

Performance measurement is done by executing the
instructions large number of times on a computer
with Intel Core i7-3770 CPU @ 3.40GHz×8.

  instruction ||before||after   ||
==
  ilvod.b:||   66.97 ms   ||  26.34 ms  ||
  ilvod.h:||   44.75 ms   ||  25.17 ms  ||
  ilvod.w:||   41.27 ms   ||  24.37 ms  ||
  ilvod.d:||   41.75 ms   ||  20.50 ms  ||


Alignment looks wrong.

Fix in v2.

Signed-off-by: Mateja Marjanovic 
---
  target/mips/helper.h |   1 -
  target/mips/msa_helper.c |  51 
  target/mips/translate.c  | 119 ++-
  3 files changed, 118 insertions(+), 53 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index a6d687e..d162836 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 9d9dafe..cbcfd57 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, 
uint32_t > wd,
  }
  }

-void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
- uint32_t ws, uint32_t wt)
-{
-wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-wr_t *pws = &(env->active_fpu.fpr[ws].wr);
-wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
-switch (df) {
-case DF_BYTE:
-pwd->b[0]  = pwt->b[1];
-pwd->b[1]  = pws->b[1];
-pwd->b[2]  = pwt->b[3];
-pwd->b[3]  = pws->b[3];
-pwd->b[4]  = pwt->b[5];
-pwd->b[5]  = pws->b[5];
-pwd->b[6]  = pwt->b[7];
-pwd->b[7]  = pws->b[7];
-pwd->b[8]  = pwt->b[9];
-pwd->b[9]  = pws->b[9];
-pwd->b[10] = pwt->b[11];
-pwd->b[11] = pws->b[11];
-pwd->b[12] = pwt->b[13];
-pwd->b[13] = pws->b[13];
-pwd->b[14] = pwt->b[15];
-pwd->b[15] = pws->b[15];
-break;
-case DF_HALF:
-pwd->h[0] = pwt->h[1];
-pwd->h[1] = pws->h[1];
-pwd->h[2] = pwt->h[3];
-pwd->h[3] = pws->h[3];
-pwd->h[4] = pwt->h[5];
-pwd->h[5] = pws->h[5];
-pwd->h[6] = pwt->h[7];
-pwd->h[7] = pws->h[7];
-break;
-case DF_WORD:
-pwd->w[0] = pwt->w[1];
-pwd->w[1] = pws->w[1];
-pwd->w[2] = pwt->w[3];
-pwd->w[3] = pws->w[3];
-break;
-case DF_DOUBLE:
-pwd->d[0] = pwt->d[1];
-pwd->d[1] = pws->d[1];
-break;
-default:
-assert(0);
-}
-}
-
  void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
  uint32_t ws, uint32_t wt)
  {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index b4a1103..101d2de 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, 
DisasContext *ctx)
  tcg_temp_free_i32(tws);
  }

+static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
+   uint32_t ws, uint32_t wt) {

Please insert a brief comment before each handler, like in this example:

/*
  * [MSA] ILVOD.B wd, ws, wt
  *
  *   Vector Interleave Odd (byte data elements)
  *
  */

Will do in v2.

+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+
+uint64_t mask = (1ULL << 8) - 1;

There shouldn't be a blank line between declarations. The blank line
should be after the last declaration.

You are right.

+mask |= mask << 16;
+mask |= mask << 32;
+mask <<= 8;
+tcg_gen_movi_i64(t1, 0);
+
+tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+tcg_gen_shri_i64(t0, t0, 8);
+tcg_gen_or_i64(t1, t1, t0);

If t1 is already initialized to 0, what is the purpose of OR-ing?


+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+tcg_gen_or_i64(t1, t1, t0);
+
+tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+tcg_gen_movi_i64(t1, 0);
+
+tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+tcg_gen_shri_i64(t0, t0, 8);
+tcg_gen_or_i64(t1, t1, t0);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], m

Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

2019-03-15 Thread Richard Henderson
On 3/15/19 5:02 AM, Mateja Marjanovic wrote:
> From: Mateja Marjanovic 
> 
> Optimize set of MSA instructions ILVOD, using directly
> tcg registers and performing logic on them insted of
> using helpers.
> Performance measurement is done by executing the
> instructions large number of times on a computer
> with Intel Core i7-3770 CPU @ 3.40GHz×8.
> 
>  instruction ||before||after   ||
> ==
>  ilvod.b:  ||   66.97 ms   ||  26.34 ms  ||
>  ilvod.h:  ||   44.75 ms   ||  25.17 ms  ||
>  ilvod.w:  ||   41.27 ms   ||  24.37 ms  ||
>  ilvod.d:  ||   41.75 ms   ||  20.50 ms  ||
> 
> Signed-off-by: Mateja Marjanovic 
> ---
>  target/mips/helper.h |   1 -
>  target/mips/msa_helper.c |  51 
>  target/mips/translate.c  | 119 
> ++-
>  3 files changed, 118 insertions(+), 53 deletions(-)
> 
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index a6d687e..d162836 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 9d9dafe..cbcfd57 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t 
> df, uint32_t wd,
>  }
>  }
>  
> -void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
> - uint32_t ws, uint32_t wt)
> -{
> -wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> -wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> -wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> -
> -switch (df) {
> -case DF_BYTE:
> -pwd->b[0]  = pwt->b[1];
> -pwd->b[1]  = pws->b[1];
> -pwd->b[2]  = pwt->b[3];
> -pwd->b[3]  = pws->b[3];
> -pwd->b[4]  = pwt->b[5];
> -pwd->b[5]  = pws->b[5];
> -pwd->b[6]  = pwt->b[7];
> -pwd->b[7]  = pws->b[7];
> -pwd->b[8]  = pwt->b[9];
> -pwd->b[9]  = pws->b[9];
> -pwd->b[10] = pwt->b[11];
> -pwd->b[11] = pws->b[11];
> -pwd->b[12] = pwt->b[13];
> -pwd->b[13] = pws->b[13];
> -pwd->b[14] = pwt->b[15];
> -pwd->b[15] = pws->b[15];
> -break;
> -case DF_HALF:
> -pwd->h[0] = pwt->h[1];
> -pwd->h[1] = pws->h[1];
> -pwd->h[2] = pwt->h[3];
> -pwd->h[3] = pws->h[3];
> -pwd->h[4] = pwt->h[5];
> -pwd->h[5] = pws->h[5];
> -pwd->h[6] = pwt->h[7];
> -pwd->h[7] = pws->h[7];
> -break;
> -case DF_WORD:
> -pwd->w[0] = pwt->w[1];
> -pwd->w[1] = pws->w[1];
> -pwd->w[2] = pwt->w[3];
> -pwd->w[3] = pws->w[3];
> -break;
> -case DF_DOUBLE:
> -pwd->d[0] = pwt->d[1];
> -pwd->d[1] = pws->d[1];
> -break;
> -default:
> -assert(0);
> -}
> -}
> -
>  void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>  uint32_t ws, uint32_t wt)
>  {
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index b4a1103..101d2de 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, 
> DisasContext *ctx)
>  tcg_temp_free_i32(tws);
>  }
>  
> +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
> +   uint32_t ws, uint32_t wt) {

{ on next line.

> +TCGv_i64 t0 = tcg_temp_new_i64();
> +TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +uint64_t mask = (1ULL << 8) - 1;
> +mask |= mask << 16;
> +mask |= mask << 32;
> +mask <<= 8;

This is a constant.  Clearer to just write 0xff00ff00ff00ff00ull;

> +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
> +   uint32_t ws, uint32_t wt) {

Likewise.

> +uint64_t mask = (1ULL << 16) - 1;
> +mask |= mask << 32;
> +mask <<= 16;

0xull

> +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
> +   uint32_t ws, uint32_t wt) {

Likewise.

> +tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
> +tcg_gen_shri_i64(t0, t0, 32);
> +tcg_gen_or_i64(t1, t1, t0);
> +tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +tcg_gen_or_i64(t1, t1, t0);

This can fold down to deposit.

tcg_gen_shri_i64(t0, msa_wr_d[wt * 2], 32);
tcg_gen_deposit_i64(msa_wr_d[wd * 2], msa_wr_d[ws

Re: [Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

2019-03-15 Thread Aleksandar Markovic
> From: Mateja Marjanovic 
> Subject: [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions
> 
> From: Mateja Marjanovic 
> 
> Optimize set of MSA instructions ILVOD, using directly
> tcg registers and performing logic on them insted of
> using helpers.

insted -> instead

> Performance measurement is done by executing the
> instructions large number of times on a computer
> with Intel Core i7-3770 CPU @ 3.40GHz×8.
> 
>  instruction ||before||after   ||
> ==
>  ilvod.b:||   66.97 ms   ||  26.34 ms  ||
>  ilvod.h:||   44.75 ms   ||  25.17 ms  ||
>  ilvod.w:||   41.27 ms   ||  24.37 ms  ||
>  ilvod.d:||   41.75 ms   ||  20.50 ms  ||
> 

Alignment looks wrong.

> Signed-off-by: Mateja Marjanovic 
> ---
>  target/mips/helper.h |   1 -
>  target/mips/msa_helper.c |  51 
>  target/mips/translate.c  | 119 
> ++-
>  3 files changed, 118 insertions(+), 53 deletions(-)
> 
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index a6d687e..d162836 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 9d9dafe..cbcfd57 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t 
> df, uint32_t > wd,
>  }
>  }
> 
> -void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
> - uint32_t ws, uint32_t wt)
> -{
> -wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> -wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> -wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> -
> -switch (df) {
> -case DF_BYTE:
> -pwd->b[0]  = pwt->b[1];
> -pwd->b[1]  = pws->b[1];
> -pwd->b[2]  = pwt->b[3];
> -pwd->b[3]  = pws->b[3];
> -pwd->b[4]  = pwt->b[5];
> -pwd->b[5]  = pws->b[5];
> -pwd->b[6]  = pwt->b[7];
> -pwd->b[7]  = pws->b[7];
> -pwd->b[8]  = pwt->b[9];
> -pwd->b[9]  = pws->b[9];
> -pwd->b[10] = pwt->b[11];
> -pwd->b[11] = pws->b[11];
> -pwd->b[12] = pwt->b[13];
> -pwd->b[13] = pws->b[13];
> -pwd->b[14] = pwt->b[15];
> -pwd->b[15] = pws->b[15];
> -break;
> -case DF_HALF:
> -pwd->h[0] = pwt->h[1];
> -pwd->h[1] = pws->h[1];
> -pwd->h[2] = pwt->h[3];
> -pwd->h[3] = pws->h[3];
> -pwd->h[4] = pwt->h[5];
> -pwd->h[5] = pws->h[5];
> -pwd->h[6] = pwt->h[7];
> -pwd->h[7] = pws->h[7];
> -break;
> -case DF_WORD:
> -pwd->w[0] = pwt->w[1];
> -pwd->w[1] = pws->w[1];
> -pwd->w[2] = pwt->w[3];
> -pwd->w[3] = pws->w[3];
> -break;
> -case DF_DOUBLE:
> -pwd->d[0] = pwt->d[1];
> -pwd->d[1] = pws->d[1];
> -break;
> -default:
> -assert(0);
> -}
> -}
> -
>  void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>  uint32_t ws, uint32_t wt)
>  {
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index b4a1103..101d2de 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, 
> DisasContext *ctx)
>  tcg_temp_free_i32(tws);
>  }
> 
> +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
> +   uint32_t ws, uint32_t wt) {

Please insert a brief comment before each handler, like in this example:

/*
 * [MSA] ILVOD.B wd, ws, wt
 *
 *   Vector Interleave Odd (byte data elements)
 *
 */

> +TCGv_i64 t0 = tcg_temp_new_i64();
> +TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +uint64_t mask = (1ULL << 8) - 1;

There shouldn't be a blank line between declarations. The blank line
should be after the last declaration.

> +mask |= mask << 16;
> +mask |= mask << 32;
> +mask <<= 8;
> +tcg_gen_movi_i64(t1, 0);
> +
> +tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
> +tcg_gen_shri_i64(t0, t0, 8);
> +tcg_gen_or_i64(t1, t1, t0);

If t1 is already initialized to 0, what is the purpose of OR-ing?

> +tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +tcg_gen_or_i64(t1, t1, t0);
> +
> +tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
> +
> +tcg_gen_movi_i64(t1, 0);
> +
> +tcg_gen_andi_i64(t0, msa_wr_d[w

[Qemu-devel] [PATCH 1/2] target/mips: Optimize ILVOD. MSA instructions

2019-03-15 Thread Mateja Marjanovic
From: Mateja Marjanovic 

Optimize set of MSA instructions ILVOD, using directly
tcg registers and performing logic on them insted of
using helpers.
Performance measurement is done by executing the
instructions large number of times on a computer
with Intel Core i7-3770 CPU @ 3.40GHz×8.

 instruction ||before||after   ||
==
 ilvod.b:||   66.97 ms   ||  26.34 ms  ||
 ilvod.h:||   44.75 ms   ||  25.17 ms  ||
 ilvod.w:||   41.27 ms   ||  24.37 ms  ||
 ilvod.d:||   41.75 ms   ||  20.50 ms  ||

Signed-off-by: Mateja Marjanovic 
---
 target/mips/helper.h |   1 -
 target/mips/msa_helper.c |  51 
 target/mips/translate.c  | 119 ++-
 3 files changed, 118 insertions(+), 53 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index a6d687e..d162836 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 9d9dafe..cbcfd57 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, 
uint32_t wd,
 }
 }
 
-void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
- uint32_t ws, uint32_t wt)
-{
-wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-wr_t *pws = &(env->active_fpu.fpr[ws].wr);
-wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
-switch (df) {
-case DF_BYTE:
-pwd->b[0]  = pwt->b[1];
-pwd->b[1]  = pws->b[1];
-pwd->b[2]  = pwt->b[3];
-pwd->b[3]  = pws->b[3];
-pwd->b[4]  = pwt->b[5];
-pwd->b[5]  = pws->b[5];
-pwd->b[6]  = pwt->b[7];
-pwd->b[7]  = pws->b[7];
-pwd->b[8]  = pwt->b[9];
-pwd->b[9]  = pws->b[9];
-pwd->b[10] = pwt->b[11];
-pwd->b[11] = pws->b[11];
-pwd->b[12] = pwt->b[13];
-pwd->b[13] = pws->b[13];
-pwd->b[14] = pwt->b[15];
-pwd->b[15] = pws->b[15];
-break;
-case DF_HALF:
-pwd->h[0] = pwt->h[1];
-pwd->h[1] = pws->h[1];
-pwd->h[2] = pwt->h[3];
-pwd->h[3] = pws->h[3];
-pwd->h[4] = pwt->h[5];
-pwd->h[5] = pws->h[5];
-pwd->h[6] = pwt->h[7];
-pwd->h[7] = pws->h[7];
-break;
-case DF_WORD:
-pwd->w[0] = pwt->w[1];
-pwd->w[1] = pws->w[1];
-pwd->w[2] = pwt->w[3];
-pwd->w[3] = pws->w[3];
-break;
-case DF_DOUBLE:
-pwd->d[0] = pwt->d[1];
-pwd->d[1] = pws->d[1];
-break;
-default:
-assert(0);
-}
-}
-
 void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
 uint32_t ws, uint32_t wt)
 {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index b4a1103..101d2de 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, 
DisasContext *ctx)
 tcg_temp_free_i32(tws);
 }
 
+static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
+   uint32_t ws, uint32_t wt) {
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+
+uint64_t mask = (1ULL << 8) - 1;
+mask |= mask << 16;
+mask |= mask << 32;
+mask <<= 8;
+tcg_gen_movi_i64(t1, 0);
+
+tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+tcg_gen_shri_i64(t0, t0, 8);
+tcg_gen_or_i64(t1, t1, t0);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+tcg_gen_or_i64(t1, t1, t0);
+
+tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+tcg_gen_movi_i64(t1, 0);
+
+tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+tcg_gen_shri_i64(t0, t0, 8);
+tcg_gen_or_i64(t1, t1, t0);
+tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+tcg_gen_or_i64(t1, t1, t0);
+
+tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+tcg_temp_free_i64(t0);
+tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
+   uint32_t ws, uint32_t wt) {
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+
+uint64_t mask = (1ULL << 16) - 1;
+mask |= mask << 32;
+mask <<= 16;
+tcg_gen_movi_i64(t1, 0);
+
+tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+tcg_gen_shri_i64(t0, t0, 16);
+tcg_gen_or_i64(t1, t1, t0);
+tcg_gen_andi_i64(t