Re: [Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-08-29 Thread Richard Henderson
On 8/29/19 6:34 AM, Stefan Brankovic wrote:
> Then I ran my performance tests and got the following results (the test calls
> vpkpx 10 times):
> 
> 1) Current helper implementation: ~ 157 ms
> 
> 2) helper implementation you suggested: ~94 ms
> 
> 3) tcg implementation: ~75 ms

I assume you tested in a loop.  If you have just the one expansion, you'll not
see the penalty for the icache expansion.  To show the other extreme, you'd
want to test as separate sequential invocations.
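
For illustration only (not from the thread), a guest-side tight loop of the kind being discussed might look like the sketch below. It assumes the standard AltiVec vec_packpx() intrinsic, a compiler invoked with -maltivec, and an arbitrary iteration count; as noted above, a loop like this exercises only the hot-loop case and hides the icache cost of a large inline expansion.

/* Hypothetical micro-benchmark sketch; values and loop count are arbitrary. */
#include <altivec.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    vector unsigned int a = { 0x00112233, 0x00445566, 0x00778899, 0x00aabbcc };
    vector unsigned int b = { 0x00123456, 0x00654321, 0x00abcdef, 0x00fedcba };
    vector pixel r;
    struct timespec t0, t1;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (long i = 0; i < 100000000L; i++) {
        r = vec_packpx(a, b);          /* compiles to vpkpx */
        a = (vector unsigned int)r;    /* feed the result back so the loop is not hoisted */
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("elapsed: %ld ms\n",
           (long)((t1.tv_sec - t0.tv_sec) * 1000 +
                  (t1.tv_nsec - t0.tv_nsec) / 1000000));
    return 0;
}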

That said, I'd be more interested in a real test case that isn't just calling
one instruction over and over.  Is there a real test case that shows vpkpx in
the top 25 of the profile?  With more than 0.5% of runtime?


r~



Re: [Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-08-29 Thread Stefan Brankovic


On 27.8.19. 20:52, Richard Henderson wrote:

On 8/27/19 2:37 AM, Stefan Brankovic wrote:

+    for (i = 0; i < 4; i++) {
+        switch (i) {
+        case 0:
+            /*
+             * Get high doubleword of vA to perfrom 6-5-5 pack of pixels
+             * 1 and 2.
+             */
+            get_avr64(avr, VA, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 1:
+            /*
+             * Get low doubleword of vA to perfrom 6-5-5 pack of pixels
+             * 3 and 4.
+             */
+            get_avr64(avr, VA, false);
+            break;
+        case 2:
+            /*
+             * Get high doubleword of vB to perfrom 6-5-5 pack of pixels
+             * 5 and 6.
+             */
+            get_avr64(avr, VB, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 3:
+            /*
+             * Get low doubleword of vB to perfrom 6-5-5 pack of pixels
+             * 7 and 8.
+             */
+            get_avr64(avr, VB, false);
+            break;
+        }
+        /* Perform the packing for 2 pixels(each iteration for 1). */
+        tcg_gen_movi_i64(tmp, 0x0ULL);
+        for (j = 0; j < 2; j++) {
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+            tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
+            tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
+            tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+        }
+        if ((i == 0) || (i == 2)) {
+            tcg_gen_shli_i64(tmp, tmp, 32);
+        }
+        tcg_gen_or_i64(result, result, tmp);
+        if (i == 1) {
+            /* Place packed pixels 1:4 to high doubleword of vD. */
+            tcg_gen_mov_i64(result1, result);
+        }
+        if (i == 3) {
+            /* Place packed pixels 5:8 to low doubleword of vD. */
+            tcg_gen_mov_i64(result2, result);
+        }
+    }
+    set_avr64(VT, result1, true);
+    set_avr64(VT, result2, false);

I really have a hard time believing that it is worthwhile to inline all of this
code.  By my count this is 82 non-move opcodes.  That is a *lot* of inline
expansion.

However, I can well imagine that the existing out-of-line helper is less than
optimal.


-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-    int i, j;
-    ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-    const ppc_avr_t *x[2] = { a, b };
-#else
-    const ppc_avr_t *x[2] = { b, a };
-#endif
-
-    VECTOR_FOR_INORDER_I(i, u64) {
-        VECTOR_FOR_INORDER_I(j, u32) {
-            uint32_t e = x[i]->u32[j];

Double indirect loads?


-
-            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
-                                     ((e >> 6) & 0x3e0) |
-                                     ((e >> 3) & 0x1f));

Store to temporary ...


-        }
-    }
-    *r = result;

... and then copy?

Try replacing the existing helper with something like the following.


r~



static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0x3f) << shl;
    r |= ((a >> (shr + 6)) & 0x1f) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;

    return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    return pkpx_1(ah, 32, 48)
         | pkpx_1(ah,  0, 32)
         | pkpx_1(al, 32, 16)
         | pkpx_1(al,  0,  0);
}

void helper_vpkpx(uint64_t *r, uint64_t *a, uint64_t *b)
{
    uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
    uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
    r->VsrD(0) = rh;
    r->VsrD(1) = rl;
}


I implemented vpkpx as you suggested above, with small modifications (so that
it builds and gives the correct result). It looks like this:


static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;

    return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    return pkpx_1(ah, 32, 48)
         | pkpx_1(ah,  0, 32)
         | pkpx_1(al, 32, 16)
         | pkpx_1(al,  0,  0);
}

void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    uint64_t rh = pkpx_2(a->u64[1], a->u64[0]);
    uint64_t rl = pkpx_2(b->u64[1], b->u64[0]);
    r->u64[1] = rh;
    r->u64[0] = rl;
}
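
(Illustration only, not from the thread: a small standalone check of the corrected masks is sketched below. It repeats pkpx_1() from above so the snippet compiles on its own and prints an arbitrary sample pixel packed both by the old per-word formula and by pkpx_1(); the two outputs should match.)

#include <stdint.h>
#include <stdio.h>

/* Corrected pkpx_1() repeated here so this sketch is self-contained. */
static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;

    return r;
}

int main(void)
{
    uint32_t e = 0x01f8f8f8;   /* arbitrary sample source pixel */
    uint16_t r_old = ((e >> 9) & 0xfc00) | ((e >> 6) & 0x3e0) | ((e >> 3) & 0x1f);
    uint16_t r_new = (uint16_t)pkpx_1(e, 0, 0);

    printf("old formula: 0x%04x, pkpx_1: 0x%04x\n", r_old, r_new);
    return 0;
}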

I also noticed that this would work only on little-endian hosts, so we would
need to modify it to support big-endian hosts as well (this shouldn't affect
the performance results).
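
(A possible way to handle that, sketched only and not from the thread: QEMU's ppc_avr_t already provides the VsrD() accessor, which indexes the two 64-bit halves independently of host endianness, so something like the following should work on either host byte order. Untested; it assumes the pkpx_1()/pkpx_2() helpers above.)

/* Sketch of an endian-safe variant using QEMU's VsrD() accessor (untested). */
void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));   /* high doubleword of vA, then low */
    uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));   /* high doubleword of vB, then low */

    r->VsrD(0) = rh;                                /* packed pixels 1:4 */
    r->VsrD(1) = rl;                                /* packed pixels 5:8 */
}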


Then I ran my performance tests and got the following results (the test calls
vpkpx 10 times):


1) Current helper implementation: ~ 157 ms

2) helper implementation you suggested: ~94 ms

3)

Re: [Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-08-27 Thread BALATON Zoltan

On Tue, 27 Aug 2019, Richard Henderson wrote:

On 8/27/19 2:37 AM, Stefan Brankovic wrote:

+    for (i = 0; i < 4; i++) {
+        switch (i) {
+        case 0:
+            /*
+             * Get high doubleword of vA to perfrom 6-5-5 pack of pixels
+             * 1 and 2.
+             */
+            get_avr64(avr, VA, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 1:
+            /*
+             * Get low doubleword of vA to perfrom 6-5-5 pack of pixels
+             * 3 and 4.
+             */
+            get_avr64(avr, VA, false);
+            break;
+        case 2:
+            /*
+             * Get high doubleword of vB to perfrom 6-5-5 pack of pixels
+             * 5 and 6.
+             */
+            get_avr64(avr, VB, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 3:
+            /*
+             * Get low doubleword of vB to perfrom 6-5-5 pack of pixels


If this is replaced by Richard's suggested version it does not matter, but
there's a typo in the above comments: you probably meant perfrom -> perform.


Regards,
BALATON Zoltan



Re: [Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-08-27 Thread Richard Henderson
On 8/27/19 2:37 AM, Stefan Brankovic wrote:
> +    for (i = 0; i < 4; i++) {
> +        switch (i) {
> +        case 0:
> +            /*
> +             * Get high doubleword of vA to perfrom 6-5-5 pack of pixels
> +             * 1 and 2.
> +             */
> +            get_avr64(avr, VA, true);
> +            tcg_gen_movi_i64(result, 0x0ULL);
> +            break;
> +        case 1:
> +            /*
> +             * Get low doubleword of vA to perfrom 6-5-5 pack of pixels
> +             * 3 and 4.
> +             */
> +            get_avr64(avr, VA, false);
> +            break;
> +        case 2:
> +            /*
> +             * Get high doubleword of vB to perfrom 6-5-5 pack of pixels
> +             * 5 and 6.
> +             */
> +            get_avr64(avr, VB, true);
> +            tcg_gen_movi_i64(result, 0x0ULL);
> +            break;
> +        case 3:
> +            /*
> +             * Get low doubleword of vB to perfrom 6-5-5 pack of pixels
> +             * 7 and 8.
> +             */
> +            get_avr64(avr, VB, false);
> +            break;
> +        }
> +        /* Perform the packing for 2 pixels(each iteration for 1). */
> +        tcg_gen_movi_i64(tmp, 0x0ULL);
> +        for (j = 0; j < 2; j++) {
> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
> +            tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
> +            tcg_gen_or_i64(tmp, tmp, shifted);
> +
> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
> +            tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
> +            tcg_gen_or_i64(tmp, tmp, shifted);
> +
> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
> +            tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
> +            tcg_gen_or_i64(tmp, tmp, shifted);
> +        }
> +        if ((i == 0) || (i == 2)) {
> +            tcg_gen_shli_i64(tmp, tmp, 32);
> +        }
> +        tcg_gen_or_i64(result, result, tmp);
> +        if (i == 1) {
> +            /* Place packed pixels 1:4 to high doubleword of vD. */
> +            tcg_gen_mov_i64(result1, result);
> +        }
> +        if (i == 3) {
> +            /* Place packed pixels 5:8 to low doubleword of vD. */
> +            tcg_gen_mov_i64(result2, result);
> +        }
> +    }
> +    set_avr64(VT, result1, true);
> +    set_avr64(VT, result2, false);

I really have a hard time believing that it is worthwhile to inline all of this
code.  By my count this is 82 non-move opcodes.  That is a *lot* of inline
expansion.

However, I can well imagine that the existing out-of-line helper is less than
optimal.

> -void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
> -{
> -    int i, j;
> -    ppc_avr_t result;
> -#if defined(HOST_WORDS_BIGENDIAN)
> -    const ppc_avr_t *x[2] = { a, b };
> -#else
> -    const ppc_avr_t *x[2] = { b, a };
> -#endif
> -
> -    VECTOR_FOR_INORDER_I(i, u64) {
> -        VECTOR_FOR_INORDER_I(j, u32) {
> -            uint32_t e = x[i]->u32[j];

Double indirect loads?

> -
> -            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
> -                                     ((e >> 6) & 0x3e0) |
> -                                     ((e >> 3) & 0x1f));

Store to temporary ...

> -        }
> -    }
> -    *r = result;

... and then copy?

Try replacing the existing helper with something like the following.


r~



static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0x3f) << shl;
    r |= ((a >> (shr + 6)) & 0x1f) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;

    return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    return pkpx_1(ah, 32, 48)
         | pkpx_1(ah,  0, 32)
         | pkpx_1(al, 32, 16)
         | pkpx_1(al,  0,  0);
}

void helper_vpkpx(uint64_t *r, uint64_t *a, uint64_t *b)
{
    uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
    uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
    r->VsrD(0) = rh;
    r->VsrD(1) = rl;
}



[Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-08-27 Thread Stefan Brankovic
Optimize altivec instruction vpkpx (Vector Pack Pixel).
It rearranges 8 pixels coded in a 6-5-5 pattern (4 from each source register)
into a contiguous array of bits in the destination register.

In each iteration of the outer loop, a 6-5-5 pack is performed on the 2 pixels
held in one doubleword element of one source register. The first step of the
outer loop is choosing which doubleword of which register is used in the
current iteration; it is loaded into the avr variable. Next, the 6-5-5 pack of
the two pixels in avr is performed in the inner for loop (2 iterations, one per
pixel) and the result is saved in the tmp variable. At the end of the outer
loop, tmp is merged into the result variable, and every second iteration (once
a whole doubleword has been processed) the result is stored in the appropriate
doubleword element of vD. The outer loop has 4 iterations.
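
(As an illustration only, not part of the patch: the per-doubleword packing that the generated TCG sequence performs is equivalent to the following scalar C, using the same mask1/mask2/mask3 values as the translator below.)

#include <stdint.h>

/*
 * Scalar model of one outer-loop iteration: packs the 2 pixels of one
 * source doubleword into the low 32 bits of the returned value.
 */
static uint64_t pack_two_pixels(uint64_t avr)
{
    const uint64_t mask1 = 0x1fULL;         /* bits 4:0 of each packed pixel  */
    const uint64_t mask2 = 0x1fULL << 5;    /* bits 9:5                       */
    const uint64_t mask3 = 0x3fULL << 10;   /* bits 15:10                     */
    uint64_t tmp = 0;
    int j;

    for (j = 0; j < 2; j++) {
        tmp |= (avr >> (j * 16 + 3)) & (mask1 << (j * 16));
        tmp |= (avr >> (j * 16 + 6)) & (mask2 << (j * 16));
        tmp |= (avr >> (j * 16 + 9)) & (mask3 << (j * 16));
    }
    return tmp;
}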

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 
 target/ppc/translate/vmx-impl.inc.c | 99 -
 3 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 54ea9b9..940a115 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,7 +258,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 46deb57..9ff3b03 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1262,27 +1262,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-    int i, j;
-    ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-    const ppc_avr_t *x[2] = { a, b };
-#else
-    const ppc_avr_t *x[2] = { b, a };
-#endif
-
-    VECTOR_FOR_INORDER_I(i, u64) {
-        VECTOR_FOR_INORDER_I(j, u32) {
-            uint32_t e = x[i]->u32[j];
-
-            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
-                                     ((e >> 6) & 0x3e0) |
-                                     ((e >> 3) & 0x1f));
-        }
-    }
-    *r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c
index 0d71c10..45a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -571,6 +571,103 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into contigous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+    int VT = rD(ctx->opcode);
+    int VA = rA(ctx->opcode);
+    int VB = rB(ctx->opcode);
+    TCGv_i64 tmp = tcg_temp_new_i64();
+    TCGv_i64 shifted = tcg_temp_new_i64();
+    TCGv_i64 avr = tcg_temp_new_i64();
+    TCGv_i64 result = tcg_temp_new_i64();
+    TCGv_i64 result1 = tcg_temp_new_i64();
+    TCGv_i64 result2 = tcg_temp_new_i64();
+    int64_t mask1 = 0x1fULL;
+    int64_t mask2 = 0x1fULL << 5;
+    int64_t mask3 = 0x3fULL << 10;
+    int i, j;
+    /*
+     * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+     * element of each source register.
+     */
+    for (i = 0; i < 4; i++) {
+        switch (i) {
+        case 0:
+            /*
+             * Get high doubleword of vA to perfrom 6-5-5 pack of pixels
+             * 1 and 2.
+             */
+            get_avr64(avr, VA, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 1:
+            /*
+             * Get low doubleword of vA to perfrom 6-5-5 pack of pixels
+             * 3 and 4.
+             */
+            get_avr64(avr, VA, false);
+            break;
+        case 2:
+            /*
+             * Get high doubleword of vB to perfrom 6-5-5 pack of pixels
+             * 5 and 6.
+             */
+            get_avr64(avr, VB, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 3:
+            /*
+             * Get low doubleword of vB to perfrom 6-5-5 pack of pixels
+             * 7 and 8.
+             */
+            get_avr64(avr, VB, false);
+            break;
+        }
+        /* Perform the packing for 2 pixels(each iteration for 1). */
+        tcg_gen_movi_i64(tmp, 0x0ULL);
+        for (j = 0; j < 2; j++) {
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+