[PATCH v2 0/1] Add Loongson 2F disassembler

2020-07-03 Thread Stefan Brankovic
This patch adds a disassembler for the Loongson 2F architecture.

v2:
Fixed coding style problems.
Added license and author comments.

Stefan Brankovic (1):
  disas: mips: Add Loongson 2F disassembler

 MAINTAINERS |1 +
 configure   |1 +
 disas/Makefile.objs |1 +
 disas/loongson2f.cpp| 8154 +++
 disas/loongson2f.h  | 2562 
 include/disas/dis-asm.h |1 +
 include/exec/poison.h   |1 +
 target/mips/cpu.c   |4 +
 8 files changed, 10725 insertions(+)
 create mode 100644 disas/loongson2f.cpp
 create mode 100644 disas/loongson2f.h

-- 
2.17.1




Re: [PATCH 1/1] disas: mips: Add Loongson 2F disassembler

2020-07-03 Thread Stefan Brankovic



On 3.7.20. 12:09, Thomas Huth wrote:

On 03/07/2020 11.49, Stefan Brankovic wrote:

On 3.7.20. 09:59, Thomas Huth wrote:

On 02/07/2020 21.42, Stefan Brankovic wrote:

Add disassembler for Loongson 2F instruction set.

Testing is done by comparing QEMU disassembly output, obtained by
using the '-d in_asm' command line option, with the corresponding objdump output.

Signed-off-by: Stefan Brankovic 
---
   MAINTAINERS |    1 +
   configure   |    1 +
   disas/Makefile.objs |    1 +
   disas/loongson2f.cpp    | 8134 +++
   disas/loongson2f.h  | 2542 
   include/disas/dis-asm.h |    1 +
   include/exec/poison.h   |    1 +
   target/mips/cpu.c   |    4 +
   8 files changed, 10685 insertions(+)
   create mode 100644 disas/loongson2f.cpp
   create mode 100644 disas/loongson2f.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 3abe3faa4e..913ed2a6d3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -219,6 +219,7 @@ S: Maintained
   F: target/mips/
   F: default-configs/*mips*
   F: disas/*mips*
+F: disas/loongson*
   F: docs/system/cpu-models-mips.rst.inc
   F: hw/intc/mips_gic.c
   F: hw/mips/
diff --git a/configure b/configure
index 597e909b53..e163dac53e 100755
--- a/configure
+++ b/configure
@@ -8102,6 +8102,7 @@ for i in $ARCH $TARGET_BASE_ARCH ; do
   disas_config "MIPS"
   if test -n "${cxx}"; then
     disas_config "NANOMIPS"
+  disas_config "LOONGSON2F"
   fi
     ;;
     moxie*)
diff --git a/disas/Makefile.objs b/disas/Makefile.objs
index 3c1cdce026..0d5ee1e038 100644
--- a/disas/Makefile.objs
+++ b/disas/Makefile.objs
@@ -14,6 +14,7 @@ common-obj-$(CONFIG_I386_DIS) += i386.o
   common-obj-$(CONFIG_M68K_DIS) += m68k.o
   common-obj-$(CONFIG_MICROBLAZE_DIS) += microblaze.o
   common-obj-$(CONFIG_MIPS_DIS) += mips.o
+common-obj-$(CONFIG_LOONGSON2F_DIS) += loongson2f.o
   common-obj-$(CONFIG_NANOMIPS_DIS) += nanomips.o
   common-obj-$(CONFIG_NIOS2_DIS) += nios2.o
   common-obj-$(CONFIG_MOXIE_DIS) += moxie.o
diff --git a/disas/loongson2f.cpp b/disas/loongson2f.cpp
new file mode 100644
index 00..a2f32dcf93
--- /dev/null
+++ b/disas/loongson2f.cpp
@@ -0,0 +1,8134 @@

This file (and the header) lack a proper header comment. Which license
do you want to use for this code? Who wrote the initial implementation?

I will add proper license comments in v2. I will use the GPLv2+ license.
Thanks for the reminder. This is the initial implementation, and I am the
author.

Also, unless you've copied the code from another project that uses
C++, why did you use C++ here?

This disassembler is written as a generic disassembler that can be
integrated into other projects. In this case, it is integrated into
QEMU. One of the initial key requirements was that it use C++ as its
implementation language.

Ok, fair, but please mention that rationale in the commit description
when you send v2.


I will mention it in the commit description. Thanks for your suggestions.


Kind Regards,

Stefan



  Thanks,
   Thomas





Re: [PATCH 1/1] disas: mips: Add Loongson 2F disassembler

2020-07-03 Thread Stefan Brankovic


On 3.7.20. 09:59, Thomas Huth wrote:

On 02/07/2020 21.42, Stefan Brankovic wrote:

Add disassembler for Loongson 2F instruction set.

Testing is done by comparing QEMU disassembly output, obtained by
using the '-d in_asm' command line option, with the corresponding objdump output.

Signed-off-by: Stefan Brankovic 
---
  MAINTAINERS |    1 +
  configure   |    1 +
  disas/Makefile.objs |    1 +
  disas/loongson2f.cpp    | 8134 +++
  disas/loongson2f.h  | 2542 
  include/disas/dis-asm.h |    1 +
  include/exec/poison.h   |    1 +
  target/mips/cpu.c   |    4 +
  8 files changed, 10685 insertions(+)
  create mode 100644 disas/loongson2f.cpp
  create mode 100644 disas/loongson2f.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 3abe3faa4e..913ed2a6d3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -219,6 +219,7 @@ S: Maintained
  F: target/mips/
  F: default-configs/*mips*
  F: disas/*mips*
+F: disas/loongson*
  F: docs/system/cpu-models-mips.rst.inc
  F: hw/intc/mips_gic.c
  F: hw/mips/
diff --git a/configure b/configure
index 597e909b53..e163dac53e 100755
--- a/configure
+++ b/configure
@@ -8102,6 +8102,7 @@ for i in $ARCH $TARGET_BASE_ARCH ; do
  disas_config "MIPS"
  if test -n "${cxx}"; then
    disas_config "NANOMIPS"
+  disas_config "LOONGSON2F"
  fi
    ;;
    moxie*)
diff --git a/disas/Makefile.objs b/disas/Makefile.objs
index 3c1cdce026..0d5ee1e038 100644
--- a/disas/Makefile.objs
+++ b/disas/Makefile.objs
@@ -14,6 +14,7 @@ common-obj-$(CONFIG_I386_DIS) += i386.o
  common-obj-$(CONFIG_M68K_DIS) += m68k.o
  common-obj-$(CONFIG_MICROBLAZE_DIS) += microblaze.o
  common-obj-$(CONFIG_MIPS_DIS) += mips.o
+common-obj-$(CONFIG_LOONGSON2F_DIS) += loongson2f.o
  common-obj-$(CONFIG_NANOMIPS_DIS) += nanomips.o
  common-obj-$(CONFIG_NIOS2_DIS) += nios2.o
  common-obj-$(CONFIG_MOXIE_DIS) += moxie.o
diff --git a/disas/loongson2f.cpp b/disas/loongson2f.cpp
new file mode 100644
index 00..a2f32dcf93
--- /dev/null
+++ b/disas/loongson2f.cpp
@@ -0,0 +1,8134 @@


This file (and the header) lack a proper header comment. Which license 
do you want to use for this code? Who wrote the initial implementation?

I will add proper license comments in v2. I will use the GPLv2+ license.
Thanks for the reminder. This is the initial implementation, and I am the
author.


Also, unless you've copied the code from another project that uses 
C++, why did you use C++ here?

This disassembler is written as a generic disassembler that can be
integrated into other projects. In this case, it is integrated into
QEMU. One of the initial key requirements was that it use C++ as its
implementation language.
QEMU is C by default; we only allow C++ for some files that have been
taken from other C++ projects and need to be kept in sync from time to
time. So if you wrote this code from scratch, please use C instead.

There is no need to update this disassembler; it is a complete
solution, with the exception of possible bugs. However, I did extensive
testing, using objdump disassembly as a reference. Switching to C is
certainly possible; however, it would be time-consuming, and at the
moment I simply don't have enough resources to do this.

Kind Regards,
Stefan



 Thanks,
  Thomas



+extern "C" {
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "disas/dis-asm.h"
+}
+
+#include "loongson2f.h"
+
+int print_insn_loongson2f(bfd_vma addr, disassemble_info *info)
+{
+    bfd_byte buffer[4];
+    uint32_t insn32;
+    int status;
+    Decoder *decoder = new Decoder();
+
+    status = info->read_memory_func(addr, buffer, 4, info);
+    if (status != 0) {
+        info->memory_error_func(status, addr, info);
+        return -1;
+    }
+    if (info->endian == BFD_ENDIAN_BIG) {
+        insn32 = bfd_getb32(buffer);
+    } else {
+        insn32 = bfd_getl32(buffer);
+    }
+
+    status = decoder->decode32(info, insn32);
+
+    delete decoder;
+
+    return status == 0 ? -1 : 4;
+}
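
The entry point above plugs into the binutils-style disassemble_info interface.
The target/mips/cpu.c hunk is not quoted in this thread, so the following is
only an illustrative sketch of how such a hook is typically installed; the
function name and the exact guard placement are assumptions:

/* Illustrative sketch only -- the actual target/mips/cpu.c hunk is not quoted
 * in this thread, so the function name and guard placement are assumptions. */
static void example_disas_set_info(CPUState *cpu, disassemble_info *info)
{
#if defined(CONFIG_LOONGSON2F_DIS)
    /* Select the new Loongson 2F disassembler entry point. */
    info->print_insn = print_insn_loongson2f;
#endif
}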




[PATCH 0/1] Add Loongson 2F disassembler

2020-07-02 Thread Stefan Brankovic
This patch adds a disassembler for the Loongson 2F instruction set.

Stefan Brankovic (1):
  disas: mips: Add Loongson 2F disassembler

 MAINTAINERS |1 +
 configure   |1 +
 disas/Makefile.objs |1 +
 disas/loongson2f.cpp| 8134 +++
 disas/loongson2f.h  | 2542 
 include/disas/dis-asm.h |1 +
 include/exec/poison.h   |1 +
 target/mips/cpu.c   |4 +
 8 files changed, 10685 insertions(+)
 create mode 100644 disas/loongson2f.cpp
 create mode 100644 disas/loongson2f.h

-- 
2.17.1




Re: [PATCH 2/2] mailmap: Change email address of Stefan Brankovic

2020-06-02 Thread Stefan Brankovic



On 2.6.20. 10:52, Aleksandar Markovic wrote:

Stefan Brankovic wants to use his new email address for his future
work in QEMU.

CC: Stefan Brankovic 
Signed-off-by: Aleksandar Markovic 

Reviewed-by: Stefan Brankovic 

---
  .mailmap | 1 +
  1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index 9f2a3a55f9..84f36592ba 100644
--- a/.mailmap
+++ b/.mailmap
@@ -52,6 +52,7 @@ Paul Burton  
  Paul Burton  
  Paul Burton  
  Philippe Mathieu-Daudé  
+Stefan Brankovic  
  Yongbok Kim  
  
  # Also list preferred name forms where people have changed their




[PATCH v9 1/3] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-10-23 Thread Stefan Brankovic
Optimize emulation of the Altivec instructions vclzh (Vector Count Leading Zeros
Halfword) and vclzb (Vector Count Leading Zeros Byte). These instructions
count the number of leading zeros of each halfword/byte element in the source
register and place the result in the appropriate halfword/byte element of the
destination register.

Emulation of the vclzh instruction is implemented in two 'for' loops.
In each iteration of the outer 'for' loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable 'avr', and then counting
for every halfword element is performed using 'tcg_gen_clzi_i64'.
Since it counts leading zeros on a 64-bit length, the i-th halfword element has
to be moved to the highest 16 bits of variable 'tmp', or-ed with 'mask' (in order
to get all ones in the lowest 48 bits), then 'tcg_gen_clzi_i64' is performed and
its result is moved to the appropriate halfword element of variable 'result'.
This is done in the inner 'for' loop. After the operation is finished, the 'result'
is saved in the appropriate doubleword element of the destination register vD.
The same sequence of operations is then applied to the lower doubleword
element of vB.

Emulation of the vclzb instruction is implemented in two 'for' loops.
In each iteration of the outer 'for' loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable 'avr', and then counting
for every byte element is performed using 'tcg_gen_clzi_i64'. Since it counts
leading zeros on a 64-bit length, the i-th byte element has to be moved to the
highest 8 bits of variable 'tmp', or-ed with 'mask' (in order to get all ones
in the lowest 56 bits), then 'tcg_gen_clzi_i64' is performed and its result is
moved to the appropriate byte element of variable 'result'. This is done in the
inner 'for' loop. After the operation is finished, the 'result' is saved in the
appropriate doubleword element of the destination register vD. The same sequence
of operations is then applied to the lower doubleword element of vB.
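
For reference, the per-element semantics being translated can be sketched in
standalone C; this mirrors the clzb/clzh helper macros removed below, and the
__builtin_clz-based helper only stands in for QEMU's clz32 from qemu/host-utils.h:

#include <stdint.h>

/* Stand-in for QEMU's clz32(): number of leading zeros in a 32-bit value. */
static inline int clz32_sketch(uint32_t v)
{
    return v ? __builtin_clz(v) : 32;
}

/* Leading zeros of one byte element (result 0..8), as vclzb computes it. */
static inline int clzb_sketch(uint8_t v)
{
    return v ? clz32_sketch((uint32_t)v << 24) : 8;
}

/* Leading zeros of one halfword element (result 0..16), as vclzh computes it. */
static inline int clzh_sketch(uint16_t v)
{
    return v ? clz32_sketch((uint32_t)v << 16) : 16;
}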

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 132 +++-
 3 files changed, 130 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f843814..281e54f 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -308,8 +308,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 6d238b9..cd00f5e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1817,15 +1817,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 2472a52..8f68e41 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -751,6 +751,134 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in 'avr'. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in 'avr'. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using 'tcg_gen_clzi_i64'.
+ * Since it counts leading zeros on 64 bit lenght, we have to move
+ * ith byte element to highest 8 bits of 'tmp', or it with mask(so we
+ * get all ones in lowest 56 bits), then perform 'tcg_gen_clzi_i64' and
+ * move it's result in appropriate b

[PATCH v9 2/3] target/ppc: Optimize emulation of vpkpx instruction

2019-10-23 Thread Stefan Brankovic
Optimize the Altivec instruction vpkpx (Vector Pack Pixel).
It rearranges 8 pixels coded in the 6-5-5 pattern (4 from each source register)
into a contiguous array of bits in the destination register.

In each iteration of the outer loop, the 6-5-5 pack is performed for
2 pixels of each doubleword element of each source register. The first
thing done in the outer loop is choosing which doubleword element of
which register is used in the current iteration; it is placed in the 'avr'
variable. The next step is to perform the 6-5-5 pack of pixels on the 'avr'
variable in the inner 'for' loop (2 iterations, 1 for each pixel) and save the
result in the 'tmp' variable. At the end of the outer 'for' loop, the result
is merged into the variable called 'result' and saved in the appropriate
doubleword element of vD if the whole doubleword is finished (every second
iteration). The outer loop has 4 iterations.
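
For reference, the per-pixel operation being translated can be sketched in
plain C; this mirrors the helper_vpkpx body removed below and is only an
illustration, not part of the patch:

#include <stdint.h>

/* Pack one 32-bit pixel into the 16-bit 6-5-5 layout produced by vpkpx. */
static inline uint16_t pack_pixel_655(uint32_t e)
{
    return ((e >> 9) & 0xfc00) |   /* 6 bits */
           ((e >> 6) & 0x03e0) |   /* 5 bits */
           ((e >> 3) & 0x001f);    /* 5 bits */
}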

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 -
 target/ppc/translate/vmx-impl.inc.c | 93 -
 3 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 281e54f..b489b38 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,7 +258,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index cd00f5e..f910c11 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1262,27 +1262,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];
-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
-}
-}
-*r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 8f68e41..787008d 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -579,6 +579,97 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into contigous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j+

[PATCH v9 0/3] Optimize emulation of some Altivec instructions

2019-10-23 Thread Stefan Brankovic
Optimize emulation of twelve Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw, vclzd, vupkhpx and vupklpx.

This series builds on and complements recent work by Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising TCG
translation implementations for selected instructions rather than using helpers.
The selected instructions are most of the time idiosyncratic to the ppc platform,
so a relatively complex TCG translation (without a direct mapping to a host
instruction, which is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements
are significant in all cases.

V9:

Fixed comments and commit messages.

V8:

Addressed Aleksandar Markovic's suggestions.

V7:

Added optimization for vupkhpx and vupklpx instructions.

V6:

Rebased series to the latest qemu code.
Excluded all patches that are already accepted.

V5:

Fixed vpkpx bug and added it back in patch.
Fixed graphical distortions on OSX 10.3 and 10.4.
Removed conversion of vmrgh and vmrgl instructions to vector operations for
further investigation.

V4:

Addressed Richard Henderson's suggestions.
Removed vpkpx's optimization for further investigation on graphical distortions
it caused on OSX 10.2-4 guests.
Added opcodes for vector vmrgh(b|h|w) and vmrgl(b|h|w) in tcg.
Implemented vector vmrgh and vmrgl instructions for i386.
Converted vmrgh and vmrgl instructions to vector operations.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (3):
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

 target/ppc/helper.h |   5 -
 target/ppc/int_helper.c |  50 --
 target/ppc/translate/vmx-impl.inc.c | 307 +++-
 3 files changed, 302 insertions(+), 60 deletions(-)

-- 
2.7.4




[PATCH v9 3/3] target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

2019-10-23 Thread Stefan Brankovic
Optimize the Altivec instructions vupkhpx and vupklpx (Vector Unpack High/Low
Pixel). They unpack 4 pixels coded in the 1-5-5-5 pattern from the source register
into a contiguous array of bits in the destination register.

The 'trans_vupkpx' function implements emulation of both the vupkhpx and vupklpx
instructions, while its argument 'high' determines which instruction is
processed. The instructions are implemented in two 'for' loops. The outer 'for'
loop repeats the unpacking twice, since both doubleword elements of the
destination register are formed the same way. It also stores the result of
every iteration in a temporary variable 'result' that is later transferred
to the destination register. The inner 'for' loop unpacks the pixels in
two iterations. Each iteration takes 16 bits from the source register and
unpacks them into 32 bits of the destination register.
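
For reference, the per-pixel operation being translated can be sketched in
plain C; this mirrors the VUPKPX helper removed below and is only an
illustration, not part of the patch:

#include <stdint.h>

/* Unpack one 1-5-5-5 pixel into a 32-bit 8-8-8-8 value, as vupkhpx/vupklpx
 * do for each halfword element of the selected doubleword. */
static inline uint32_t unpack_pixel_1555(uint16_t e)
{
    uint8_t a = (e >> 15) ? 0xff : 0;
    uint8_t r = (e >> 10) & 0x1f;
    uint8_t g = (e >> 5) & 0x1f;
    uint8_t b = e & 0x1f;

    return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
}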

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  2 -
 target/ppc/int_helper.c | 20 -
 target/ppc/translate/vmx-impl.inc.c | 82 -
 3 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b489b38..fd06b56 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -233,8 +233,6 @@ DEF_HELPER_2(vextsh2d, void, avr, avr)
 DEF_HELPER_2(vextsw2d, void, avr, avr)
 DEF_HELPER_2(vnegw, void, avr, avr)
 DEF_HELPER_2(vnegd, void, avr, avr)
-DEF_HELPER_2(vupkhpx, void, avr, avr)
-DEF_HELPER_2(vupklpx, void, avr, avr)
 DEF_HELPER_2(vupkhsb, void, avr, avr)
 DEF_HELPER_2(vupkhsh, void, avr, avr)
 DEF_HELPER_2(vupkhsw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f910c11..9ee667d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1737,26 +1737,6 @@ void helper_vsum4ubs(CPUPPCState *env, ppc_avr_t *r, 
ppc_avr_t *a, ppc_avr_t *b)
 #define UPKHI 0
 #define UPKLO 1
 #endif
-#define VUPKPX(suffix, hi)  \
-void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)\
-{   \
-int i;  \
-ppc_avr_t result;   \
-\
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {  \
-uint16_t e = b->u16[hi ? i : i + 4];\
-uint8_t a = (e >> 15) ? 0xff : 0;   \
-uint8_t r = (e >> 10) & 0x1f;   \
-uint8_t g = (e >> 5) & 0x1f;\
-uint8_t b = e & 0x1f;   \
-\
-result.u32[i] = (a << 24) | (r << 16) | (g << 8) | b;   \
-}   \
-*r = result;\
-}
-VUPKPX(lpx, UPKLO)
-VUPKPX(hpx, UPKHI)
-#undef VUPKPX
 
 #define VUPK(suffix, unpacked, packee, hi)  \
 void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)\
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 787008d..c246880 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -670,6 +670,84 @@ static void trans_vpkpx(DisasContext *ctx)
 }
 
 /*
+ * vupkhpx VRT,VRB - Vector Unpack High Pixel
+ * vupklpx VRT,VRB - Vector Unpack Low Pixel
+ *
+ * Unpacks 4 pixels coded in 1-5-5-5 pattern from high/low doubleword element
+ * of source register into contigous array of bits in the destination register.
+ * Argument 'high' determines if high or low doubleword element of source
+ * register is processed.
+ */
+static void trans_vupkpx(DisasContext *ctx, bool high)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 8;
+int64_t mask3 = 0x1fULL << 16;
+int64_t mask4 = 0xffULL << 56;
+int i, j;
+
+if (high == true) {
+/* vupkhpx */
+get_avr64(avr, VB, true);
+} else {
+/* vupklpx */
+get_avr64(avr, VB, false);
+}
+
+tcg_gen_movi_i64(result, 0x0ULL);
+for (i = 0; i < 2; i++) {
+for (j = 0; j < 2; j++) {
+tcg_gen_shli_i64(tmp, avr, (j * 16));
+tcg_gen_andi_i64(tmp, tmp, mask1 << (j * 32));
+tcg_gen_or_i64(result, result, tmp);
+
+tcg_gen_shli_i64(tmp, avr, 3 + (j * 16));
+

[PATCH v8 3/3] target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

2019-10-23 Thread Stefan Brankovic
The 'trans_vupkpx' function implements emulation of both the vupkhpx and vupklpx
instructions, while its argument 'high' determines which instruction is
processed. The instructions are implemented in two 'for' loops. The outer 'for'
loop repeats the unpacking twice, since both doubleword elements of the
destination register are formed the same way. It also stores the result of
every iteration in a temporary variable 'result' that is later transferred
to the destination register. The inner 'for' loop unpacks the pixels in
two iterations. Each iteration takes 16 bits from the source register and
unpacks them into 32 bits of the destination register.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  2 -
 target/ppc/int_helper.c | 20 -
 target/ppc/translate/vmx-impl.inc.c | 82 -
 3 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b489b38..fd06b56 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -233,8 +233,6 @@ DEF_HELPER_2(vextsh2d, void, avr, avr)
 DEF_HELPER_2(vextsw2d, void, avr, avr)
 DEF_HELPER_2(vnegw, void, avr, avr)
 DEF_HELPER_2(vnegd, void, avr, avr)
-DEF_HELPER_2(vupkhpx, void, avr, avr)
-DEF_HELPER_2(vupklpx, void, avr, avr)
 DEF_HELPER_2(vupkhsb, void, avr, avr)
 DEF_HELPER_2(vupkhsh, void, avr, avr)
 DEF_HELPER_2(vupkhsw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f910c11..9ee667d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1737,26 +1737,6 @@ void helper_vsum4ubs(CPUPPCState *env, ppc_avr_t *r, 
ppc_avr_t *a, ppc_avr_t *b)
 #define UPKHI 0
 #define UPKLO 1
 #endif
-#define VUPKPX(suffix, hi)  \
-void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)\
-{   \
-int i;  \
-ppc_avr_t result;   \
-\
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {  \
-uint16_t e = b->u16[hi ? i : i + 4];\
-uint8_t a = (e >> 15) ? 0xff : 0;   \
-uint8_t r = (e >> 10) & 0x1f;   \
-uint8_t g = (e >> 5) & 0x1f;\
-uint8_t b = e & 0x1f;   \
-\
-result.u32[i] = (a << 24) | (r << 16) | (g << 8) | b;   \
-}   \
-*r = result;\
-}
-VUPKPX(lpx, UPKLO)
-VUPKPX(hpx, UPKHI)
-#undef VUPKPX
 
 #define VUPK(suffix, unpacked, packee, hi)  \
 void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)\
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index dcb6fd9..9d27d2d 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -670,6 +670,84 @@ static void trans_vpkpx(DisasContext *ctx)
 }
 
 /*
+ * vupkhpx VRT,VRB - Vector Unpack High Pixel
+ * vupklpx VRT,VRB - Vector Unpack Low Pixel
+ *
+ * Unpacks 4 pixels coded in 1-5-5-5 pattern from high/low doubleword element
+ * of source register into contigous array of bits in the destination register.
+ * Argument 'high' determines if high or low doubleword element of source
+ * register is processed.
+ */
+static void trans_vupkpx(DisasContext *ctx, bool high)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 8;
+int64_t mask3 = 0x1fULL << 16;
+int64_t mask4 = 0xffULL << 56;
+int i, j;
+
+if (high == true) {
+/* vupkhpx */
+get_avr64(avr, VB, true);
+} else {
+/* vupklpx */
+get_avr64(avr, VB, false);
+}
+
+tcg_gen_movi_i64(result, 0x0ULL);
+for (i = 0; i < 2; i++) {
+for (j = 0; j < 2; j++) {
+tcg_gen_shli_i64(tmp, avr, (j * 16));
+tcg_gen_andi_i64(tmp, tmp, mask1 << (j * 32));
+tcg_gen_or_i64(result, result, tmp);
+
+tcg_gen_shli_i64(tmp, avr, 3 + (j * 16));
+tcg_gen_andi_i64(tmp, tmp, mask2 << (j * 32));
+tcg_gen_or_i64(result, result, tmp);
+
+tcg_gen_shli_i64(tmp, avr, 6 + (j * 16));
+tcg_gen_andi_i64(tmp, tmp, ma

[PATCH v8 1/3] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-10-23 Thread Stefan Brankovic
Optimize emulation of the Altivec instructions vclzh (Vector Count Leading Zeros
Halfword) and vclzb (Vector Count Leading Zeros Byte). These instructions
count the number of leading zeros of each halfword/byte element in the source
register and place the result in the appropriate halfword/byte element of the
destination register.

Emulation of the vclzh instruction is implemented in two 'for' loops.
In each iteration of the outer 'for' loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable 'avr', and then counting
for every halfword element is performed using 'tcg_gen_clzi_i64'.
Since it counts leading zeros on a 64-bit length, the i-th halfword element has
to be moved to the highest 16 bits of variable 'tmp', or-ed with 'mask' (in order
to get all ones in the lowest 48 bits), then 'tcg_gen_clzi_i64' is performed and
its result is moved to the appropriate halfword element of variable 'result'.
This is done in the inner 'for' loop. After the operation is finished, the 'result'
is saved in the appropriate doubleword element of the destination register vD.
The same sequence of operations is then applied to the lower doubleword
element of vB.

Emulation of the vclzb instruction is implemented in two 'for' loops.
In each iteration of the outer 'for' loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable 'avr', and then counting
for every byte element is performed using 'tcg_gen_clzi_i64'. Since it counts
leading zeros on a 64-bit length, the i-th byte element has to be moved to the
highest 8 bits of variable 'tmp', or-ed with 'mask' (in order to get all ones
in the lowest 56 bits), then 'tcg_gen_clzi_i64' is performed and its result is
moved to the appropriate byte element of variable 'result'. This is done in the
inner 'for' loop. After the operation is finished, the 'result' is saved in the
appropriate doubleword element of the destination register vD. The same sequence
of operations is then applied to the lower doubleword element of vB.
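
A sketch of one inner-loop step described above, using the TCG ops named in the
commit message; the operand ordering and the mask constant are inferred, so
this is illustrative only and not the actual patch hunk:

/* One inner-loop step for byte element j of 'avr' (j = 0 is the lowest byte).
 * Names follow the commit message; this is a sketch, not the patch hunk. */
tcg_gen_shli_i64(tmp, avr, 56 - (8 * j));           /* byte j -> bits 63..56    */
tcg_gen_ori_i64(tmp, tmp, 0x00ffffffffffffffULL);   /* force ones in bits 55..0 */
tcg_gen_clzi_i64(tmp, tmp, 64);                     /* leading zeros, 0..8      */
tcg_gen_shli_i64(tmp, tmp, 8 * j);                  /* place count at byte j    */
tcg_gen_or_i64(result, result, tmp);                /* accumulate into 'result' */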

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 132 +++-
 3 files changed, 130 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f843814..281e54f 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -308,8 +308,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 6d238b9..cd00f5e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1817,15 +1817,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 2472a52..3ad425a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -751,6 +751,134 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in 'avr'. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in 'avr'. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using 'tcg_gen_clzi_i64'.
+ * Since it counts leading zeros on 64 bit lenght, we have to move
+ * ith byte element to highest 8 bits of 'tmp', or it with mask(so we
+ * get all ones in lowest 56 bits), then perform 'tcg_gen_clzi_i64' and
+ * move it's result in appropriate b

[PATCH v8 0/3] Optimize emulation of some Altivec instructions

2019-10-23 Thread Stefan Brankovic
Optimize emulation of twelve Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw, vclzd, vupkhpx and vupklpx.

This series builds on and complements recent work by Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising TCG
translation implementations for selected instructions rather than using helpers.
The selected instructions are most of the time idiosyncratic to the ppc platform,
so a relatively complex TCG translation (without a direct mapping to a host
instruction, which is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements
are significant in all cases.

V8:

Addressed Aleksandar Markovic's suggestions.

V7:

Added optimization for vupkhpx and vupklpx instructions.

V6:

Rebased series to the latest qemu code.
Excluded all patches that are already accepted.

V5:

Fixed vpkpx bug and added it back in patch.
Fixed graphical distortions on OSX 10.3 and 10.4.
Removed conversion of vmrgh and vmrgl instructions to vector operations for
further investigation.

V4:

Addressed Richard Henderson's suggestions.
Removed vpkpx's optimization for further investigation on graphical distortions
it caused on OSX 10.2-4 guests.
Added opcodes for vector vmrgh(b|h|w) and vmrgl(b|h|w) in tcg.
Implemented vector vmrgh and vmrgl instructions for i386.
Converted vmrgh and vmrgl instructions to vector operations.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (3):
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

 target/ppc/helper.h |   5 -
 target/ppc/int_helper.c |  50 --
 target/ppc/translate/vmx-impl.inc.c | 307 +++-
 3 files changed, 302 insertions(+), 60 deletions(-)

-- 
2.7.4




[PATCH v8 2/3] target/ppc: Optimize emulation of vpkpx instruction

2019-10-23 Thread Stefan Brankovic
Optimize the Altivec instruction vpkpx (Vector Pack Pixel).
It rearranges 8 pixels coded in the 6-5-5 pattern (4 from each source register)
into a contiguous array of bits in the destination register.

In each iteration of the outer loop, the 6-5-5 pack is performed for
2 pixels of each doubleword element of each source register. The first
thing done in the outer loop is choosing which doubleword element of
which register is used in the current iteration; it is placed in the 'avr'
variable. The next step is to perform the 6-5-5 pack of pixels on the 'avr'
variable in the inner 'for' loop (2 iterations, 1 for each pixel) and save the
result in the 'tmp' variable. At the end of the outer 'for' loop, the result
is merged into the variable called 'result' and saved in the appropriate
doubleword element of vD if the whole doubleword is finished (every second
iteration). The outer loop has 4 iterations.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 -
 target/ppc/translate/vmx-impl.inc.c | 93 -
 3 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 281e54f..b489b38 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,7 +258,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index cd00f5e..f910c11 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1262,27 +1262,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];
-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
-}
-}
-*r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 3ad425a..dcb6fd9 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -579,6 +579,97 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into contigous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j+

Re: [PATCH v7 3/3] target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

2019-10-21 Thread Stefan Brankovic

Hello Aleksandar,

Thank you for taking a look at this patch. I will start working on
version 8 of the patch, where I will address all your suggestions.


Kind Regards,

Stefan

On 19.10.19. 22:40, Aleksandar Markovic wrote:



On Thursday, October 17, 2019, Stefan Brankovic
<stefan.branko...@rt-rk.com> wrote:


The 'trans_vupkpx' function implements both the vupkhpx and vupklpx
instructions, with the argument 'high' determining which instruction is
processed. The instructions are implemented in two 'for' loops. The outer
'for' loop repeats the unpacking twice, since both doubleword elements of
the destination register are formed the same way. It also stores the result
of every iteration in a temporary register that is later transferred to the
destination register. The inner 'for' loop unpacks the pixels and forms the
resulting doubleword 32 bits at a time.

Signed-off-by: Stefan Brankovic <stefan.branko...@rt-rk.com>
---
 target/ppc/helper.h                 |  2 -
 target/ppc/int_helper.c             | 20 
 target/ppc/translate/vmx-impl.inc.c | 91
-
 3 files changed, 89 insertions(+), 24 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b489b38..fd06b56 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -233,8 +233,6 @@ DEF_HELPER_2(vextsh2d, void, avr, avr)
 DEF_HELPER_2(vextsw2d, void, avr, avr)
 DEF_HELPER_2(vnegw, void, avr, avr)
 DEF_HELPER_2(vnegd, void, avr, avr)
-DEF_HELPER_2(vupkhpx, void, avr, avr)
-DEF_HELPER_2(vupklpx, void, avr, avr)
 DEF_HELPER_2(vupkhsb, void, avr, avr)
 DEF_HELPER_2(vupkhsh, void, avr, avr)
 DEF_HELPER_2(vupkhsw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f910c11..9ee667d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1737,26 +1737,6 @@ void helper_vsum4ubs(CPUPPCState *env,
ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 #define UPKHI 0
 #define UPKLO 1
 #endif
-#define VUPKPX(suffix, hi)         \
-    void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)         \
-    {          \
-        int i;         \
-        ppc_avr_t result;          \
-         \
-        for (i = 0; i < ARRAY_SIZE(r->u32); i++) {               \
-            uint16_t e = b->u16[hi ? i : i + 4];             \
-            uint8_t a = (e >> 15) ? 0xff : 0;                \
-            uint8_t r = (e >> 10) & 0x1f;                    \
-            uint8_t g = (e >> 5) & 0x1f;                   \
-            uint8_t b = e & 0x1f;              \
-         \
-            result.u32[i] = (a << 24) | (r << 16) | (g << 8) |
b;       \
-        }          \
-        *r = result;         \
-    }
-VUPKPX(lpx, UPKLO)
-VUPKPX(hpx, UPKHI)
-#undef VUPKPX

 #define VUPK(suffix, unpacked, packee, hi)         \
     void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)         \
diff --git a/target/ppc/translate/vmx-impl.inc.c
b/target/ppc/translate/vmx-impl.inc.c
index 3550ffa..09d80d6 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -1031,6 +1031,95 @@ static void trans_vclzd(DisasContext *ctx)
     tcg_temp_free_i64(avr);
 }

+/*
+ * vupkhpx VRT,VRB - Vector Unpack High Pixel
+ * vupklpx VRT,VRB - Vector Unpack Low Pixel
+ *
+ * Unpacks 4 pixels coded in 1-5-5-5 pattern from high/low
doubleword element
+ * of source register into contigous array of bits in the
destination register.
+ * Argument 'high' determines if high or low doubleword element
of source
+ * register is processed.
+ */
+static void trans_vupkpx(DisasContext *ctx, int high)


The last argument should be boolean.

+{
+    int VT = rD(ctx->opcode);
+    int VB = rB(ctx->opcode);
+    TCGv_i64 tmp = tcg_temp_new_i64();
+    TCGv_i64 avr = tcg_temp_new_i64();
+    TCGv_i64 result = tcg_temp_new_i64();
+    TCGv_i64 result1 = tcg_temp_new_i64();
+    TCGv_i64 result2 = tcg_temp_new_i64();
+    int64_t mask1 = 0x1fULL;
+    int64_t mask2 = 0x1fULL << 8;
+    int64_t mask3 = 0x1fULL << 16;
+    int64_t mask4 = 0xffULL << 56;
+    int i, j;
+
+    if (high == 1) {
+        get_avr64(avr, VB, true);
+    } else {
+        get_avr64(avr, VB, false);
+    }
+
+    tcg_gen_movi_i64(result, 0x0ULL);
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 2; j++) {
+            tcg_gen_shli_i64(tmp, avr, (j * 16));
+            tcg_gen_andi_i64(tmp, tmp, mask1 << (j * 32));
+            tcg_gen_or_i64(result, result, tmp);
+
+         

[PATCH v7 3/3] target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

2019-10-17 Thread Stefan Brankovic
The 'trans_vupkpx' function implements both the vupkhpx and vupklpx instructions,
with the argument 'high' determining which instruction is processed. The
instructions are implemented in two 'for' loops. The outer 'for' loop repeats the
unpacking twice, since both doubleword elements of the destination register are
formed the same way. It also stores the result of every iteration in a temporary
register that is later transferred to the destination register. The inner 'for'
loop unpacks the pixels and forms the resulting doubleword 32 bits at a time.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  2 -
 target/ppc/int_helper.c | 20 
 target/ppc/translate/vmx-impl.inc.c | 91 -
 3 files changed, 89 insertions(+), 24 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b489b38..fd06b56 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -233,8 +233,6 @@ DEF_HELPER_2(vextsh2d, void, avr, avr)
 DEF_HELPER_2(vextsw2d, void, avr, avr)
 DEF_HELPER_2(vnegw, void, avr, avr)
 DEF_HELPER_2(vnegd, void, avr, avr)
-DEF_HELPER_2(vupkhpx, void, avr, avr)
-DEF_HELPER_2(vupklpx, void, avr, avr)
 DEF_HELPER_2(vupkhsb, void, avr, avr)
 DEF_HELPER_2(vupkhsh, void, avr, avr)
 DEF_HELPER_2(vupkhsw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f910c11..9ee667d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1737,26 +1737,6 @@ void helper_vsum4ubs(CPUPPCState *env, ppc_avr_t *r, 
ppc_avr_t *a, ppc_avr_t *b)
 #define UPKHI 0
 #define UPKLO 1
 #endif
-#define VUPKPX(suffix, hi)  \
-void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)\
-{   \
-int i;  \
-ppc_avr_t result;   \
-\
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {  \
-uint16_t e = b->u16[hi ? i : i + 4];\
-uint8_t a = (e >> 15) ? 0xff : 0;   \
-uint8_t r = (e >> 10) & 0x1f;   \
-uint8_t g = (e >> 5) & 0x1f;\
-uint8_t b = e & 0x1f;   \
-\
-result.u32[i] = (a << 24) | (r << 16) | (g << 8) | b;   \
-}   \
-*r = result;\
-}
-VUPKPX(lpx, UPKLO)
-VUPKPX(hpx, UPKHI)
-#undef VUPKPX
 
 #define VUPK(suffix, unpacked, packee, hi)  \
 void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)\
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 3550ffa..09d80d6 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -1031,6 +1031,95 @@ static void trans_vclzd(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
+/*
+ * vupkhpx VRT,VRB - Vector Unpack High Pixel
+ * vupklpx VRT,VRB - Vector Unpack Low Pixel
+ *
+ * Unpacks 4 pixels coded in 1-5-5-5 pattern from high/low doubleword element
+ * of source register into contigous array of bits in the destination register.
+ * Argument 'high' determines if high or low doubleword element of source
+ * register is processed.
+ */
+static void trans_vupkpx(DisasContext *ctx, int high)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 8;
+int64_t mask3 = 0x1fULL << 16;
+int64_t mask4 = 0xffULL << 56;
+int i, j;
+
+if (high == 1) {
+get_avr64(avr, VB, true);
+} else {
+get_avr64(avr, VB, false);
+}
+
+tcg_gen_movi_i64(result, 0x0ULL);
+for (i = 0; i < 2; i++) {
+for (j = 0; j < 2; j++) {
+tcg_gen_shli_i64(tmp, avr, (j * 16));
+tcg_gen_andi_i64(tmp, tmp, mask1 << (j * 32));
+tcg_gen_or_i64(result, result, tmp);
+
+tcg_gen_shli_i64(tmp, avr, 3 + (j * 16));
+tcg_gen_andi_i64(tmp, tmp, mask2 << (j * 32));
+tcg_gen_or_i64(result, result, tmp);
+
+tcg_gen_shli_i64(tmp, avr, 6 + (j * 16));
+tcg_gen_andi_i64(tmp, tmp, mask3 << (j * 32));
+tcg_gen_or_i64(result, result, tmp);
+
+tcg_g

[PATCH v7 1/3] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-10-17 Thread Stefan Brankovic
Optimize the Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in the source register and places the result in the appropriate halfword
element of the destination register.

In each iteration of the outer for loop, the count operation is performed on one
doubleword element of source register vB. In the first iteration, the higher
doubleword element of vB is placed in variable avr, and then counting
for every halfword element is performed using tcg_gen_clzi_i64.
Since it counts leading zeros on a 64-bit length, the i-th halfword element has
to be moved to the highest 16 bits of tmp, or-ed with mask (in order to get all
ones in the lowest 48 bits), then tcg_gen_clzi_i64 is performed and its result
is moved to the appropriate halfword element of result. This is done in the
inner for loop. After the operation is finished, the result is saved in the
appropriate doubleword element of destination register vD. The same sequence of
operations is then applied to the lower doubleword element of vB.

Optimize the Altivec instruction vclzb (Vector Count Leading Zeros Byte).
This instruction counts the number of leading zeros of each byte element
in the source register and places the result in the appropriate byte element
of the destination register.

In each iteration of the outer for loop, the counting operation is done on one
doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and then counting
for every byte element is performed using tcg_gen_clzi_i64. Since it counts
leading zeros on a 64-bit length, the i-th byte element has to be moved to the
highest 8 bits of variable tmp, or-ed with mask (in order to get all ones in the
lowest 56 bits), then tcg_gen_clzi_i64 is performed and its result is moved to
the appropriate byte element of result. This is done in the inner for loop.
After the operation is finished, the result is saved in the appropriate
doubleword element of the destination register vD. The same sequence of
operations is then applied to the lower doubleword element of vB.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 136 +++-
 3 files changed, 134 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f843814..281e54f 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -308,8 +308,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 6d238b9..cd00f5e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1817,15 +1817,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 2472a52..a428ef3 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -751,6 +751,138 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in 'avr'. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in 'avr'. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using 'tcg_gen_clzi_i64'.
+ * Since it counts leading zeros on 64 bit lenght, we have to move
+ * ith byte element to highest 8 bits of 'tmp', or it with mask(so we
+ * get all ones in lowest 56 bits), then perform 'tcg_gen_clzi_i64' and
+ * move it's result in appropriate b

[PATCH v7 2/3] target/ppc: Optimize emulation of vpkpx instruction

2019-10-17 Thread Stefan Brankovic
Optimize the Altivec instruction vpkpx (Vector Pack Pixel).
It rearranges 8 pixels coded in the 6-5-5 pattern (4 from each source register)
into a contiguous array of bits in the destination register.

In each iteration of the outer loop, the 6-5-5 pack is performed for
2 pixels of each doubleword element of each source register. The first
thing done in the outer loop is choosing which doubleword element of
which register is used in the current iteration; it is placed in the avr
variable. The next step is to perform the 6-5-5 pack of pixels on the avr
variable in the inner for loop (2 iterations, 1 for each pixel) and save the
result in the tmp variable. At the end of the outer for loop, the result is
merged into the variable called result and saved in the appropriate doubleword
element of vD if the whole doubleword is finished (every second iteration).
The outer loop has 4 iterations.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 
 target/ppc/translate/vmx-impl.inc.c | 99 -
 3 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 281e54f..b489b38 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,7 +258,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index cd00f5e..f910c11 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1262,27 +1262,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];
-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
-}
-}
-*r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index a428ef3..3550ffa 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -579,6 +579,103 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into contigous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j+

[PATCH v7 0/3] target/ppc: Optimize emulation of some Altivec instructions

2019-10-17 Thread Stefan Brankovic
Optimize emulation of ten Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw, vclzd, vupkhpx and vupklpx.

This series builds on and complements recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising a
TCG translation implementation for selected instructions rather than using
helpers. The selected instructions are mostly idiosyncratic to the ppc
platform, so a relatively complex TCG translation (since a direct mapping to a
host instruction is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements
are significant in all cases.

V7:

Added optimization for vupkhpx and vupklpx instructions.

V6:

Rebased series to the latest qemu code.
Excluded all patches that are already accepted.

V5:

Fixed vpkpx bug and added it back in patch.
Fixed graphical distortions on OSX 10.3 and 10.4.
Removed conversion of vmrgh and vmrgl instructions to vector operations for
further investigation.

V4:

Addressed Richard Henderson's suggestions.
Removed vpkpx's optimization for further investigation on graphical distortions
it caused on OSX 10.2-4 guests.
Added opcodes for vector vmrgh(b|h|w) and vmrgl(b|h|w) in tcg.
Implemented vector vmrgh and vmrgl instructions for i386.
Converted vmrgh and vmrgl instructions to vector operations.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (3):
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

 target/ppc/helper.h |   5 -
 target/ppc/int_helper.c |  50 --
 target/ppc/translate/vmx-impl.inc.c | 326 +++-
 3 files changed, 321 insertions(+), 60 deletions(-)

-- 
2.7.4




Re: [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-10-16 Thread Stefan Brankovic



On 29.8.19. 17:31, Richard Henderson wrote:

On 8/29/19 6:34 AM, Stefan Brankovic wrote:

Then I run my performance tests and I got following results(test is calling
vpkpx 10 times):

1) Current helper implementation: ~ 157 ms

2) helper implementation you suggested: ~94 ms

3) tcg implementation: ~75 ms

I assume you tested in a loop.  If you have just the one expansion, you'll not
see the penalty for the icache expansion.  To show the other extreme, you'd
want to test as separate sequential invocations.

Yes, testing is done in a loop.


That said, I'd be more interested in a real test case that isn't just calling
one instruction over and over.  Is there a real test case that shows vpkpx in
the top 25 of the profile?  With more than 0.5% of runtime?


r~


I made an experiment where I started Mac OS X 10.4 in QEMU system mode
and found out that the vpkpx instruction is widely used to display
different graphical elements. With that in mind, this performance
improvement is of great importance.


Also, the vpkpx instruction is often used in a loop, to process a large
number of pixels at once. That's why testing the performance of this
instruction in a loop should give a good insight into how it performs overall.

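For illustration only (this is not taken from the actual guest workload, and
the function name here is made up), such guest code typically boils down to a
loop of vec_packpx() intrinsics from <altivec.h>, each of which the compiler
turns into a single vpkpx:

#include <altivec.h>

/* Each iteration packs 8 pixels (two vectors of four 32-bit pixels) into one
 * vector of eight 16-bit pixels; every vec_packpx() maps to one vpkpx. */
void pack_row(const vector unsigned int *src, vector pixel *dst, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        dst[i] = vec_packpx(src[2 * i], src[2 * i + 1]);
    }
}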

Kind Regards,

Stefan




[PATCH v2] target/ppc: Fix for optimized vsl/vsr instructions

2019-10-04 Thread Stefan Brankovic
This patch fixes a bug in the optimized vsl/vsr instructions reported by Mark
Cave-Ayland and Paul Clarke. Sorry for not responding earlier, I was absent
for the last couple of days. I also integrated some suggestions made by
Aleksandar Markovic. The new solution is tested and still shows a noticeable
performance improvement compared to the old helper implementation.

V1 of this patch was not sent to qemu-devel, so I am now sending V2 to the
appropriate email addresses.

Stefan Brankovic (1):
  target/ppc: Fix for optimized vsl/vsr instructions

 target/ppc/translate/vmx-impl.inc.c | 84 ++---
 1 file changed, 40 insertions(+), 44 deletions(-)

-- 
2.7.4




[PATCH v2] target/ppc: Fix for optimized vsl/vsr instructions

2019-10-04 Thread Stefan Brankovic
In the previous implementation, an invocation of a TCG shift function could
request a shift of a TCG variable by 64 bits when variable 'sh' is 0, which is
not supported in TCG (values can be shifted by 0 to 63 bits). This patch fixes
this by using two separate invocations of TCG shift functions, each with a
maximum shift amount of 32.

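For reference, a rough scalar sketch of the idea (plain C for illustration,
not the generated TCG ops; 'lo' stands for the lower doubleword of vA and
'sh' for the shift amount in [0, 7]):

uint64_t carry;

/* Broken when sh == 0: shifting a 64-bit value by 64 bits is undefined. */
carry = lo >> (64 - sh);

/* Done instead: two shifts, each by at most 32 bits; for sh == 0 this
 * simply yields 0, which is the correct carry. */
carry = (lo >> 32) >> (32 - sh);
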
The name of variable 'shifted' is changed to 'carry' so the variable naming
is similar to the old helper implementation.

Variables 'avrA' and 'avrB' are replaced with variable 'avr'.

Fixes: 4e6d0920e7547e6af4bbac5ffe9adfe6ea621822
Reported-by: Paul Clarke 
Reported-by: Mark Cave-Ayland 
Suggested-by: Aleksandar Markovic 
Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 84 ++---
 1 file changed, 40 insertions(+), 44 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 2472a52..81d5a7a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -590,40 +590,38 @@ static void trans_vsl(DisasContext *ctx)
 int VT = rD(ctx->opcode);
 int VA = rA(ctx->opcode);
 int VB = rB(ctx->opcode);
-TCGv_i64 avrA = tcg_temp_new_i64();
-TCGv_i64 avrB = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 TCGv_i64 sh = tcg_temp_new_i64();
-TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 carry = tcg_temp_new_i64();
 TCGv_i64 tmp = tcg_temp_new_i64();
 
-/* Place bits 125-127 of vB in sh. */
-get_avr64(avrB, VB, false);
-tcg_gen_andi_i64(sh, avrB, 0x07ULL);
+/* Place bits 125-127 of vB in 'sh'. */
+get_avr64(avr, VB, false);
+tcg_gen_andi_i64(sh, avr, 0x07ULL);
 
 /*
- * Save highest sh bits of lower doubleword element of vA in variable
- * shifted and perform shift on lower doubleword.
+ * Save highest 'sh' bits of lower doubleword element of vA in variable
+ * 'carry' and perform shift on lower doubleword.
  */
-get_avr64(avrA, VA, false);
-tcg_gen_subfi_i64(tmp, 64, sh);
-tcg_gen_shr_i64(shifted, avrA, tmp);
-tcg_gen_andi_i64(shifted, shifted, 0x7fULL);
-tcg_gen_shl_i64(avrA, avrA, sh);
-set_avr64(VT, avrA, false);
+get_avr64(avr, VA, false);
+tcg_gen_subfi_i64(tmp, 32, sh);
+tcg_gen_shri_i64(carry, avr, 32);
+tcg_gen_shr_i64(carry, carry, tmp);
+tcg_gen_shl_i64(avr, avr, sh);
+set_avr64(VT, avr, false);
 
 /*
  * Perform shift on higher doubleword element of vA and replace lowest
- * sh bits with shifted.
+ * 'sh' bits with 'carry'.
  */
-get_avr64(avrA, VA, true);
-tcg_gen_shl_i64(avrA, avrA, sh);
-tcg_gen_or_i64(avrA, avrA, shifted);
-set_avr64(VT, avrA, true);
+get_avr64(avr, VA, true);
+tcg_gen_shl_i64(avr, avr, sh);
+tcg_gen_or_i64(avr, avr, carry);
+set_avr64(VT, avr, true);
 
-tcg_temp_free_i64(avrA);
-tcg_temp_free_i64(avrB);
+tcg_temp_free_i64(avr);
 tcg_temp_free_i64(sh);
-tcg_temp_free_i64(shifted);
+tcg_temp_free_i64(carry);
 tcg_temp_free_i64(tmp);
 }
 
@@ -639,39 +637,37 @@ static void trans_vsr(DisasContext *ctx)
 int VT = rD(ctx->opcode);
 int VA = rA(ctx->opcode);
 int VB = rB(ctx->opcode);
-TCGv_i64 avrA = tcg_temp_new_i64();
-TCGv_i64 avrB = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 TCGv_i64 sh = tcg_temp_new_i64();
-TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 carry = tcg_temp_new_i64();
 TCGv_i64 tmp = tcg_temp_new_i64();
 
-/* Place bits 125-127 of vB in sh. */
-get_avr64(avrB, VB, false);
-tcg_gen_andi_i64(sh, avrB, 0x07ULL);
+/* Place bits 125-127 of vB in 'sh'. */
+get_avr64(avr, VB, false);
+tcg_gen_andi_i64(sh, avr, 0x07ULL);
 
 /*
- * Save lowest sh bits of higher doubleword element of vA in variable
- * shifted and perform shift on higher doubleword.
+ * Save lowest 'sh' bits of higher doubleword element of vA in variable
+ * 'carry' and perform shift on higher doubleword.
  */
-get_avr64(avrA, VA, true);
-tcg_gen_subfi_i64(tmp, 64, sh);
-tcg_gen_shl_i64(shifted, avrA, tmp);
-tcg_gen_andi_i64(shifted, shifted, 0xfe00ULL);
-tcg_gen_shr_i64(avrA, avrA, sh);
-set_avr64(VT, avrA, true);
+get_avr64(avr, VA, true);
+tcg_gen_subfi_i64(tmp, 32, sh);
+tcg_gen_shli_i64(carry, avr, 32);
+tcg_gen_shl_i64(carry, carry, tmp);
+tcg_gen_shr_i64(avr, avr, sh);
+set_avr64(VT, avr, true);
 /*
  * Perform shift on lower doubleword element of vA and replace highest
- * sh bits with shifted.
+ * 'sh' bits with 'carry'.
  */
-get_avr64(avrA, VA, false);
-tcg_gen_shr_i64(avrA, avrA, sh);
-tcg_gen_or_i64(avrA, avrA, shifted);
-set_avr64(VT, avrA, false);
+get_avr64(avr, VA, false);
+tcg_gen_shr_i64(avr, avr, sh);
+tcg_gen_or_i64(avr, avr, carry);
+set_avr64(VT, avr, false);
 
-tcg_temp_free_i64(avrA);
-tcg_

Re: target/ppc: bug in optimised vsl/vsr implementation?

2019-10-03 Thread Stefan Brankovic
Please take a look at the following patch 
https://lists.nongnu.org/archive/html/qemu-ppc/2019-10/msg00133.html and 
let me know if the problem is solved.


On 2.10.19. 16:08, Stefan Brankovic wrote:

Hi Mark,

Thank you for reporting this bug. I was away from the office for a couple
of days, which is why I am answering a bit late, sorry about that. I will
start working on a solution and try to fix this problem in the next couple
of days.


On 1.10.19. 20:24, Mark Cave-Ayland wrote:

On 28/09/2019 18:45, Aleksandar Markovic wrote:

Hi Aleksandar,

Thanks for taking a look at this!


Mark and Paul (and Stefan),

Thanks for spotting this and pinpointing the culprit commit. I guess 
Stefan is going
to respond soon, but, in the meantime, I took a look at the commit 
in question:


https://github.com/qemu/qemu/commit/4e6d0920e7547e6af4bbac5ffe9adfe6ea621822 



I don't have at the moment any dev/test environment handy, but I did 
manual
inspection of the code, and here is what I found (in order of 
importance, perceived

by me):

1. The code will not work correctly if the shift amount (variable
'sh') is 0. This
is because, in that case, one of succeeding invocations of TCG shift 
functions will
be required to shift a 64-bit TCG variable by 64 bits, and the 
result of such TCG

operation is undefined (shift amount must be 63 or less) - see
https://github.com/qemu/qemu/blob/master/tcg/README.
Yes I think you're right here - the old helper got around this by 
doing an explicit
copy from a to r if the shift value is zero. In fact the case that 
Paul reported is

exactly this:

    vsl VRT, VRA, VRB

=> 0x16e0 : vsl v0,v0,v1
(gdb) p $vr0.uint128
$21 = 0x10111213141516172021222324252650
(gdb) p $vr1.uint128
$22 = 0x0
(gdb) stepi
0x16e4 in vec_slq ()
1: x/i $pc
=> 0x16e4 : xxlor vs0,vs32,vs32
(gdb) p $vr0.uint128
$23 = 0x10111213141516172021222324252650

I guess the solution is to check for sh == 0 and, if this is the case,
execute a copy instead.

I agree with you. This will be changed in upcoming patch.


2. Variable naming is better in the old helper than in the new 
translator. In that
light, I would advise Stefan to change 'sh' to 'shift', and 
'shifted' to 'carry'.
It looks like the name "sh" comes from the ISA documentation, so 
whilst it's a little
tricky to compare with the previous implementation it does make sense 
when comparing
with the algorithm shown there. Note: this implementation also drops 
the check for
each byte of VRB having the same shift value - should we care about 
this?


"sh" is taken from the ISA documentation, so I would leave that as it 
is now, but I can change some other variable names to be consistent 
with previous implementation (e.g. "shifted" -> "carry").


I don't think that we should check each byte of VRB, because we care 
only about "defined" behavior. If shift values doesn't match, result 
is "undefined" so it doesn't matter what is inside resulting register.



3. Lines

tcg_gen_andi_i64(shifted, shifted, 0x7fULL);

and

tcg_gen_andi_i64(shifted, shifted, 0xfe00ULL);

appear to be spurious (albeit in a harmless way). Therefore, they
should be deleted,

or, alternatively, a justification for them should be provided.
I'm not sure why they are needed either - there's certainly no 
mention of it in the

ISA documentation. Stefan?

This will be removed in upcoming patch.


4. In the commit message, variable names were used without quotation marks,
resulting in puzzling and unclear wording.

5. (a question for Mark) After all recent changes, does 
get_avr64(..., ..., true)
always (for any endian configuration) return the "high" half of an 
Altivec register,

and get_avr64(..., ..., false) the "low" one?
Yes - the new functions always return the MSB (high) and LSB (low) 
correctly

regardless of host endian.

Given all these circumstances, perhaps the most reasonable solution 
would be to
revert the commit in question, and allow Stefan enough dev and test 
time to hopefully

submit a new, better, version later on.
Given that it has been broken for 3 months now, I don't think we're 
in any major rush
to revert ASAP. I'd prefer to give Stefan a bit more time first since 
he does report

some substantial speed improvements from these new implementations.


ATB,

Mark.


Best Regards,

Stefan





Re: target/ppc: bug in optimised vsl/vsr implementation?

2019-10-02 Thread Stefan Brankovic

Hi Mark,

Thank you for reporting this bug. I was away from the office for a couple
of days, which is why I am answering a bit late, sorry about that. I will
start working on a solution and try to fix this problem in the next couple
of days.


On 1.10.19. 20:24, Mark Cave-Ayland wrote:

On 28/09/2019 18:45, Aleksandar Markovic wrote:

Hi Aleksandar,

Thanks for taking a look at this!


Mark and Paul (and Stefan),

Thanks for spotting this and pinpointing the culprit commit. I guess Stefan is 
going
to respond soon, but, in the meantime, I took a look at the commit in question:

https://github.com/qemu/qemu/commit/4e6d0920e7547e6af4bbac5ffe9adfe6ea621822

I don't have at the moment any dev/test environment handy, but I did manual
inspection of the code, and here is what I found (in order of importance, 
perceived
by me):

1. The code will not work correctly if the shift amount (variable 'sh') is 0.
This
is because, in that case, one of succeeding invocations of TCG shift functions 
will
be required to shift a 64-bit TCG variable by 64 bits, and the result of such 
TCG
operation is undefined (shift amount must be 63 or less) - see
https://github.com/qemu/qemu/blob/master/tcg/README.

Yes I think you're right here - the old helper got around this by doing an 
explicit
copy from a to r if the shift value is zero. In fact the case that Paul 
reported is
exactly this:

vsl VRT, VRA, VRB

=> 0x16e0 : vsl v0,v0,v1
(gdb) p $vr0.uint128
$21 = 0x10111213141516172021222324252650
(gdb) p $vr1.uint128
$22 = 0x0
(gdb) stepi
0x16e4 in vec_slq ()
1: x/i $pc
=> 0x16e4 : xxlor vs0,vs32,vs32
(gdb) p $vr0.uint128
$23 = 0x10111213141516172021222324252650

I guess the solution is to check for sh == 0 and, if this is the case, execute a
copy instead.

I agree with you. This will be changed in upcoming patch.



2. Variable naming is better in the old helper than in the new translator. In 
that
light, I would advise Stefan to change 'sh' to 'shift', and 'shifted' to 
'carry'.

It looks like the name "sh" comes from the ISA documentation, so whilst it's a 
little
tricky to compare with the previous implementation it does make sense when 
comparing
with the algorithm shown there. Note: this implementation also drops the check 
for
each byte of VRB having the same shift value - should we care about this?


"sh" is taken from the ISA documentation, so I would leave that as it is 
now, but I can change some other variable names to be consistent with 
previous implementation (e.g. "shifted" -> "carry").


I don't think that we should check each byte of VRB, because we care 
only about "defined" behavior. If shift values doesn't match, result is 
"undefined" so it doesn't matter what is inside resulting register.



3. Lines

tcg_gen_andi_i64(shifted, shifted, 0x7fULL);

and

tcg_gen_andi_i64(shifted, shifted, 0xfe00ULL);

appear to be spurious (albeit in a harmless way). Therefore, they should be
deleted,
or, alternatively, a justification for them should be provided.

I'm not sure why they are needed either - there's certainly no mention of it in 
the
ISA documentation. Stefan?

This will be removed in upcoming patch.



4. In the commit message, variable names were used without quotation marks,
resulting in puzzling and unclear wording.

5. (a question for Mark) After all recent changes, does get_avr64(..., ..., 
true)
always (for any endian configuration) return the "high" half of an Altivec 
register,
and get_avr64(..., ..., false) the "low" one?

Yes - the new functions always return the MSB (high) and LSB (low) correctly
regardless of host endian.


Given all these circumstances, perhaps the most reasonable solution would be to
revert the commit in question, and allow Stefan enough dev and test time to 
hopefully
submit a new, better, version later on.

Given that it has been broken for 3 months now, I don't think we're in any 
major rush
to revert ASAP. I'd prefer to give Stefan a bit more time first since he does 
report
some substantial speed improvements from these new implementations.


ATB,

Mark.


Best Regards,

Stefan




Re: [Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-08-29 Thread Stefan Brankovic


On 27.8.19. 20:52, Richard Henderson wrote:

On 8/27/19 2:37 AM, Stefan Brankovic wrote:

+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j++) {
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
+tcg_gen_or_i64(tmp, tmp, shifted);
+
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
+tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
+tcg_gen_or_i64(tmp, tmp, shifted);
+
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
+tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
+tcg_gen_or_i64(tmp, tmp, shifted);
+}
+if ((i == 0) || (i == 2)) {
+tcg_gen_shli_i64(tmp, tmp, 32);
+}
+tcg_gen_or_i64(result, result, tmp);
+if (i == 1) {
+/* Place packed pixels 1:4 to high doubleword of vD. */
+tcg_gen_mov_i64(result1, result);
+}
+if (i == 3) {
+/* Place packed pixels 5:8 to low doubleword of vD. */
+tcg_gen_mov_i64(result2, result);
+}
+}
+set_avr64(VT, result1, true);
+set_avr64(VT, result2, false);

I really have a hard time believing that it is worthwhile to inline all of this
code.  By my count this is 82 non-move opcodes.  That is a *lot* of inline
expansion.

However, I can well imagine that the existing out-of-line helper is less than
optimal.


-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];

Double indirect loads?


-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));

Store to temporary ...


-}
-}
-*r = result;

... and then copy?

Try replacing the existing helper with something like the following.


r~



static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
 uint64_t r;

 r  = ((a >> (shr + 9)) & 0x3f) << shl;
 r |= ((a >> (shr + 6)) & 0x1f) << shl;
 r |= ((a >> (shr + 3)) & 0x1f) << shl;

 return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
 return pkpx_1(ah, 32, 48)
  | pkpx_1(ah,  0, 32)
  | pkpx_1(al, 32, 16)
  | pkpx_1(al,  0,  0);
}

void helper_vpkpx(uint64_t *r, uint64_t *a, uint64_t *b)
{
 uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
 uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
 r->VsrD(0) = rh;
 r->VsrD(1) = rl;
}


I implemented vpkpx as you suggested above with small modifications (so
it builds and gives the correct result). It looks like this:


static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;

    return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    return pkpx_1(ah, 32, 48)
 | pkpx_1(ah,  0, 32)
 | pkpx_1(al, 32, 16)
 | pkpx_1(al,  0,  0);
}

void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    uint64_t rh = pkpx_2(a->u64[1], a->u64[0]);
    uint64_t rl = pkpx_2(b->u64[1], b->u64[0]);
    r->u64[1] = rh;
    r->u64[0] = rl;
}

I also noticed that this would work only for little-endian hosts, so we
would need to modify it in order to support big-endian hosts (this
shouldn't affect the performance results).

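A minimal sketch of an endian-safe variant, assuming the VsrD() accessor from
your snippet hides the host byte order (illustrative only, not a final
implementation):

static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    /* Pack the 32-bit pixel found at bit offset 'shr' and place the
     * 16-bit packed result at bit offset 'shl'. */
    r  = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;

    return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    return pkpx_1(ah, 32, 48)
         | pkpx_1(ah,  0, 32)
         | pkpx_1(al, 32, 16)
         | pkpx_1(al,  0,  0);
}

void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
    uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));

    r->VsrD(0) = rh;
    r->VsrD(1) = rl;
}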

Then

[Qemu-devel] [PATCH v6 2/3] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-08-27 Thread Stefan Brankovic
Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in the source register and places the result in the appropriate halfword
element of the destination register.

In each iteration of the outer for loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and then the count
for every halfword element is performed using tcg_gen_clzi_i64. Since it
counts leading zeros on a 64-bit length, the ith halfword element has to be
moved to the highest 16 bits of tmp and or-ed with a mask (in order to get all
ones in the lowest 48 bits); then tcg_gen_clzi_i64 is performed and its result
is moved into the appropriate halfword element of the result. This is done in
the inner for loop. After the operation is finished, the result is saved in
the appropriate doubleword element of destination register vD. The same
sequence of operations is then applied to the lower doubleword element of vB.

Optimize Altivec instruction vclzb (Vector Count Leading Zeros Byte).
This instruction counts the number of leading zeros of each byte element
in the source register and places the result in the appropriate byte element
of the destination register.

In each iteration of the outer for loop, the count operation is done on one
doubleword element of source register vB. In the first iteration, the higher
doubleword element of vB is placed in variable avr, and then the count for
every byte element is performed using tcg_gen_clzi_i64. Since it counts
leading zeros on a 64-bit length, the ith byte element has to be moved to the
highest 8 bits of variable tmp and or-ed with a mask (in order to get all ones
in the lowest 56 bits); then tcg_gen_clzi_i64 is performed and its result is
moved into the appropriate byte element of the result. This is done in the
inner for loop. After the operation is finished, the result is saved in the
appropriate doubleword element of destination register vD. The same sequence
of operations is then applied to the lower doubleword element of vB.
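
As a scalar illustration of the masking trick described above (plain C, not
the emitted TCG ops; clz64() stands for a 64-bit count-leading-zeros helper
such as the one in qemu/host-utils.h, and the function names are only for
this sketch):

/* Count leading zeros of a byte element: place the byte in the top 8 bits
 * and force the low 56 bits to all ones, so the count can never exceed 8. */
static inline int clzb_via_clz64(uint8_t v)
{
    return clz64(((uint64_t)v << 56) | 0x00ffffffffffffffULL);
}

/* Same idea for a halfword element: the value sits in the top 16 bits and
 * the low 48 bits are all ones, capping the count at 16. */
static inline int clzh_via_clz64(uint16_t v)
{
    return clz64(((uint64_t)v << 48) | 0x0000ffffffffffffULL);
}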

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 136 +++-
 3 files changed, 134 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 940a115..39c202f 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -307,8 +307,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 9ff3b03..65a9387 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1796,15 +1796,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 45a..e8a0fb6 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -840,6 +840,138 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in avr. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros on a 64-bit length, we have to move the
+ * ith byte element to the highest 8 bits of tmp, or it with a mask (so we get
+ * all ones in the lowest 56 bits), then perform tcg_gen_clzi_i64 and move
+ * it's result in appropriate b

[Qemu-devel] [PATCH v6 3/3] target/ppc: Refactor emulation of vmrgew and vmrgow instructions

2019-08-27 Thread Stefan Brankovic
Since I found these two instructions implemented with TCG, I refactored
them so they are consistent with the other similar implementations that
I introduced in this series.

Also, a new dual macro GEN_VXFORM_TRANS_DUAL is added. This macro is
used if one instruction is realized with direct translation, and the
second one with a helper.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 66 +
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index e8a0fb6..6af9c73 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -350,6 +350,28 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) 
\
 }  \
 }
 
+/*
+ * We use this macro if one instruction is realized with direct
+ * translation, and the second one with a helper.
+ */
+#define GEN_VXFORM_TRANS_DUAL(name0, flg0, flg2_0, name1, flg1, flg2_1)\
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{  \
+if ((Rc(ctx->opcode) == 0) &&  \
+((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0))) { \
+if (unlikely(!ctx->altivec_enabled)) { \
+gen_exception(ctx, POWERPC_EXCP_VPU);  \
+return;\
+}  \
+trans_##name0(ctx);\
+} else if ((Rc(ctx->opcode) == 1) &&   \
+((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1))) { \
+gen_##name1(ctx);  \
+} else {   \
+gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL);\
+}  \
+}
+
 /* Adds support to provide invalid mask */
 #define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0,\
 name1, flg1, flg2_1, inval1)\
@@ -431,20 +453,13 @@ GEN_VXFORM(vmrglb, 6, 4);
 GEN_VXFORM(vmrglh, 6, 5);
 GEN_VXFORM(vmrglw, 6, 6);
 
-static void gen_vmrgew(DisasContext *ctx)
+static void trans_vmrgew(DisasContext *ctx)
 {
-TCGv_i64 tmp;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-tmp = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(avr, VB, true);
 tcg_gen_shri_i64(tmp, avr, 32);
@@ -462,21 +477,14 @@ static void gen_vmrgew(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
-static void gen_vmrgow(DisasContext *ctx)
+static void trans_vmrgow(DisasContext *ctx)
 {
-TCGv_i64 t0, t1;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-t0 = tcg_temp_new_i64();
-t1 = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(t0, VB, true);
 get_avr64(t1, VA, true);
@@ -1165,14 +1173,14 @@ GEN_VXFORM_ENV(vminfp, 5, 17);
 GEN_VXFORM_HETRO(vextublx, 6, 24)
 GEN_VXFORM_HETRO(vextuhlx, 6, 25)
 GEN_VXFORM_HETRO(vextuwlx, 6, 26)
-GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+GEN_VXFORM_TRANS_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwlx, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_HETRO(vextubrx, 6, 28)
 GEN_VXFORM_HETRO(vextuhrx, 6, 29)
 GEN_VXFORM_HETRO(vextuwrx, 6, 30)
 GEN_VXFORM_TRANS(lvsl, 6, 31)
 GEN_VXFORM_TRANS(lvsr, 6, 32)
-GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \
+GEN_VXFORM_TRANS_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwrx, PPC_NONE, PPC2_ISA300)
 
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3) \
-- 
2.7.4




[Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

2019-08-27 Thread Stefan Brankovic
Optimize Altivec instruction vpkpx (Vector Pack Pixel).
It rearranges 8 pixels coded in the 6-5-5 pattern (4 from each source
register) into a contiguous array of bits in the destination register.

In each iteration of the outer loop, the 6-5-5 pack is done for the 2 pixels
of one doubleword element of one source register. The first thing done in the
outer loop is choosing which doubleword element of which register is used in
the current iteration; it is placed in the avr variable. The next step is to
perform the 6-5-5 pack of pixels on the avr variable in the inner for loop
(2 iterations, 1 for each pixel) and save the result in the tmp variable. At
the end of the outer for loop, the result is merged into the variable called
result and saved in the appropriate doubleword element of vD if the whole
doubleword is finished (every second iteration). The outer loop has 4
iterations.
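
For reference, a scalar sketch of the per-doubleword step (the inner loop
described above does the same thing with TCG shifts and masks; the function
name is only for this illustration):

/* Pack the 2 pixels of one source doubleword into the 2 low halfwords of
 * the result, using the same field extraction as the removed helper. */
static inline uint64_t pack_two_pixels(uint64_t dw)
{
    uint64_t out = 0;
    int j;

    for (j = 0; j < 2; j++) {
        uint32_t e = dw >> (32 * j);            /* pixel j of this doubleword */
        uint64_t packed = ((e >> 9) & 0xfc00)   /* 6-bit field */
                        | ((e >> 6) & 0x3e0)    /* 5-bit field */
                        | ((e >> 3) & 0x1f);    /* 5-bit field */

        out |= packed << (16 * j);
    }
    return out;
}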

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 
 target/ppc/translate/vmx-impl.inc.c | 99 -
 3 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 54ea9b9..940a115 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,7 +258,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 46deb57..9ff3b03 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1262,27 +1262,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];
-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
-}
-}
-*r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 0d71c10..45a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -571,6 +571,103 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into a contiguous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j+

[Qemu-devel] [PATCH v6 0/3] target/ppc: Optimize emulation of some Altivec instructions

2019-08-27 Thread Stefan Brankovic
Optimize emulation of ten Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw and vclzd.

This series builds on and complements recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising a
TCG translation implementation for selected instructions rather than using
helpers. The selected instructions are mostly idiosyncratic to the ppc
platform, so a relatively complex TCG translation (since a direct mapping to a
host instruction is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements
are significant in all cases.

V6:

Rebased series to the latest qemu code.
Excluded all patches that are already accepted.

V5:

Fixed vpkpx bug and added it back in patch.
Fixed graphical distortions on OSX 10.3 and 10.4.
Removed conversion of vmrgh and vmrgl instructions to vector operations for
further investigation.

V4:

Addressed Richard Henderson's suggestions.
Removed vpkpx's optimization for further investigation on graphical distortions
it caused on OSX 10.2-4 guests.
Added opcodes for vector vmrgh(b|h|w) and vmrgl(b|h|w) in tcg.
Implemented vector vmrgh and vmrgl instructions for i386.
Converted vmrgh and vmrgl instructions to vector operations.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (3):
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Refactor emulation of vmrgew and vmrgow instructions

 target/ppc/helper.h |   3 -
 target/ppc/int_helper.c |  30 
 target/ppc/translate/vmx-impl.inc.c | 301 
 3 files changed, 269 insertions(+), 65 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v5 7/8] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-07-15 Thread Stefan Brankovic
Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in the source register and places the result in the appropriate halfword
element of the destination register.

In each iteration of the outer for loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and then the count
for every halfword element is performed using tcg_gen_clzi_i64. Since it
counts leading zeros on a 64-bit length, the ith halfword element has to be
moved to the highest 16 bits of tmp and or-ed with a mask (in order to get all
ones in the lowest 48 bits); then tcg_gen_clzi_i64 is performed and its result
is moved into the appropriate halfword element of the result. This is done in
the inner for loop. After the operation is finished, the result is saved in
the appropriate doubleword element of destination register vD. The same
sequence of operations is then applied to the lower doubleword element of vB.

Optimize Altivec instruction vclzb (Vector Count Leading Zeros Byte).
This instruction counts the number of leading zeros of each byte element
in the source register and places the result in the appropriate byte element
of the destination register.

In each iteration of the outer for loop, the count operation is done on one
doubleword element of source register vB. In the first iteration, the higher
doubleword element of vB is placed in variable avr, and then the count for
every byte element is performed using tcg_gen_clzi_i64. Since it counts
leading zeros on a 64-bit length, the ith byte element has to be moved to the
highest 8 bits of variable tmp and or-ed with a mask (in order to get all ones
in the lowest 56 bits); then tcg_gen_clzi_i64 is performed and its result is
moved into the appropriate byte element of the result. This is done in the
inner for loop. After the operation is finished, the result is saved in the
appropriate doubleword element of destination register vD. The same sequence
of operations is then applied to the lower doubleword element of vB.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 136 +++-
 3 files changed, 134 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 595241c..17b4b06 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -303,8 +303,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 82cb12e..264b5e7 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1800,15 +1800,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 45a..e8a0fb6 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -840,6 +840,138 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in avr. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros on a 64-bit length, we have to move the
+ * ith byte element to the highest 8 bits of tmp, or it with a mask (so we get
+ * all ones in the lowest 56 bits), then perform tcg_gen_clzi_i64 and move
+ * it's result in appropriate b

[Qemu-devel] [PATCH v5 6/8] target/ppc: Optimize emulation of vclzw instruction

2019-07-15 Thread Stefan Brankovic
Optimize Altivec instruction vclzw (Vector Count Leading Zeros Word).
This instruction counts the number of leading zeros of each word element
in the source register and places the result in the appropriate word element
of the destination register.

The count is performed in four iterations of a for loop (one for each word
element of source register vB). Every iteration consists of loading the
appropriate word element from the source register, counting the leading zeros
with tcg_gen_clzi_i32, and saving the result in the appropriate word element
of the destination register.
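
As a plain C reference for what the translation computes (illustrative only;
clz32() as provided by qemu/host-utils.h, where clz32(0) is 32, and the
function name is made up for this sketch):

/* One count per word element, source and destination viewed as 4 words. */
static void vclzw_ref(uint32_t dst[4], const uint32_t src[4])
{
    int i;

    for (i = 0; i < 4; i++) {
        dst[i] = clz32(src[i]);
    }
}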

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 3b92e3b..595241c 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -305,7 +305,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
-DEF_HELPER_2(vclzw, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 3134798..82cb12e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1802,15 +1802,12 @@ VUPK(lsw, s64, s32, UPKLO)
 
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-#define clzw(v) clz32((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
-VGENERIC_DO(clzw, u32)
 
 #undef clzb
 #undef clzh
-#undef clzw
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 75c5c8c..45a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -840,6 +840,32 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzw VRT,VRB - Vector Count Leading Zeros Word
+ *
+ * Counting the number of leading zero bits of each word element in source
+ * register and placing result in appropriate word element of destination
+ * register.
+ */
+static void trans_vclzw(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i32 tmp = tcg_temp_new_i32();
+int i;
+
+/* Perform count for every word element using tcg_gen_clzi_i32. */
+for (i = 0; i < 4; i++) {
+tcg_gen_ld_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VB].u64[0]) + i * 4);
+tcg_gen_clzi_i32(tmp, tmp, 32);
+tcg_gen_st_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VT].u64[0]) + i * 4);
+}
+
+tcg_temp_free_i32(tmp);
+}
+
+/*
  * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
  *
  * Counting the number of leading zero bits of each doubleword element in 
source
@@ -1380,7 +1406,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
-GEN_VXFORM_NOA(vclzw, 1, 30)
+GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
-- 
2.7.4




[Qemu-devel] [PATCH v5 5/8] target/ppc: Optimize emulation of vclzd instruction

2019-07-15 Thread Stefan Brankovic
Optimize Altivec instruction vclzd (Vector Count Leading Zeros Doubleword).
This instruction counts the number of leading zeros of each doubleword element
in the source register and places the result in the appropriate doubleword
element of the destination register.

TCG's count-leading-zeros operation is used twice (once for each doubleword
element of source register vB) and the result is placed in the appropriate
doubleword element of destination register vD.

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 0aa1e05..3b92e3b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -306,7 +306,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vclzw, void, avr, avr)
-DEF_HELPER_2(vclzd, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 1e32549..3134798 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1803,17 +1803,14 @@ VUPK(lsw, s64, s32, UPKLO)
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
 #define clzw(v) clz32((v))
-#define clzd(v) clz64((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
 VGENERIC_DO(clzw, u32)
-VGENERIC_DO(clzd, u64)
 
 #undef clzb
 #undef clzh
 #undef clzw
-#undef clzd
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 781c866..75c5c8c 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -839,6 +839,32 @@ static void trans_vgbbd(DisasContext *ctx)
 tcg_temp_free_i64(avr[1]);
 }
 
+/*
+ * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
+ *
+ * Counting the number of leading zero bits of each doubleword element in 
source
+ * register and placing result in appropriate doubleword element of destination
+ * register.
+ */
+static void trans_vclzd(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+
+/* high doubleword */
+get_avr64(avr, VB, true);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, true);
+
+/* low doubleword */
+get_avr64(avr, VB, false);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, false);
+
+tcg_temp_free_i64(avr);
+}
+
 GEN_VXFORM(vmuloub, 4, 0);
 GEN_VXFORM(vmulouh, 4, 1);
 GEN_VXFORM(vmulouw, 4, 2);
@@ -1355,7 +1381,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_NOA(vclzw, 1, 30)
-GEN_VXFORM_NOA(vclzd, 1, 31)
+GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
 GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16)
-- 
2.7.4




[Qemu-devel] [PATCH v5 4/8] target/ppc: Optimize emulation of vgbbd instruction

2019-07-15 Thread Stefan Brankovic
Optimize Altivec instruction vgbbd (Vector Gather Bits by Bytes by Doubleword).
All ith bits (i in the range 1 to 8) of each byte of a doubleword element in
the source register are concatenated and placed into the ith byte of the
appropriate doubleword element in the destination register.

The following solution is done for both doubleword elements of the source
register in parallel, in order to reduce the number of instructions needed
(that's why arrays are used):
First, both doubleword elements of source register vB are placed in the
appropriate elements of array avr. Bits are gathered in 2x8 iterations (2 for
loops). In the first iteration, bit 1 of byte 1, bit 2 of byte 2, ... bit 8 of
byte 8 are in their final spots, so avr[i], i={0,1}, can be and-ed with
tcg_mask. For every following iteration, both avr[i] and tcg_mask have to be
shifted right by 7 and 8 places, respectively, in order to get bit 1 of byte 2,
bit 2 of byte 3, ... bit 7 of byte 8 into their final spots, so the shifted avr
values (saved in tmp) can be and-ed with the new value of tcg_mask... After the
first 8 iterations (the first loop), all the first bits are in their final
places, all the second bits except the second bit of the eighth byte are in
their places, and so on; only the eighth bit of the eighth byte is in its
place. In the second loop all operations are done symmetrically, in order to
get the other half of the bits into their final spots. The results for the
first and second doubleword elements are saved in result[0] and result[1]
respectively. In the end those results are saved in the appropriate doubleword
elements of destination register vD.
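
For reference, this is what one doubleword of vgbbd computes, written as plain
C (an 8x8 bit-matrix transpose; bit i is counted from the most significant bit
of each byte, matching the description above, and the function name is only
for this illustration, not the emitted TCG code):

/* View the doubleword as an 8x8 bit matrix: row j is byte j (most
 * significant byte first), column i is bit i within the byte (MSB first).
 * vgbbd transposes this matrix: result byte i collects bit i of every
 * source byte. */
static uint64_t gbbd_doubleword(uint64_t src)
{
    uint64_t dst = 0;
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            uint64_t bit = (src >> (63 - (8 * j + i))) & 1;

            dst |= bit << (63 - (8 * i + j));
        }
    }
    return dst;
}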

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |   1 -
 target/ppc/int_helper.c | 276 
 target/ppc/translate/vmx-impl.inc.c |  77 +-
 3 files changed, 76 insertions(+), 278 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 7a3d68d..0aa1e05 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -319,7 +319,6 @@ DEF_HELPER_1(vclzlsbb, tl, avr)
 DEF_HELPER_1(vctzlsbb, tl, avr)
 DEF_HELPER_3(vbpermd, void, avr, avr, avr)
 DEF_HELPER_3(vbpermq, void, avr, avr, avr)
-DEF_HELPER_2(vgbbd, void, avr, avr)
 DEF_HELPER_3(vpmsumb, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumh, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumw, void, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f397380..1e32549 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1185,282 +1185,6 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #undef VBPERMQ_INDEX
 #undef VBPERMQ_DW
 
-static const uint64_t VGBBD_MASKS[256] = {
-0xull, /* 00 */
-0x0080ull, /* 01 */
-0x8000ull, /* 02 */
-0x8080ull, /* 03 */
-0x0080ull, /* 04 */
-0x00800080ull, /* 05 */
-0x00808000ull, /* 06 */
-0x00808080ull, /* 07 */
-0x8000ull, /* 08 */
-0x8080ull, /* 09 */
-0x80008000ull, /* 0A */
-0x80008080ull, /* 0B */
-0x8080ull, /* 0C */
-0x80800080ull, /* 0D */
-0x80808000ull, /* 0E */
-0x80808080ull, /* 0F */
-0x0080ull, /* 10 */
-0x00800080ull, /* 11 */
-0x00808000ull, /* 12 */
-0x00808080ull, /* 13 */
-0x00800080ull, /* 14 */
-0x008000800080ull, /* 15 */
-0x008000808000ull, /* 16 */
-0x008000808080ull, /* 17 */
-0x00808000ull, /* 18 */
-0x00808080ull, /* 19 */
-0x008080008000ull, /* 1A */
-0x008080008080ull, /* 1B */
-0x00808080ull, /* 1C */
-0x008080800080ull, /* 1D */
-0x008080808000ull, /* 1E */
-0x008080808080ull, /* 1F */
-0x8000ull, /* 20 */
-0x8080ull, /* 21 */
-0x80008000ull, /* 22 */
-0x80008080ull, /* 23 */
-0x8080ull, /* 24 */
-0x80800080ull, /* 25 */
-0x80808000ull, /* 26 */
-0x80808080ull, /* 27 */
-0x80008000ull, /* 28 */
-0x80008080ull, /* 29 */
-0x800080008000ull, /* 2A */
-0x800080008080ull, /* 2B */
-0x80008080ull, /* 2C */
-0x800080800080ull, /* 2D */
-0x800080808000ull, /* 2E */
-0x800080808080ull, /* 2F */
-0x8080ull, /* 30 */
-0x80800080ull, /* 31 */
-0x80808000ull, /* 32 */
-0x80808080ull, /* 33 */
-0x80800080ull, /* 34 */
-0x808000800080ull, /* 35 */
-0x808000808000ull, /* 36 */
-0x808000808080ull, /* 37 */
-0x80808000ull, /* 38 */
-0x80808080ull, /* 39 */
-0x808080008000ull, /* 3A */
-0x808080008080ull, /* 3B */
-0x80808080ull, /* 3C */
-0x808080800080ull, /* 3D */
-0x808080808000ull, /* 3E

[Qemu-devel] [PATCH v5 8/8] target/ppc: Refactor emulation of vmrgew and vmrgow instructions

2019-07-15 Thread Stefan Brankovic
Since I found these two instructions implemented with TCG, I refactored
them so they are consistent with the other similar implementations that
I introduced in this series.

Also, a new dual macro GEN_VXFORM_TRANS_DUAL is added. This macro is
used if one instruction is realized with direct translation, and the
second one with a helper.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 66 +
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index e8a0fb6..6af9c73 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -350,6 +350,28 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) 
\
 }  \
 }
 
+/*
+ * We use this macro if one instruction is realized with direct
+ * translation, and the second one with a helper.
+ */
+#define GEN_VXFORM_TRANS_DUAL(name0, flg0, flg2_0, name1, flg1, flg2_1)\
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{  \
+if ((Rc(ctx->opcode) == 0) &&  \
+((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0))) { \
+if (unlikely(!ctx->altivec_enabled)) { \
+gen_exception(ctx, POWERPC_EXCP_VPU);  \
+return;\
+}  \
+trans_##name0(ctx);\
+} else if ((Rc(ctx->opcode) == 1) &&   \
+((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1))) { \
+gen_##name1(ctx);  \
+} else {   \
+gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL);\
+}  \
+}
+
 /* Adds support to provide invalid mask */
 #define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0,\
 name1, flg1, flg2_1, inval1)\
@@ -431,20 +453,13 @@ GEN_VXFORM(vmrglb, 6, 4);
 GEN_VXFORM(vmrglh, 6, 5);
 GEN_VXFORM(vmrglw, 6, 6);
 
-static void gen_vmrgew(DisasContext *ctx)
+static void trans_vmrgew(DisasContext *ctx)
 {
-TCGv_i64 tmp;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-tmp = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(avr, VB, true);
 tcg_gen_shri_i64(tmp, avr, 32);
@@ -462,21 +477,14 @@ static void gen_vmrgew(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
-static void gen_vmrgow(DisasContext *ctx)
+static void trans_vmrgow(DisasContext *ctx)
 {
-TCGv_i64 t0, t1;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-t0 = tcg_temp_new_i64();
-t1 = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(t0, VB, true);
 get_avr64(t1, VA, true);
@@ -1165,14 +1173,14 @@ GEN_VXFORM_ENV(vminfp, 5, 17);
 GEN_VXFORM_HETRO(vextublx, 6, 24)
 GEN_VXFORM_HETRO(vextuhlx, 6, 25)
 GEN_VXFORM_HETRO(vextuwlx, 6, 26)
-GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+GEN_VXFORM_TRANS_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwlx, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_HETRO(vextubrx, 6, 28)
 GEN_VXFORM_HETRO(vextuhrx, 6, 29)
 GEN_VXFORM_HETRO(vextuwrx, 6, 30)
 GEN_VXFORM_TRANS(lvsl, 6, 31)
 GEN_VXFORM_TRANS(lvsr, 6, 32)
-GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \
+GEN_VXFORM_TRANS_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwrx, PPC_NONE, PPC2_ISA300)
 
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3) \
-- 
2.7.4




[Qemu-devel] [PATCH v5 3/8] target/ppc: Optimize emulation of vpkpx instruction

2019-07-15 Thread Stefan Brankovic
Optimize Altivec instruction vpkpx (Vector Pack Pixel).
It rearranges 8 pixels coded in the 6-5-5 pattern (4 from each source
register) into a contiguous array of bits in the destination register.

In each iteration of the outer loop, the 6-5-5 pack is done for the 2 pixels
of one doubleword element of one source register. The first thing done in the
outer loop is choosing which doubleword element of which register is used in
the current iteration; it is placed in the avr variable. The next step is to
perform the 6-5-5 pack of pixels on the avr variable in the inner for loop
(2 iterations, 1 for each pixel) and save the result in the tmp variable. At
the end of the outer for loop, the result is merged into the variable called
result and saved in the appropriate doubleword element of vD if the whole
doubleword is finished (every second iteration). The outer loop has 4
iterations.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 
 target/ppc/translate/vmx-impl.inc.c | 99 -
 3 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 33dad6a..7a3d68d 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -254,7 +254,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index a23853e..f397380 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1536,27 +1536,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];
-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
-}
-}
-*r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index e06e65a..64e598f 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -571,6 +571,103 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into contiguous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j+

[Qemu-devel] [PATCH v5 1/8] target/ppc: Optimize emulation of lvsl and lvsr instructions

2019-07-15 Thread Stefan Brankovic
Add a simple macro that calls the TCG implementation of the appropriate
instruction if Altivec support is active.

Optimization of altivec instruction lvsl (Load Vector for Shift Left).
Place bytes sh:sh+15 of value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F
in destination register. Sh is calculated by adding 2 source registers and
getting bits 60-63 of result.

First, bits [28-31] of EA are placed in variable sh. After that,
the bytes are created in the following way:
bytes sh:(sh+7) of X (from the description) by multiplying sh with
0x0101010101010101 and adding 0x0001020304050607 to the result. The value
obtained is placed in the higher doubleword element of vD.
bytes (sh+8):(sh+15) by adding 0x08090a0b0c0d0e0f to the result of the
previous multiplication. The value obtained is placed in the lower doubleword
element of vD.

Optimization of altivec instruction lvsr (Load Vector for Shift Right).
Place bytes 16-sh:31-sh of value 0x00 || 0x01 || 0x02 || ... || 0x1E ||
0x1F in destination register. Sh is calculated by adding 2 source
registers and getting bits 60-63 of result.

First, bits [28-31] of EA are placed in variable sh. After that,
the bytes are created in the following way:
bytes sh:(sh+7) of X (from the description) by multiplying sh with
0x0101010101010101 and subtracting the result from 0x1011121314151617. The
value obtained is placed in the higher doubleword element of vD.
bytes (sh+8):(sh+15) by subtracting the result of the previous multiplication
from 0x18191a1b1c1d1e1f. The value obtained is placed in the lower doubleword
element of vD.
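
As a sanity check, here is a minimal scalar sketch of the arithmetic described
above (plain C, not the TCG code itself); only bits 60-63 of EA matter, and
the function name is illustrative:

#include <stdint.h>

/* Scalar model of lvsl/lvsr: compute the two doubleword halves of vD. */
static void model_lvsl_lvsr(uint64_t ea, uint64_t lvsl[2], uint64_t lvsr[2])
{
    uint64_t sh = ea & 0xf;                    /* bits 60-63 of EA */

    /* lvsl: bytes sh..sh+15 of 0x00 01 02 ... 1F */
    lvsl[0] = sh * 0x0101010101010101ULL + 0x0001020304050607ULL; /* high */
    lvsl[1] = sh * 0x0101010101010101ULL + 0x08090a0b0c0d0e0fULL; /* low  */

    /* lvsr: bytes 16-sh..31-sh, same multiplication, subtraction instead */
    lvsr[0] = 0x1011121314151617ULL - sh * 0x0101010101010101ULL; /* high */
    lvsr[1] = 0x18191a1b1c1d1e1fULL - sh * 0x0101010101010101ULL; /* low  */
}

For example, with sh = 1 the lvsl halves are 0x0102030405060708 and
0x090a0b0c0d0e0f10, i.e. bytes 1..16 of the 0x00..0x1F sequence.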

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |  18 --
 target/ppc/translate/vmx-impl.inc.c | 121 ++--
 3 files changed, 89 insertions(+), 52 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 02b67a3..c82105e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -189,8 +189,6 @@ DEF_HELPER_2(vprtybw, void, avr, avr)
 DEF_HELPER_2(vprtybd, void, avr, avr)
 DEF_HELPER_2(vprtybq, void, avr, avr)
 DEF_HELPER_3(vsubcuw, void, avr, avr, avr)
-DEF_HELPER_2(lvsl, void, avr, tl)
-DEF_HELPER_2(lvsr, void, avr, tl)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8ce89f2..9505f4c 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -457,24 +457,6 @@ SATCVT(sd, uw, int64_t, uint32_t, 0, UINT32_MAX)
 #undef SATCVT
 #undef SATCVTU
 
-void helper_lvsl(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
-void helper_lvsr(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = 0x10 - (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
 void helper_mtvscr(CPUPPCState *env, uint32_t vscr)
 {
 env->vscr = vscr & ~(1u << VSCR_SAT);
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 663275b..a9fe3c7 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -142,38 +142,6 @@ GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
 GEN_VR_STVE(wx, 0x07, 0x06, 4);
 
-static void gen_lvsl(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsl(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
-static void gen_lvsr(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsr(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
 static void gen_mfvscr(DisasContext *ctx)
 {
 TCGv_i32 t;
@@ -316,6 +284,16 @@ static void glue(gen_, name)(DisasContext *ctx)
 \
 tcg_temp_free_ptr(rd);  \
 }
 
+#define GEN_VXFORM_TRANS(name, opc2, opc3)  \
+static void glue(gen_, name)(DisasContext *ctx) \
+{   \
+if (unlikely(!ctx->altivec_enabled)) {  \
+gen_exception(ctx, POWERPC_

[Qemu-devel] [PATCH v5 2/8] target/ppc: Optimize emulation of vsl and vsr instructions

2019-07-15 Thread Stefan Brankovic
Optimization of altivec instructions vsl and vsr (Vector Shift Left/Right).
Perform a shift operation (left and right respectively) on the 128-bit value
of register vA by the value specified in bits 125-127 of register vB. The
lowest 3 bits in each byte element of register vB must be identical or the
result is undefined.

For the vsl instruction, the first step is that bits 125-127 of register vB
have to be saved in variable sh. Then, the highest sh bits of the lower
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is performed on
the lower doubleword element of register vA, which is the next
step. After shifting the lower doubleword element, the shift operation
is performed on the higher doubleword element of vA, with replacement of
the lowest sh bits (that are now 0) with the bits saved in shifted.

For the vsr instruction, firstly, bits 125-127 of register vB have
to be saved in variable sh. Then, the lowest sh bits of the higher
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is
performed on the higher doubleword element of register vA, which is
the next step. After shifting the higher doubleword element, the shift
operation is performed on the lower doubleword element of vA, with
replacement of the highest sh bits (that are now 0) with the bits saved
in shifted.
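
For illustration, a scalar sketch of the 128-bit shift built from two 64-bit
halves with the carry handling described above; it mirrors the removed VSHIFT
helper, and the function name is illustrative:

#include <stdint.h>

/* Scalar model of vsl/vsr on a 128-bit value held as hi/lo doublewords,
 * with 0 <= sh <= 7 taken from bits 125-127 of vB. */
static void shift128(uint64_t *hi, uint64_t *lo, unsigned sh, int left)
{
    if (sh == 0) {
        return;                              /* nothing to do */
    }
    if (left) {                              /* vsl */
        uint64_t carry = *lo >> (64 - sh);   /* highest sh bits of lo */
        *hi = (*hi << sh) | carry;
        *lo <<= sh;
    } else {                                 /* vsr */
        uint64_t carry = *hi << (64 - sh);   /* lowest sh bits of hi */
        *lo = (*lo >> sh) | carry;
        *hi >>= sh;
    }
}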

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |  35 -
 target/ppc/translate/vmx-impl.inc.c | 101 +++-
 3 files changed, 99 insertions(+), 39 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index c82105e..33dad6a 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -213,8 +213,6 @@ DEF_HELPER_3(vrlb, void, avr, avr, avr)
 DEF_HELPER_3(vrlh, void, avr, avr, avr)
 DEF_HELPER_3(vrlw, void, avr, avr, avr)
 DEF_HELPER_3(vrld, void, avr, avr, avr)
-DEF_HELPER_3(vsl, void, avr, avr, avr)
-DEF_HELPER_3(vsr, void, avr, avr, avr)
 DEF_HELPER_4(vsldoi, void, avr, avr, avr, i32)
 DEF_HELPER_3(vextractub, void, avr, avr, i32)
 DEF_HELPER_3(vextractuh, void, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 9505f4c..a23853e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1738,41 +1738,6 @@ VEXTU_X_DO(vextuhrx, 16, 0)
 VEXTU_X_DO(vextuwrx, 32, 0)
 #undef VEXTU_X_DO
 
-/*
- * The specification says that the results are undefined if all of the
- * shift counts are not identical.  We check to make sure that they
- * are to conform to what real hardware appears to do.
- */
-#define VSHIFT(suffix, leftp)   \
-void helper_vs##suffix(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)\
-{   \
-int shift = b->VsrB(15) & 0x7;  \
-int doit = 1;   \
-int i;  \
-\
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {   \
-doit = doit && ((b->u8[i] & 0x7) == shift); \
-}   \
-if (doit) { \
-if (shift == 0) {   \
-*r = *a;\
-} else if (leftp) { \
-uint64_t carry = a->VsrD(1) >> (64 - shift);\
-\
-r->VsrD(0) = (a->VsrD(0) << shift) | carry; \
-r->VsrD(1) = a->VsrD(1) << shift;   \
-} else {\
-uint64_t carry = a->VsrD(0) << (64 - shift);\
-\
-r->VsrD(1) = (a->VsrD(1) >> shift) | carry; \
-r->VsrD(0) = a->VsrD(0) >> shift;   \
-}   \
-}   \
-}
-VSHIFT(l, 1)
-VSHIFT(r, 0)
-#undef VSHIFT
-
 void helper_vslv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int i;
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index a9fe3c7..e06e65a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -570,6 +570,103 @@ 

[Qemu-devel] [PATCH v5 0/8] target/ppc: Optimize emulation of some Altivec instructions

2019-07-15 Thread Stefan Brankovic
Optimize emulation of ten Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw and vclzd.

This series builds up on and complements the recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising a
TCG translation implementation for selected instructions rather than using
helpers. The selected instructions are most of the time idiosyncratic to the
ppc platform, so a relatively complex TCG translation (a direct mapping to a
host instruction is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements
are significant in all cases.

V5:

Fixed vpkpx bug and added it back in patch.
Fixed graphical distortions on OSX 10.3 and 10.4.
Removed conversion of vmrgh and vmrgl instructions to vector operations for
further investigation.

V4:

Addressed Richard Henderson's suggestions.
Removed vpkpx's optimization for further investigation on graphical distortions
it caused on OSX 10.2-4 guests.
Added opcodes for vector vmrgh(b|h|w) and vmrgl(b|h|w) in tcg.
Implemented vector vmrgh and vmrgl instructions for i386.
Converted vmrgh and vmrgl instructions to vector operations.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (8):
  target/ppc: Optimize emulation of lvsl and lvsr instructions
  target/ppc: Optimize emulation of vsl and vsr instructions
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vgbbd instruction
  target/ppc: Optimize emulation of vclzd instruction
  target/ppc: Optimize emulation of vclzw instruction
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Refactor emulation of vmrgew and vmrgow instructions

 target/ppc/helper.h |  10 -
 target/ppc/int_helper.c | 365 
 target/ppc/translate/vmx-impl.inc.c | 656 
 3 files changed, 587 insertions(+), 444 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v4 12/13] tcg/i386: Implement vector vmrgl instructions

2019-06-27 Thread Stefan Brankovic
Signed-off-by: Stefan Brankovic 
---
 tcg/i386/tcg-target.h |  2 +-
 tcg/i386/tcg-target.inc.c | 10 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index e825324..d20d08f 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -193,7 +193,7 @@ extern bool have_avx2;
 #define TCG_TARGET_HAS_bitsel_vec   0
 #define TCG_TARGET_HAS_cmpsel_vec   -1
 #define TCG_TARGET_HAS_vmrgh_vec1
-#define TCG_TARGET_HAS_vmrgl_vec0
+#define TCG_TARGET_HAS_vmrgl_vec1
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 31e1b2b..dc3cd65 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -2826,6 +2826,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
 case INDEX_op_vmrgh_vec:
 insn = punpckh_insn[vece];
 goto gen_simd;
+case INDEX_op_vmrgl_vec:
+insn = punpckl_insn[vece];
+goto gen_simd;
 case INDEX_op_shlv_vec:
 insn = shlv_insn[vece];
 goto gen_simd;
@@ -3227,6 +3230,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 case INDEX_op_smax_vec:
 case INDEX_op_umax_vec:
 case INDEX_op_vmrgh_vec:
+case INDEX_op_vmrgl_vec:
 case INDEX_op_shlv_vec:
 case INDEX_op_shrv_vec:
 case INDEX_op_sarv_vec:
@@ -3327,6 +3331,8 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, 
unsigned vece)
 return vece <= MO_32;
 case INDEX_op_vmrgh_vec:
 return vece <= MO_32 ? -1 : 0;
+case INDEX_op_vmrgl_vec:
+return vece <= MO_32 ? -1 : 0;
 
 default:
 return 0;
@@ -3671,6 +3677,10 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, 
unsigned vece,
 v2 = temp_tcgv_vec(arg_temp(a2));
 expand_vec_vmrg(opc, type, vece, v0, v1, v2);
 break;
+case INDEX_op_vmrgl_vec:
+v2 = temp_tcgv_vec(arg_temp(a2));
+expand_vec_vmrg(opc, type, vece, v0, v1, v2);
+break;
 
 default:
 break;
-- 
2.7.4




[Qemu-devel] [PATCH v4 08/13] tcg: Add opcodes for vector vmrgh instructions

2019-06-27 Thread Stefan Brankovic
Signed-off-by: Stefan Brankovic 
---
 accel/tcg/tcg-runtime-gvec.c | 42 ++
 accel/tcg/tcg-runtime.h  |  4 
 tcg/i386/tcg-target.h|  1 +
 tcg/tcg-op-gvec.c| 23 +++
 tcg/tcg-op-gvec.h|  3 +++
 tcg/tcg-op-vec.c |  5 +
 tcg/tcg-op.h |  2 ++
 tcg/tcg-opc.h|  2 ++
 tcg/tcg.c|  2 ++
 tcg/tcg.h|  1 +
 10 files changed, 85 insertions(+)

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index 51cb29c..28173ae 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -1458,3 +1458,45 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void 
*c, uint32_t desc)
 }
 clear_high(d, oprsz, desc);
 }
+
+void HELPER(gvec_vmrgh8)(void *d, void *a, void *b, uint32_t desc)
+{
+intptr_t oprsz = simd_oprsz(desc);
+intptr_t i;
+
+for (i = 0; i < (oprsz / 2); i += sizeof(uint8_t)) {
+uint8_t aa = *(uint8_t *)(a + 8 * sizeof(uint8_t) + i);
+uint8_t bb = *(uint8_t *)(b + 8 * sizeof(uint8_t) + i);
+*(uint8_t *)(d + 2 * i) = bb;
+*(uint8_t *)(d + 2 * i + sizeof(uint8_t)) = aa;
+}
+clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_vmrgh16)(void *d, void *a, void *b, uint32_t desc)
+{
+intptr_t oprsz = simd_oprsz(desc);
+intptr_t i;
+
+for (i = 0; i < (oprsz / 2); i += sizeof(uint16_t)) {
+uint16_t aa = *(uint16_t *)(a + 4 * sizeof(uint16_t) + i);
+uint16_t bb = *(uint16_t *)(b + 4 * sizeof(uint16_t) + i);
+*(uint16_t *)(d + 2 * i) = bb;
+*(uint16_t *)(d + 2 * i + sizeof(uint16_t)) = aa;
+}
+clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_vmrgh32)(void *d, void *a, void *b, uint32_t desc)
+{
+intptr_t oprsz = simd_oprsz(desc);
+intptr_t i;
+
+for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+uint32_t aa = *(uint32_t *)(a + 2 * sizeof(uint32_t) + i);
+uint32_t bb = *(uint32_t *)(b + 2 * sizeof(uint32_t) + i);
+*(uint32_t *)(d + 2 * i) = bb;
+*(uint32_t *)(d + 2 * i + sizeof(uint32_t)) = aa;
+}
+clear_high(d, oprsz, desc);
+}
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 4fa61b4..089956f 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -305,3 +305,7 @@ DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_vmrgh8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vmrgh16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vmrgh32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 928e8b8..e11b22d 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -192,6 +192,7 @@ extern bool have_avx2;
 #define TCG_TARGET_HAS_minmax_vec   1
 #define TCG_TARGET_HAS_bitsel_vec   0
 #define TCG_TARGET_HAS_cmpsel_vec   -1
+#define TCG_TARGET_HAS_vmrgh_vec0
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 17679b6..2560fb6 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -2102,6 +2102,29 @@ void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, [vece]);
 }
 
+static const TCGOpcode vecop_list_vmrgh[] = { INDEX_op_vmrgh_vec, 0 };
+
+void tcg_gen_gvec_vmrgh(unsigned vece, uint32_t dofs, uint32_t aofs,
+   uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+static const GVecGen3 g[3] = {
+{ .fniv = tcg_gen_vmrgh_vec,
+  .fno = gen_helper_gvec_vmrgh8,
+  .opt_opc = vecop_list_vmrgh,
+  .vece = MO_8 },
+{ .fniv = tcg_gen_vmrgh_vec,
+  .fno = gen_helper_gvec_vmrgh16,
+  .opt_opc = vecop_list_vmrgh,
+  .vece = MO_16 },
+{ .fniv = tcg_gen_vmrgh_vec,
+  .fno = gen_helper_gvec_vmrgh32,
+  .opt_opc = vecop_list_vmrgh,
+  .vece = MO_32 }
+};
+tcg_debug_assert(vece <= MO_64);
+tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, [vece]);
+}
+
 /* Perform a vector negation using normal negation and a mask.
Compare gen_subv_mask above.  */
 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index 830d68f..8c04d71 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -272,6 +272,9 @@ void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, 
uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_umax(unsigned vece, 

[Qemu-devel] [PATCH v4 07/13] target/ppc: Refactor emulation of vmrgew and vmrgow instructions

2019-06-27 Thread Stefan Brankovic
Since I found these two instructions already implemented with TCG, I
refactored them so they are consistent with the other similar
implementations that I introduced in this patch.

Also, a new dual macro GEN_VXFORM_TRANS_DUAL is added. This macro is
used if one instruction is realized with direct translation, and the
second one with a helper.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 66 +
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index fd25b7c..39fb26d 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -350,6 +350,28 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) 
\
 }  \
 }
 
+/*
+ * We use this macro if one instruction is realized with direct
+ * translation, and second one with helper.
+ */
+#define GEN_VXFORM_TRANS_DUAL(name0, flg0, flg2_0, name1, flg1, flg2_1)\
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{  \
+if ((Rc(ctx->opcode) == 0) &&  \
+((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0))) { \
+if (unlikely(!ctx->altivec_enabled)) { \
+gen_exception(ctx, POWERPC_EXCP_VPU);  \
+return;\
+}  \
+trans_##name0(ctx);\
+} else if ((Rc(ctx->opcode) == 1) &&   \
+((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1))) { \
+gen_##name1(ctx);  \
+} else {   \
+gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL);\
+}  \
+}
+
 /* Adds support to provide invalid mask */
 #define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0,\
 name1, flg1, flg2_1, inval1)\
@@ -431,20 +453,13 @@ GEN_VXFORM(vmrglb, 6, 4);
 GEN_VXFORM(vmrglh, 6, 5);
 GEN_VXFORM(vmrglw, 6, 6);
 
-static void gen_vmrgew(DisasContext *ctx)
+static void trans_vmrgew(DisasContext *ctx)
 {
-TCGv_i64 tmp;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-tmp = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(avr, VB, true);
 tcg_gen_shri_i64(tmp, avr, 32);
@@ -462,21 +477,14 @@ static void gen_vmrgew(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
-static void gen_vmrgow(DisasContext *ctx)
+static void trans_vmrgow(DisasContext *ctx)
 {
-TCGv_i64 t0, t1;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-t0 = tcg_temp_new_i64();
-t1 = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(t0, VB, true);
 get_avr64(t1, VA, true);
@@ -1052,14 +1060,14 @@ GEN_VXFORM_ENV(vminfp, 5, 17);
 GEN_VXFORM_HETRO(vextublx, 6, 24)
 GEN_VXFORM_HETRO(vextuhlx, 6, 25)
 GEN_VXFORM_HETRO(vextuwlx, 6, 26)
-GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+GEN_VXFORM_TRANS_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwlx, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_HETRO(vextubrx, 6, 28)
 GEN_VXFORM_HETRO(vextuhrx, 6, 29)
 GEN_VXFORM_HETRO(vextuwrx, 6, 30)
 GEN_VXFORM_TRANS(lvsl, 6, 31)
 GEN_VXFORM_TRANS(lvsr, 6, 32)
-GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \
+GEN_VXFORM_TRANS_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwrx, PPC_NONE, PPC2_ISA300)
 
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3) \
-- 
2.7.4




[Qemu-devel] [PATCH v4 11/13] tcg: Add opcodes for vector vmrgl instructions

2019-06-27 Thread Stefan Brankovic
Signed-off-by: Stefan Brankovic 
---
 accel/tcg/tcg-runtime-gvec.c | 42 ++
 accel/tcg/tcg-runtime.h  |  4 
 tcg/i386/tcg-target.h|  1 +
 tcg/tcg-op-gvec.c| 24 
 tcg/tcg-op-gvec.h|  2 ++
 tcg/tcg-op-vec.c |  5 +
 tcg/tcg-op.h |  1 +
 tcg/tcg-opc.h|  1 +
 tcg/tcg.c|  2 ++
 tcg/tcg.h|  1 +
 10 files changed, 83 insertions(+)

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index 28173ae..152f277 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -1500,3 +1500,45 @@ void HELPER(gvec_vmrgh32)(void *d, void *a, void *b, 
uint32_t desc)
 }
 clear_high(d, oprsz, desc);
 }
+
+void HELPER(gvec_vmrgl8)(void *d, void *a, void *b, uint32_t desc)
+{
+intptr_t oprsz = simd_oprsz(desc);
+intptr_t i;
+
+for (i = 0; i < (oprsz / 2); i += sizeof(uint8_t)) {
+uint8_t aa = *(uint8_t *)(a + i);
+uint8_t bb = *(uint8_t *)(b + i);
+*(uint8_t *)(d + 2 * i) = bb;
+*(uint8_t *)(d + 2 * i + sizeof(uint8_t)) = aa;
+}
+clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_vmrgl16)(void *d, void *a, void *b, uint32_t desc)
+{
+intptr_t oprsz = simd_oprsz(desc);
+intptr_t i;
+
+for (i = 0; i < (oprsz / 2); i += sizeof(uint16_t)) {
+uint16_t aa = *(uint16_t *)(a + i);
+uint16_t bb = *(uint16_t *)(b + i);
+*(uint16_t *)(d + 2 * i) = bb;
+*(uint16_t *)(d + 2 * i + sizeof(uint16_t)) = aa;
+}
+clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_vmrgl32)(void *d, void *a, void *b, uint32_t desc)
+{
+intptr_t oprsz = simd_oprsz(desc);
+intptr_t i;
+
+for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+uint32_t aa = *(uint32_t *)(a + i);
+uint32_t bb = *(uint32_t *)(b + i);
+*(uint32_t *)(d + 2 * i) = bb;
+*(uint32_t *)(d + 2 * i + sizeof(uint32_t)) = aa;
+}
+clear_high(d, oprsz, desc);
+}
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 089956f..fd0ba1e 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -309,3 +309,7 @@ DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, 
ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vmrgh8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vmrgh16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vmrgh32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_vmrgl8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vmrgl16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vmrgl32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index daae35f..e825324 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -193,6 +193,7 @@ extern bool have_avx2;
 #define TCG_TARGET_HAS_bitsel_vec   0
 #define TCG_TARGET_HAS_cmpsel_vec   -1
 #define TCG_TARGET_HAS_vmrgh_vec1
+#define TCG_TARGET_HAS_vmrgl_vec0
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 2560fb6..da1d272 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -2125,6 +2125,30 @@ void tcg_gen_gvec_vmrgh(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, [vece]);
 }
 
+static const TCGOpcode vecop_list_vmrgl[] = { INDEX_op_vmrgl_vec, 0 };
+
+void tcg_gen_gvec_vmrgl(unsigned vece, uint32_t dofs, uint32_t aofs,
+   uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+static const GVecGen3 g[3] = {
+{ .fniv = tcg_gen_vmrgl_vec,
+  .fno = gen_helper_gvec_vmrgl8,
+  .opt_opc = vecop_list_vmrgl,
+  .vece = MO_8 },
+{ .fniv = tcg_gen_vmrgl_vec,
+  .fno = gen_helper_gvec_vmrgl16,
+  .opt_opc = vecop_list_vmrgl,
+  .vece = MO_16 },
+{
+  .fniv = tcg_gen_vmrgl_vec,
+  .fno = gen_helper_gvec_vmrgl32,
+  .opt_opc = vecop_list_vmrgl,
+  .vece = MO_32 }
+};
+tcg_debug_assert(vece <= MO_64);
+tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, [vece]);
+}
+
 /* Perform a vector negation using normal negation and a mask.
Compare gen_subv_mask above.  */
 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index 8c04d71..a2eb45c 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -275,6 +275,8 @@ void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 /* Vector merge. */
 void tcg_gen_gvec_vmrgh(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_vmrgl(unsigned v

[Qemu-devel] [PATCH v4 09/13] tcg/i386: Implement vector vmrgh instructions

2019-06-27 Thread Stefan Brankovic
Signed-off-by: Stefan Brankovic 
---
 tcg/i386/tcg-target.h |  2 +-
 tcg/i386/tcg-target.inc.c | 19 +++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index e11b22d..daae35f 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -192,7 +192,7 @@ extern bool have_avx2;
 #define TCG_TARGET_HAS_minmax_vec   1
 #define TCG_TARGET_HAS_bitsel_vec   0
 #define TCG_TARGET_HAS_cmpsel_vec   -1
-#define TCG_TARGET_HAS_vmrgh_vec0
+#define TCG_TARGET_HAS_vmrgh_vec1
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 6ddeebf..31e1b2b 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -2823,6 +2823,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
 case INDEX_op_umax_vec:
 insn = umax_insn[vece];
 goto gen_simd;
+case INDEX_op_vmrgh_vec:
+insn = punpckh_insn[vece];
+goto gen_simd;
 case INDEX_op_shlv_vec:
 insn = shlv_insn[vece];
 goto gen_simd;
@@ -3223,6 +3226,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 case INDEX_op_umin_vec:
 case INDEX_op_smax_vec:
 case INDEX_op_umax_vec:
+case INDEX_op_vmrgh_vec:
 case INDEX_op_shlv_vec:
 case INDEX_op_shrv_vec:
 case INDEX_op_sarv_vec:
@@ -3321,6 +3325,8 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, 
unsigned vece)
 case INDEX_op_umax_vec:
 case INDEX_op_abs_vec:
 return vece <= MO_32;
+case INDEX_op_vmrgh_vec:
+return vece <= MO_32 ? -1 : 0;
 
 default:
 return 0;
@@ -3614,6 +3620,14 @@ static void expand_vec_cmpsel(TCGType type, unsigned 
vece, TCGv_vec v0,
 tcg_temp_free_vec(t);
 }
 
+static void expand_vec_vmrg(TCGOpcode opc, TCGType type, unsigned vece,
+  TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
+{
+vec_gen_3(opc, type, vece,
+  tcgv_vec_arg(v0), tcgv_vec_arg(v2),
+  tcgv_vec_arg(v1));
+}
+
 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
TCGArg a0, ...)
 {
@@ -3653,6 +3667,11 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, 
unsigned vece,
 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
 break;
 
+case INDEX_op_vmrgh_vec:
+v2 = temp_tcgv_vec(arg_temp(a2));
+expand_vec_vmrg(opc, type, vece, v0, v1, v2);
+break;
+
 default:
 break;
 }
-- 
2.7.4




[Qemu-devel] [PATCH v4 06/13] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-06-27 Thread Stefan Brankovic
Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in source register and places result in the appropriate halfword element of
destination register.

In each iteration of the outer for loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and then counting
for every halfword element is performed using tcg_gen_clzi_i64.
Since it counts leading zeros on 64-bit length, the ith halfword element has
to be moved to the highest 16 bits of tmp, or-ed with a mask (in order to get
all ones in the lowest 48 bits), then tcg_gen_clzi_i64 is performed and its
result is moved into the appropriate halfword element of the result. This is
done in the inner for loop. After the operation is finished, the result is
saved in the appropriate doubleword element of destination register vD. The
same sequence of steps is applied again for the lower doubleword element of vB.

Optimize Altivec instruction vclzb (Vector Count Leading Zeros Byte).
This instruction counts the number of leading zeros of each byte element
in source register and places result in the appropriate byte element of
destination register.

In each iteration of the outer for loop, the counting operation is done on one
doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and then counting
for every byte element is performed using tcg_gen_clzi_i64. Since it counts
leading zeros on 64-bit length, the ith byte element has to be moved to the
highest 8 bits of variable tmp, or-ed with a mask (in order to get all ones in
the lowest 56 bits), then tcg_gen_clzi_i64 is performed and its result is
moved into the appropriate byte element of the result. This is done in the
inner for loop. After the operation is finished, the result is saved in the
appropriate doubleword element of destination register vD. The same sequence
of steps is applied again for the lower doubleword element of vB.
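
A scalar sketch of the per-element trick described above, shown for the byte
case (a hypothetical helper relying on the GCC/Clang __builtin_clzll, not the
TCG code): placing the element in the top bits and forcing the remaining bits
to 1 makes a single 64-bit clz return the per-element count directly.

#include <stdint.h>

/* clz of a single byte via a 64-bit clz, as the translation does per lane. */
static unsigned clz_byte_via_clz64(uint8_t b)
{
    /* Move the byte to bits 63..56 and set the lower 56 bits to 1, so the
     * operand is never zero and the count can never exceed 8. */
    uint64_t tmp = ((uint64_t)b << 56) | 0x00ffffffffffffffULL;

    return __builtin_clzll(tmp);    /* 8 when b == 0, otherwise clz of b */
}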

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 122 +++-
 3 files changed, 120 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 4c5c359..ac1a5bd 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -304,8 +304,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index cd25b66..3edf334 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1821,15 +1821,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 39c7839..fd25b7c 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -741,6 +741,124 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in avr. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros on 64 bit lenght, we have to move
+ * ith byte element to highest 8 bits of tmp, or it with mask(so we get
+ * all ones in lowest 56 bits), then perform tcg_gen_clzi_i64 and move
+ * it's result in appropriate byte element of result.
+ */
+tcg_gen_shli_i64(tmp, avr, 56);
+tcg_gen_

[Qemu-devel] [PATCH v4 10/13] target/ppc: convert vmrgh instructions to vector operations

2019-06-27 Thread Stefan Brankovic
Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h | 3 ---
 target/ppc/int_helper.c | 2 +-
 target/ppc/translate/vmx-impl.inc.c | 6 +++---
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index ac1a5bd..9a7721f 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -164,9 +164,6 @@ DEF_HELPER_4(vcmpbfp_dot, void, env, avr, avr, avr)
 DEF_HELPER_3(vmrglb, void, avr, avr, avr)
 DEF_HELPER_3(vmrglh, void, avr, avr, avr)
 DEF_HELPER_3(vmrglw, void, avr, avr, avr)
-DEF_HELPER_3(vmrghb, void, avr, avr, avr)
-DEF_HELPER_3(vmrghh, void, avr, avr, avr)
-DEF_HELPER_3(vmrghw, void, avr, avr, avr)
 DEF_HELPER_3(vmulesb, void, avr, avr, avr)
 DEF_HELPER_3(vmulesh, void, avr, avr, avr)
 DEF_HELPER_3(vmulesw, void, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 3edf334..00e6e02 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -948,7 +948,7 @@ void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t 
*b, ppc_avr_t *c)
 
 #define VMRG(suffix, element, access)  \
 VMRG_DO(mrgl##suffix, element, access, half)   \
-VMRG_DO(mrgh##suffix, element, access, 0)
+
 VMRG(b, u8, VsrB)
 VMRG(h, u16, VsrH)
 VMRG(w, u32, VsrW)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 39fb26d..e02390f 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -446,9 +446,9 @@ GEN_VXFORM_DUAL(vavguw, PPC_ALTIVEC, PPC_NONE, \
 GEN_VXFORM(vavgsb, 1, 20);
 GEN_VXFORM(vavgsh, 1, 21);
 GEN_VXFORM(vavgsw, 1, 22);
-GEN_VXFORM(vmrghb, 6, 0);
-GEN_VXFORM(vmrghh, 6, 1);
-GEN_VXFORM(vmrghw, 6, 2);
+GEN_VXFORM_V(vmrghb, MO_8, tcg_gen_gvec_vmrgh, 6, 0);
+GEN_VXFORM_V(vmrghh, MO_16, tcg_gen_gvec_vmrgh, 6, 1);
+GEN_VXFORM_V(vmrghw, MO_32, tcg_gen_gvec_vmrgh, 6, 2);
 GEN_VXFORM(vmrglb, 6, 4);
 GEN_VXFORM(vmrglh, 6, 5);
 GEN_VXFORM(vmrglw, 6, 6);
-- 
2.7.4




[Qemu-devel] [PATCH v4 03/13] target/ppc: Optimize emulation of vgbbd instruction

2019-06-27 Thread Stefan Brankovic
Optimize altivec instruction vgbbd (Vector Gather Bits by Bytes by Doubleword).
All ith bits (i in range 1 to 8) of each byte of a doubleword element in the
source register are concatenated and placed into the ith byte of the
appropriate doubleword element in the destination register.

The following solution is done for both doubleword elements of the source
register in parallel, in order to reduce the number of instructions needed
(that's why arrays are used):
First, both doubleword elements of source register vB are placed in the
appropriate elements of array avr. Bits are gathered in 2x8 iterations (2 for
loops). In the first iteration bit 1 of byte 1, bit 2 of byte 2, ... bit 8 of
byte 8 are in their final spots, so avr[i], i={0,1} can be and-ed with
tcg_mask. For every following iteration, both the avr[i] and tcg_mask
variables have to be shifted right by 7 and 8 places, respectively, in order
to get bit 1 of byte 2, bit 2 of byte 3, ... bit 7 of byte 8 into their final
spots, so the shifted avr values (saved in tmp) can be and-ed with the new
value of tcg_mask, and so on. After the first 8 iterations (the first loop),
all the first bits are in their final places, all the second bits except the
second bit of the eighth byte are in their places, ..., and only one eighth
bit (the one from the eighth byte) is in its place. In the second loop we do
all operations symmetrically, in order to get the other half of the bits into
their final spots. Results for the first and second doubleword elements are
saved in result[0] and result[1] respectively. In the end those results are
saved in the appropriate doubleword elements of destination register vD.
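
A rough scalar model of the first of the two loops described above, for one
doubleword; the initial diagonal mask value 0x8040201008040201 and the
function name are assumptions derived from the description, not the TCG code
itself:

#include <stdint.h>

/* Gather the "high" diagonals of one doubleword: start with a mask that
 * selects bit 1 of byte 1, bit 2 of byte 2, ..., bit 8 of byte 8, then
 * repeatedly shift the value by 7 and the mask by 8 to pick up the next
 * diagonal of bits, exactly as the first loop does. */
static uint64_t gather_high_diagonals(uint64_t avr)
{
    uint64_t mask = 0x8040201008040201ULL;   /* initial diagonal */
    uint64_t result = avr & mask;
    uint64_t tmp = avr;
    int j;

    for (j = 1; j < 8; j++) {
        tmp >>= 7;
        mask >>= 8;
        result |= tmp & mask;
    }
    return result;   /* the second, symmetric loop handles the other half */
}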

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |   1 -
 target/ppc/int_helper.c | 276 
 target/ppc/translate/vmx-impl.inc.c |  77 +-
 3 files changed, 76 insertions(+), 278 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 33dad6a..cf1af51 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -320,7 +320,6 @@ DEF_HELPER_1(vclzlsbb, tl, avr)
 DEF_HELPER_1(vctzlsbb, tl, avr)
 DEF_HELPER_3(vbpermd, void, avr, avr, avr)
 DEF_HELPER_3(vbpermq, void, avr, avr, avr)
-DEF_HELPER_2(vgbbd, void, avr, avr)
 DEF_HELPER_3(vpmsumb, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumh, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumw, void, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index a23853e..87e3062 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1185,282 +1185,6 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #undef VBPERMQ_INDEX
 #undef VBPERMQ_DW
 
-static const uint64_t VGBBD_MASKS[256] = {
-0xull, /* 00 */
-0x0080ull, /* 01 */
-0x8000ull, /* 02 */
-0x8080ull, /* 03 */
-0x0080ull, /* 04 */
-0x00800080ull, /* 05 */
-0x00808000ull, /* 06 */
-0x00808080ull, /* 07 */
-0x8000ull, /* 08 */
-0x8080ull, /* 09 */
-0x80008000ull, /* 0A */
-0x80008080ull, /* 0B */
-0x8080ull, /* 0C */
-0x80800080ull, /* 0D */
-0x80808000ull, /* 0E */
-0x80808080ull, /* 0F */
-0x0080ull, /* 10 */
-0x00800080ull, /* 11 */
-0x00808000ull, /* 12 */
-0x00808080ull, /* 13 */
-0x00800080ull, /* 14 */
-0x008000800080ull, /* 15 */
-0x008000808000ull, /* 16 */
-0x008000808080ull, /* 17 */
-0x00808000ull, /* 18 */
-0x00808080ull, /* 19 */
-0x008080008000ull, /* 1A */
-0x008080008080ull, /* 1B */
-0x00808080ull, /* 1C */
-0x008080800080ull, /* 1D */
-0x008080808000ull, /* 1E */
-0x008080808080ull, /* 1F */
-0x8000ull, /* 20 */
-0x8080ull, /* 21 */
-0x80008000ull, /* 22 */
-0x80008080ull, /* 23 */
-0x8080ull, /* 24 */
-0x80800080ull, /* 25 */
-0x80808000ull, /* 26 */
-0x80808080ull, /* 27 */
-0x80008000ull, /* 28 */
-0x80008080ull, /* 29 */
-0x800080008000ull, /* 2A */
-0x800080008080ull, /* 2B */
-0x80008080ull, /* 2C */
-0x800080800080ull, /* 2D */
-0x800080808000ull, /* 2E */
-0x800080808080ull, /* 2F */
-0x8080ull, /* 30 */
-0x80800080ull, /* 31 */
-0x80808000ull, /* 32 */
-0x80808080ull, /* 33 */
-0x80800080ull, /* 34 */
-0x808000800080ull, /* 35 */
-0x808000808000ull, /* 36 */
-0x808000808080ull, /* 37 */
-0x80808000ull, /* 38 */
-0x80808080ull, /* 39 */
-0x808080008000ull, /* 3A */
-0x808080008080ull, /* 3B */
-0x80808080ull, /* 3C */
-0x808080800080ull, /* 3D */
-0x808080808000ull, /* 3E

[Qemu-devel] [PATCH v4 01/13] target/ppc: Optimize emulation of lvsl and lvsr instructions

2019-06-27 Thread Stefan Brankovic
Add a simple macro that calls the TCG implementation of the appropriate
instruction if Altivec support is active.

Optimization of altivec instruction lvsl (Load Vector for Shift Left).
Place bytes sh:sh+15 of value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F
in destination register. Sh is calculated by adding 2 source registers and
getting bits 60-63 of result.

First, bits [28-31] of EA are placed in variable sh. After that,
the bytes are created in the following way:
bytes sh:(sh+7) of X (from the description) by multiplying sh with
0x0101010101010101 and adding 0x0001020304050607 to the result. The value
obtained is placed in the higher doubleword element of vD.
bytes (sh+8):(sh+15) by adding 0x08090a0b0c0d0e0f to the result of the
previous multiplication. The value obtained is placed in the lower doubleword
element of vD.

Optimization of altivec instruction lvsr (Load Vector for Shift Right).
Place bytes 16-sh:31-sh of value 0x00 || 0x01 || 0x02 || ... || 0x1E ||
0x1F in destination register. Sh is calculated by adding 2 source
registers and getting bits 60-63 of result.

First, bits [28-31] of EA are placed in variable sh. After that,
the bytes are created in the following way:
bytes sh:(sh+7) of X (from the description) by multiplying sh with
0x0101010101010101 and subtracting the result from 0x1011121314151617. The
value obtained is placed in the higher doubleword element of vD.
bytes (sh+8):(sh+15) by subtracting the result of the previous multiplication
from 0x18191a1b1c1d1e1f. The value obtained is placed in the lower doubleword
element of vD.

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |  18 --
 target/ppc/translate/vmx-impl.inc.c | 121 ++--
 3 files changed, 89 insertions(+), 52 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 02b67a3..c82105e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -189,8 +189,6 @@ DEF_HELPER_2(vprtybw, void, avr, avr)
 DEF_HELPER_2(vprtybd, void, avr, avr)
 DEF_HELPER_2(vprtybq, void, avr, avr)
 DEF_HELPER_3(vsubcuw, void, avr, avr, avr)
-DEF_HELPER_2(lvsl, void, avr, tl)
-DEF_HELPER_2(lvsr, void, avr, tl)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8ce89f2..9505f4c 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -457,24 +457,6 @@ SATCVT(sd, uw, int64_t, uint32_t, 0, UINT32_MAX)
 #undef SATCVT
 #undef SATCVTU
 
-void helper_lvsl(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
-void helper_lvsr(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = 0x10 - (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
 void helper_mtvscr(CPUPPCState *env, uint32_t vscr)
 {
 env->vscr = vscr & ~(1u << VSCR_SAT);
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 663275b..a9fe3c7 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -142,38 +142,6 @@ GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
 GEN_VR_STVE(wx, 0x07, 0x06, 4);
 
-static void gen_lvsl(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsl(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
-static void gen_lvsr(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsr(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
 static void gen_mfvscr(DisasContext *ctx)
 {
 TCGv_i32 t;
@@ -316,6 +284,16 @@ static void glue(gen_, name)(DisasContext *ctx)
 \
 tcg_temp_free_ptr(rd);  \
 }
 
+#define GEN_VXFORM_TRANS(name, opc2, opc3)  \
+static void glue(gen_, name)(DisasContext *ctx) \
+{   \
+if (unlikely(!ctx->altivec_enabled)) {  \
+gen_exception(ctx, POWERPC_

[Qemu-devel] [PATCH v4 05/13] target/ppc: Optimize emulation of vclzw instruction

2019-06-27 Thread Stefan Brankovic
Optimize Altivec instruction vclzw (Vector Count Leading Zeros Word).
This instruction counts the number of leading zeros of each word element
in source register and places result in the appropriate word element of
destination register.

Counting is performed in four iterations of a for loop (one for each
word element of source register vB). Every iteration consists of loading the
appropriate word element from the source register, counting leading zeros
with tcg_gen_clzi_i32, and saving the result in the appropriate word element
of the destination register.

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 57a954c..4c5c359 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -306,7 +306,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
-DEF_HELPER_2(vclzw, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 210e8be..cd25b66 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1823,15 +1823,12 @@ VUPK(lsw, s64, s32, UPKLO)
 
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-#define clzw(v) clz32((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
-VGENERIC_DO(clzw, u32)
 
 #undef clzb
 #undef clzh
-#undef clzw
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 50d906b..39c7839 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -741,6 +741,32 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzw VRT,VRB - Vector Count Leading Zeros Word
+ *
+ * Counting the number of leading zero bits of each word element in source
+ * register and placing result in appropriate word element of destination
+ * register.
+ */
+static void trans_vclzw(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i32 tmp = tcg_temp_new_i32();
+int i;
+
+/* Perform count for every word element using tcg_gen_clzi_i32. */
+for (i = 0; i < 4; i++) {
+tcg_gen_ld_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VB].u64[0]) + i * 4);
+tcg_gen_clzi_i32(tmp, tmp, 32);
+tcg_gen_st_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VT].u64[0]) + i * 4);
+}
+
+tcg_temp_free_i32(tmp);
+}
+
+/*
  * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
  *
  * Counting the number of leading zero bits of each doubleword element in 
source
@@ -1281,7 +1307,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
-GEN_VXFORM_NOA(vclzw, 1, 30)
+GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
-- 
2.7.4




[Qemu-devel] [PATCH v4 13/13] target/ppc: convert vmrgl instructions to vector operations

2019-06-27 Thread Stefan Brankovic
Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h | 3 ---
 target/ppc/int_helper.c | 9 -
 target/ppc/translate/vmx-impl.inc.c | 6 +++---
 3 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 9a7721f..0f10657 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -161,9 +161,6 @@ DEF_HELPER_4(vcmpeqfp_dot, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgefp_dot, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgtfp_dot, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpbfp_dot, void, env, avr, avr, avr)
-DEF_HELPER_3(vmrglb, void, avr, avr, avr)
-DEF_HELPER_3(vmrglh, void, avr, avr, avr)
-DEF_HELPER_3(vmrglw, void, avr, avr, avr)
 DEF_HELPER_3(vmulesb, void, avr, avr, avr)
 DEF_HELPER_3(vmulesh, void, avr, avr, avr)
 DEF_HELPER_3(vmulesw, void, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 00e6e02..4b6e074 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -946,15 +946,6 @@ void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b, ppc_avr_t *c)
 *r = result; \
 }
 
-#define VMRG(suffix, element, access)  \
-VMRG_DO(mrgl##suffix, element, access, half)   \
-
-VMRG(b, u8, VsrB)
-VMRG(h, u16, VsrH)
-VMRG(w, u32, VsrW)
-#undef VMRG_DO
-#undef VMRG
-
 void helper_vmsummbm(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
  ppc_avr_t *b, ppc_avr_t *c)
 {
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index e02390f..12f41af 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -449,9 +449,9 @@ GEN_VXFORM(vavgsw, 1, 22);
 GEN_VXFORM_V(vmrghb, MO_8, tcg_gen_gvec_vmrgh, 6, 0);
 GEN_VXFORM_V(vmrghh, MO_16, tcg_gen_gvec_vmrgh, 6, 1);
 GEN_VXFORM_V(vmrghw, MO_32, tcg_gen_gvec_vmrgh, 6, 2);
-GEN_VXFORM(vmrglb, 6, 4);
-GEN_VXFORM(vmrglh, 6, 5);
-GEN_VXFORM(vmrglw, 6, 6);
+GEN_VXFORM_V(vmrglb, MO_8, tcg_gen_gvec_vmrgl, 6, 4);
+GEN_VXFORM_V(vmrglh, MO_16, tcg_gen_gvec_vmrgl, 6, 5);
+GEN_VXFORM_V(vmrglw, MO_32, tcg_gen_gvec_vmrgl, 6, 6);
 
 static void trans_vmrgew(DisasContext *ctx)
 {
-- 
2.7.4




[Qemu-devel] [PATCH v4 04/13] target/ppc: Optimize emulation of vclzd instruction

2019-06-27 Thread Stefan Brankovic
Optimize Altivec instruction vclzd (Vector Count Leading Zeros Doubleword).
This instruction counts the number of leading zeros of each doubleword element
in source register and places result in the appropriate doubleword element of
destination register.

Using tcg's count-leading-zeros instruction two times (once for each
doubleword element of source register vB) and placing the result in the
appropriate doubleword element of destination register vD.

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index cf1af51..57a954c 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -307,7 +307,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vclzw, void, avr, avr)
-DEF_HELPER_2(vclzd, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 87e3062..210e8be 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1824,17 +1824,14 @@ VUPK(lsw, s64, s32, UPKLO)
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
 #define clzw(v) clz32((v))
-#define clzd(v) clz64((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
 VGENERIC_DO(clzw, u32)
-VGENERIC_DO(clzd, u64)
 
 #undef clzb
 #undef clzh
 #undef clzw
-#undef clzd
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index d9b346b..50d906b 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -740,6 +740,32 @@ static void trans_vgbbd(DisasContext *ctx)
 tcg_temp_free_i64(avr[1]);
 }
 
+/*
+ * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
+ *
+ * Counting the number of leading zero bits of each doubleword element in 
source
+ * register and placing result in appropriate doubleword element of destination
+ * register.
+ */
+static void trans_vclzd(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+
+/* high doubleword */
+get_avr64(avr, VB, true);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, true);
+
+/* low doubleword */
+get_avr64(avr, VB, false);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, false);
+
+tcg_temp_free_i64(avr);
+}
+
 GEN_VXFORM(vmuloub, 4, 0);
 GEN_VXFORM(vmulouh, 4, 1);
 GEN_VXFORM(vmulouw, 4, 2);
@@ -1256,7 +1282,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_NOA(vclzw, 1, 30)
-GEN_VXFORM_NOA(vclzd, 1, 31)
+GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
 GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16)
-- 
2.7.4




[Qemu-devel] [PATCH v4 00/13] target/ppc, tcg, tcg/i386: Optimize emulation of some Altivec instructions

2019-06-27 Thread Stefan Brankovic
Optimize emulation of fifteen Altivec instructions: lvsl, lvsr, vsl, vsr,
vgbbd, vclzb, vclzh, vclzw, vclzd, vmrghb, vmrghh, vmrghw, vmrglb, vmrglh and
vmrglw.

This series builds up on and complements the recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising a
TCG translation implementation for selected instructions rather than using
helpers. The selected instructions are most of the time idiosyncratic to the
ppc platform, so a relatively complex TCG translation (a direct mapping to a
host instruction is not possible in these cases) seems to be the best option,
and that approach is presented in this series.

This series also adds opcodes for a vector implementation of the instructions
vmrgh(b|h|w) and vmrgl(b|h|w) in tcg, along with a vector implementation of
those instructions for i386 hosts in the tcg backend.

The performance improvements are significant in all cases.

V4:

Addressed Richard Henderson's suggestions.
Removed vpkpx's optimization for further investigation on graphical distortions
it caused on OSX 10.2-4 guests.
Added opcodes for vector vmrgh(b|h|w) and vmrgl(b|h|w) in tcg.
Implemented vector vmrgh and vmrgl instructions for i386.
Converted vmrgh and vmrgl instructions to vector operations.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (13):
  target/ppc: Optimize emulation of lvsl and lvsr instructions
  target/ppc: Optimize emulation of vsl and vsr instructions
  target/ppc: Optimize emulation of vgbbd instruction
  target/ppc: Optimize emulation of vclzd instruction
  target/ppc: Optimize emulation of vclzw instruction
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Refactor emulation of vmrgew and vmrgow instructions
  tcg: Add opcodes for vector vmrgh instructions
  tcg/i386: Implement vector vmrgh instructions
  target/ppc: convert vmrgh instructions to vector operations
  tcg: Add opcodes for vector vmrgl instructions
  tcg/i386: Implement vector vmrgl instructions
  target/ppc: convert vmrgl instructions to vector operations

 accel/tcg/tcg-runtime-gvec.c|  84 ++
 accel/tcg/tcg-runtime.h |   8 +
 target/ppc/helper.h |  15 -
 target/ppc/int_helper.c | 353 ---
 target/ppc/translate/vmx-impl.inc.c | 555 +++-
 tcg/i386/tcg-target.h   |   2 +
 tcg/i386/tcg-target.inc.c   |  29 ++
 tcg/tcg-op-gvec.c   |  47 +++
 tcg/tcg-op-gvec.h   |   5 +
 tcg/tcg-op-vec.c|  10 +
 tcg/tcg-op.h|   3 +
 tcg/tcg-opc.h   |   3 +
 tcg/tcg.c   |   4 +
 tcg/tcg.h   |   2 +
 14 files changed, 677 insertions(+), 443 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v4 02/13] target/ppc: Optimize emulation of vsl and vsr instructions

2019-06-27 Thread Stefan Brankovic
Optimization of Altivec instructions vsl and vsr (Vector Shift Left/Right).
Perform a shift operation (left and right respectively) on the 128-bit value of
register vA by the value specified in bits 125-127 of register vB. The lowest 3
bits in each byte element of register vB must be identical or the result is
undefined.

For the vsl instruction, the first step is that bits 125-127 of register vB
are saved in variable sh. Then, the highest sh bits of the lower
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is performed on
the lower doubleword element of register vA, which is the next
step. After shifting the lower doubleword element, the shift operation
is performed on the higher doubleword element of vA, with replacement of
the lowest sh bits (that are now 0) with the bits saved in shifted.

For the vsr instruction, first, bits 125-127 of register vB have
to be saved in variable sh. Then, the lowest sh bits of the higher
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is
performed on the higher doubleword element of register vA, which is
the next step. After shifting the higher doubleword element, the shift
operation is performed on the lower doubleword element of vA, with replacement
of the highest sh bits (that are now 0) with the bits saved in shifted.
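
Roughly, the vsl case described above can be sketched in TCG as follows (an
illustrative sketch, not the literal patch body; the right shift by (64 - sh)
is split in two so it stays well defined when sh is 0, and vsr mirrors the same
steps with the shift directions swapped):

static void trans_vsl(DisasContext *ctx)
{
    int VT = rD(ctx->opcode);
    int VA = rA(ctx->opcode);
    int VB = rB(ctx->opcode);
    TCGv_i64 avr = tcg_temp_new_i64();
    TCGv_i64 sh = tcg_temp_new_i64();
    TCGv_i64 shifted = tcg_temp_new_i64();
    TCGv_i64 tmp = tcg_temp_new_i64();

    /* sh = bits 125-127 of vB, i.e. the low three bits of its low doubleword. */
    get_avr64(avr, VB, false);
    tcg_gen_andi_i64(sh, avr, 0x07ULL);

    /*
     * Save the highest sh bits of the lower doubleword of vA in 'shifted',
     * then shift the lower doubleword left by sh.
     */
    get_avr64(avr, VA, false);
    tcg_gen_subfi_i64(tmp, 32, sh);
    tcg_gen_shri_i64(shifted, avr, 32);
    tcg_gen_shr_i64(shifted, shifted, tmp);
    tcg_gen_shl_i64(avr, avr, sh);
    set_avr64(VT, avr, false);

    /* Shift the higher doubleword left and replace its lowest sh bits. */
    get_avr64(avr, VA, true);
    tcg_gen_shl_i64(avr, avr, sh);
    tcg_gen_or_i64(avr, avr, shifted);
    set_avr64(VT, avr, true);

    tcg_temp_free_i64(avr);
    tcg_temp_free_i64(sh);
    tcg_temp_free_i64(shifted);
    tcg_temp_free_i64(tmp);
}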

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  2 -
 target/ppc/int_helper.c | 35 -
 target/ppc/translate/vmx-impl.inc.c | 99 -
 3 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index c82105e..33dad6a 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -213,8 +213,6 @@ DEF_HELPER_3(vrlb, void, avr, avr, avr)
 DEF_HELPER_3(vrlh, void, avr, avr, avr)
 DEF_HELPER_3(vrlw, void, avr, avr, avr)
 DEF_HELPER_3(vrld, void, avr, avr, avr)
-DEF_HELPER_3(vsl, void, avr, avr, avr)
-DEF_HELPER_3(vsr, void, avr, avr, avr)
 DEF_HELPER_4(vsldoi, void, avr, avr, avr, i32)
 DEF_HELPER_3(vextractub, void, avr, avr, i32)
 DEF_HELPER_3(vextractuh, void, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 9505f4c..a23853e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1738,41 +1738,6 @@ VEXTU_X_DO(vextuhrx, 16, 0)
 VEXTU_X_DO(vextuwrx, 32, 0)
 #undef VEXTU_X_DO
 
-/*
- * The specification says that the results are undefined if all of the
- * shift counts are not identical.  We check to make sure that they
- * are to conform to what real hardware appears to do.
- */
-#define VSHIFT(suffix, leftp)   \
-void helper_vs##suffix(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)\
-{   \
-int shift = b->VsrB(15) & 0x7;  \
-int doit = 1;   \
-int i;  \
-\
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {   \
-doit = doit && ((b->u8[i] & 0x7) == shift); \
-}   \
-if (doit) { \
-if (shift == 0) {   \
-*r = *a;\
-} else if (leftp) { \
-uint64_t carry = a->VsrD(1) >> (64 - shift);\
-\
-r->VsrD(0) = (a->VsrD(0) << shift) | carry; \
-r->VsrD(1) = a->VsrD(1) << shift;   \
-} else {\
-uint64_t carry = a->VsrD(0) << (64 - shift);\
-\
-r->VsrD(1) = (a->VsrD(1) >> shift) | carry; \
-r->VsrD(0) = a->VsrD(0) >> shift;   \
-}   \
-}   \
-}
-VSHIFT(l, 1)
-VSHIFT(r, 0)
-#undef VSHIFT
-
 void helper_vslv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int i;
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index a9fe3c7..62108ca 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -570,6 +570,101 @@ 

Re: [Qemu-devel] [PATCH v3 0/8] target/ppc: Optimize emulation of some Altivec

2019-06-24 Thread Stefan Brankovic


Hi Howard,

Would you be so kind as to send me the exact qemu command line and the
applicable files (OS images or similar) that you used to bring this exact
situation up? Thanks in advance.

Kind Regards,
Stefan

 Original Message 
Subject: Re: [PATCH v3 0/8] target/ppc: Optimize emulation of some Altivec
Date: Monday, June 24, 2019 13:20 CEST
From: Howard Spoelstra 
To: Stefan Brankovic 
CC: qemu-devel qemu-devel 
References: <1561371065-3637-1-git-send-email-stefan.branko...@rt-rk.com> 
<43c6-5d10a600-15-34dab4c0@176981179>


   On Mon, Jun 24, 2019 at 12:28 PM Stefan Brankovic 
 wrote:

Hi Howard,

Thanks for letting me know about compilation errors in my patch, I really 
appreciate it. Those should be fixed  in the latest version of this patch (v3), 
so please take a look and let me know if there is anything else that should be 
changed.

Kind Regards,
Stefan

 Original Message 
Subject: [PATCH v3 0/8] target/ppc: Optimize emulation of some Altivec
Date: Monday, June 24, 2019 12:10 CEST
From: Stefan Brankovic 
To: stefan.branko...@rt-rk.com

Hi Stefan, V3 applies and builds cleanly on top
of current master. The resulting qemu-system-ppc shows graphical distortions
with OSX 10.2-4 guests. Mac OS 9.x guests seem OK. Please see screen dump.
Best, Howard
 


Re: [Qemu-devel] [PATCH v3 0/8] target/ppc: Optimize emulation of some Altivec

2019-06-24 Thread Stefan Brankovic


Hi Howard,

Thanks for letting me know about compilation errors in my patch, I really 
appreciate it. Those should be fixed  in the latest version of this patch (v3), 
so please take a look and let me know if there is anything else that should be 
changed.

Kind Regards,
Stefan

 Original Message 
Subject: [PATCH v3 0/8] target/ppc: Optimize emulation of some Altivec
Date: Monday, June 24, 2019 12:10 CEST
From: Stefan Brankovic 
To: stefan.branko...@rt-rk.com


 Optimize emulation of ten Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw and vclzd.

This series builds on and complements recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising TCG
translation implementation for selected instructions rather than using helpers.
The selected instructions are most of the time idiosyncratic to ppc platform,
so relatively complex TCG translation (without direct mapping to host
instruction that is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements are
significant in all cases.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (8):
target/ppc: Optimize emulation of lvsl and lvsr instructions
target/ppc: Optimize emulation of vsl and vsr instructions
target/ppc: Optimize emulation of vpkpx instruction
target/ppc: Optimize emulation of vgbbd instruction
target/ppc: Optimize emulation of vclzd instruction
target/ppc: Optimize emulation of vclzw instruction
target/ppc: Optimize emulation of vclzh and vclzb instructions
target/ppc: Refactor emulation of vmrgew and vmrgow instructions

target/ppc/helper.h | 10 -
target/ppc/int_helper.c | 365 
target/ppc/translate/vmx-impl.inc.c | 642 
3 files changed, 573 insertions(+), 444 deletions(-)

--
2.7.4
 
 


[Qemu-devel] [PATCH v3 5/8] target/ppc: Optimize emulation of vclzd instruction

2019-06-21 Thread Stefan Brankovic
Optimize Altivec instruction vclzd (Vector Count Leading Zeros Doubleword).
This instruction counts the number of leading zeros of each doubleword element
in source register and places result in the appropriate doubleword element of
destination register.

The TCG count-leading-zeros operation is used twice (once for each
doubleword element of source register vB), and the result is placed in the
appropriate doubleword element of destination register vD.

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 0aa1e05..3b92e3b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -306,7 +306,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vclzw, void, avr, avr)
-DEF_HELPER_2(vclzd, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 1e32549..3134798 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1803,17 +1803,14 @@ VUPK(lsw, s64, s32, UPKLO)
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
 #define clzw(v) clz32((v))
-#define clzd(v) clz64((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
 VGENERIC_DO(clzw, u32)
-VGENERIC_DO(clzd, u64)
 
 #undef clzb
 #undef clzh
 #undef clzw
-#undef clzd
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 00896ec..1953c65 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -839,6 +839,32 @@ static void trans_vgbbd(DisasContext *ctx)
 tcg_temp_free_i64(avr[1]);
 }
 
+/*
+ * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
+ *
+ * Counting the number of leading zero bits of each doubleword element in 
source
+ * register and placing result in appropriate doubleword element of destination
+ * register.
+ */
+static void trans_vclzd(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+
+/* high doubleword */
+get_avr64(avr, VB, true);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, true);
+
+/* low doubleword */
+get_avr64(avr, VB, false);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, false);
+
+tcg_temp_free_i64(avr);
+}
+
 GEN_VXFORM(vmuloub, 4, 0);
 GEN_VXFORM(vmulouh, 4, 1);
 GEN_VXFORM(vmulouw, 4, 2);
@@ -1355,7 +1381,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_NOA(vclzw, 1, 30)
-GEN_VXFORM_NOA(vclzd, 1, 31)
+GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
 GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16)
-- 
2.7.4




[Qemu-devel] [PATCH v3 7/8] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-06-21 Thread Stefan Brankovic
Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in source register and places result in the appropriate halfword element of
destination register.

In each iteration of the outer for loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and counting
for every halfword element is then performed using tcg_gen_clzi_i64.
Since that counts leading zeros over a 64-bit length, the ith halfword element
has to be moved to the highest 16 bits of tmp and or-ed with a mask (in order
to get all ones in the lowest 48 bits); tcg_gen_clzi_i64 is then performed and
its result moved into the appropriate halfword element of the result. This is
done in the inner for loop. After the operation is finished, the result is
saved in the appropriate doubleword element of destination register vD. The
same sequence of steps is applied again for the lower doubleword element of vB.

Optimize Altivec instruction vclzb (Vector Count Leading Zeros Byte).
This instruction counts the number of leading zeros of each byte element
in source register and places result in the appropriate byte element of
destination register.

In each iteration of the outer for loop, the counting operation is done on one
doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and counting
for every byte element is then performed using tcg_gen_clzi_i64. Since that
counts leading zeros over a 64-bit length, the ith byte element has to be moved
to the highest 8 bits of variable tmp and or-ed with a mask (in order to get
all ones in the lowest 56 bits); tcg_gen_clzi_i64 is then performed and its
result moved into the appropriate byte element of the result. This is done in
the inner for loop. After the operation is finished, the result is saved in the
appropriate doubleword element of destination register vD. The same sequence of
steps is applied again for the lower doubleword element of vB.
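
As a condensed sketch, the per-byte counting loop for one doubleword of vclzb
looks roughly like the following (illustrative only; the mask constant and the
use of tcg_gen_deposit_i64 to place each count are assumptions made for
brevity, not the literal patch body, and the temporaries are freed as usual):

    int VT = rD(ctx->opcode);
    int VB = rB(ctx->opcode);
    TCGv_i64 avr = tcg_temp_new_i64();
    TCGv_i64 tmp = tcg_temp_new_i64();
    TCGv_i64 result = tcg_temp_new_i64();
    TCGv_i64 mask = tcg_const_i64(0x00ffffffffffffffULL);
    int i;

    get_avr64(avr, VB, true);                 /* high doubleword of vB */
    tcg_gen_movi_i64(result, 0);
    for (i = 0; i < 8; i++) {
        /* Move byte i to bits 63:56 of tmp and force ones below it. */
        tcg_gen_shli_i64(tmp, avr, 56 - i * 8);
        tcg_gen_or_i64(tmp, tmp, mask);
        /* Leading zeros of that byte: a value in the range 0..8. */
        tcg_gen_clzi_i64(tmp, tmp, 64);
        /* Place the count in byte i of the result. */
        tcg_gen_deposit_i64(result, result, tmp, i * 8, 8);
    }
    set_avr64(VT, result, true);              /* then repeat for the low half */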

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 122 +++-
 3 files changed, 120 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 595241c..17b4b06 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -303,8 +303,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 82cb12e..264b5e7 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1800,15 +1800,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 9ed2fae..518d9de 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -840,6 +840,124 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in avr. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros on 64 bit lenght, we have to move
+ * ith byte element to highest 8 bits of tmp, or it with mask(so we get
+ * all ones in lowest 56 bits), then perform tcg_gen_clzi_i64 and move
+ * it's result in appropriate byte element of result.
+ */
+tcg_gen_shli_i64(tmp, avr, 56);
+tcg_gen_

[Qemu-devel] [PATCH v3 8/8] target/ppc: Refactor emulation of vmrgew and vmrgow instructions

2019-06-21 Thread Stefan Brankovic
Since I found these two instructions implemented with tcg, I refactored
them so they are consistent with the other similar implementations that
I introduced in this series.

Also, a new dual macro GEN_VXFORM_TRANS_DUAL is added. This macro is
used if one instruction is realized with direct translation, and the second
one with a helper.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 66 +
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 518d9de..1fff98d 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -350,6 +350,28 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) 
\
 }  \
 }
 
+/*
+ * We use this macro if one instruction is realized with direct
+ * translation, and second one with helper.
+ */
+#define GEN_VXFORM_TRANS_DUAL(name0, flg0, flg2_0, name1, flg1, flg2_1)\
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{  \
+if ((Rc(ctx->opcode) == 0) &&  \
+((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0))) { \
+if (unlikely(!ctx->altivec_enabled)) { \
+gen_exception(ctx, POWERPC_EXCP_VPU);  \
+return;\
+}  \
+trans_##name0(ctx);\
+} else if ((Rc(ctx->opcode) == 1) &&   \
+((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1))) { \
+gen_##name1(ctx);  \
+} else {   \
+gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL);\
+}  \
+}
+
 /* Adds support to provide invalid mask */
 #define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0,\
 name1, flg1, flg2_1, inval1)\
@@ -431,20 +453,13 @@ GEN_VXFORM(vmrglb, 6, 4);
 GEN_VXFORM(vmrglh, 6, 5);
 GEN_VXFORM(vmrglw, 6, 6);
 
-static void gen_vmrgew(DisasContext *ctx)
+static void trans_vmrgew(DisasContext *ctx)
 {
-TCGv_i64 tmp;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-tmp = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(avr, VB, true);
 tcg_gen_shri_i64(tmp, avr, 32);
@@ -462,21 +477,14 @@ static void gen_vmrgew(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
-static void gen_vmrgow(DisasContext *ctx)
+static void trans_vmrgow(DisasContext *ctx)
 {
-TCGv_i64 t0, t1;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-t0 = tcg_temp_new_i64();
-t1 = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(t0, VB, true);
 get_avr64(t1, VA, true);
@@ -1151,14 +1159,14 @@ GEN_VXFORM_ENV(vminfp, 5, 17);
 GEN_VXFORM_HETRO(vextublx, 6, 24)
 GEN_VXFORM_HETRO(vextuhlx, 6, 25)
 GEN_VXFORM_HETRO(vextuwlx, 6, 26)
-GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+GEN_VXFORM_TRANS_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwlx, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_HETRO(vextubrx, 6, 28)
 GEN_VXFORM_HETRO(vextuhrx, 6, 29)
 GEN_VXFORM_HETRO(vextuwrx, 6, 30)
 GEN_VXFORM_TRANS(lvsl, 6, 31)
 GEN_VXFORM_TRANS(lvsr, 6, 32)
-GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \
+GEN_VXFORM_TRANS_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwrx, PPC_NONE, PPC2_ISA300)
 
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3) \
-- 
2.7.4




[Qemu-devel] [PATCH v3 4/8] target/ppc: Optimize emulation of vgbbd instruction

2019-06-21 Thread Stefan Brankovic
Optimize Altivec instruction vgbbd (Vector Gather Bits by Bytes by Doubleword).
All ith bits (i in range 1 to 8) of each byte of a doubleword element in the
source register are concatenated and placed into the ith byte of the
appropriate doubleword element in the destination register.

The following solution is done for both doubleword elements of the source
register in parallel, in order to reduce the number of instructions needed
(that's why arrays are used):
First, both doubleword elements of source register vB are placed in the
appropriate elements of array avr. Bits are gathered in 2x8 iterations (2 for
loops). In the first iteration, bit 1 of byte 1, bit 2 of byte 2, ... bit 8 of
byte 8 are in their final spots, so avr[i], i={0,1}, can be and-ed with
tcg_mask. For every following iteration, both avr[i] and the tcg_mask variable
have to be shifted right by 7 and 8 places, respectively, in order to get
bit 1 of byte 2, bit 2 of byte 3, ... bit 7 of byte 8 into their final spots,
so the shifted avr values (saved in tmp) can be and-ed with the new value of
tcg_mask. After the first 8 iterations (the first loop), all the first bits are
in their final places, all the second bits except the second bit from the
eighth byte are in their places, and so on; only one eighth bit, the one from
the eighth byte, is in its place. In the second loop we do all operations
symmetrically, in order to get the other half of the bits into their final
spots. Results for the first and second doubleword elements are saved in
result[0] and result[1] respectively. In the end those results are saved in the
appropriate doubleword element of destination register vD.
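
The gather loops described above translate into TCG roughly as follows (a
sketch reconstructed from the description; the diagonal mask constant and the
exact temporary handling are assumptions rather than the literal patch body):

static void trans_vgbbd(DisasContext *ctx)
{
    int VT = rD(ctx->opcode);
    int VB = rB(ctx->opcode);
    uint64_t mask = 0x8040201008040201ULL;   /* bit i of byte i, i = 0..7 */
    TCGv_i64 tmp = tcg_temp_new_i64();
    TCGv_i64 tcg_mask = tcg_temp_new_i64();
    TCGv_i64 avr[2] = { tcg_temp_new_i64(), tcg_temp_new_i64() };
    TCGv_i64 result[2] = { tcg_temp_new_i64(), tcg_temp_new_i64() };
    int i, j;

    /* Bits that are already in their final spots. */
    tcg_gen_movi_i64(tcg_mask, mask);
    for (j = 0; j < 2; j++) {
        get_avr64(avr[j], VB, j == 0);
        tcg_gen_and_i64(result[j], avr[j], tcg_mask);
    }
    /* Bits that have to travel right: shift avr by 7, 14, ... places. */
    for (i = 1; i < 8; i++) {
        tcg_gen_movi_i64(tcg_mask, mask >> (i * 8));
        for (j = 0; j < 2; j++) {
            tcg_gen_shri_i64(tmp, avr[j], i * 7);
            tcg_gen_and_i64(tmp, tmp, tcg_mask);
            tcg_gen_or_i64(result[j], result[j], tmp);
        }
    }
    /* Bits that have to travel left: the symmetric case. */
    for (i = 1; i < 8; i++) {
        tcg_gen_movi_i64(tcg_mask, mask << (i * 8));
        for (j = 0; j < 2; j++) {
            tcg_gen_shli_i64(tmp, avr[j], i * 7);
            tcg_gen_and_i64(tmp, tmp, tcg_mask);
            tcg_gen_or_i64(result[j], result[j], tmp);
        }
    }
    for (j = 0; j < 2; j++) {
        set_avr64(VT, result[j], j == 0);
        tcg_temp_free_i64(avr[j]);
        tcg_temp_free_i64(result[j]);
    }
    tcg_temp_free_i64(tmp);
    tcg_temp_free_i64(tcg_mask);
}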

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   1 -
 target/ppc/int_helper.c | 276 
 target/ppc/translate/vmx-impl.inc.c |  77 +-
 3 files changed, 76 insertions(+), 278 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 7a3d68d..0aa1e05 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -319,7 +319,6 @@ DEF_HELPER_1(vclzlsbb, tl, avr)
 DEF_HELPER_1(vctzlsbb, tl, avr)
 DEF_HELPER_3(vbpermd, void, avr, avr, avr)
 DEF_HELPER_3(vbpermq, void, avr, avr, avr)
-DEF_HELPER_2(vgbbd, void, avr, avr)
 DEF_HELPER_3(vpmsumb, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumh, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumw, void, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f397380..1e32549 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1185,282 +1185,6 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #undef VBPERMQ_INDEX
 #undef VBPERMQ_DW
 
-static const uint64_t VGBBD_MASKS[256] = {
-0xull, /* 00 */
-0x0080ull, /* 01 */
-0x8000ull, /* 02 */
-0x8080ull, /* 03 */
-0x0080ull, /* 04 */
-0x00800080ull, /* 05 */
-0x00808000ull, /* 06 */
-0x00808080ull, /* 07 */
-0x8000ull, /* 08 */
-0x8080ull, /* 09 */
-0x80008000ull, /* 0A */
-0x80008080ull, /* 0B */
-0x8080ull, /* 0C */
-0x80800080ull, /* 0D */
-0x80808000ull, /* 0E */
-0x80808080ull, /* 0F */
-0x0080ull, /* 10 */
-0x00800080ull, /* 11 */
-0x00808000ull, /* 12 */
-0x00808080ull, /* 13 */
-0x00800080ull, /* 14 */
-0x008000800080ull, /* 15 */
-0x008000808000ull, /* 16 */
-0x008000808080ull, /* 17 */
-0x00808000ull, /* 18 */
-0x00808080ull, /* 19 */
-0x008080008000ull, /* 1A */
-0x008080008080ull, /* 1B */
-0x00808080ull, /* 1C */
-0x008080800080ull, /* 1D */
-0x008080808000ull, /* 1E */
-0x008080808080ull, /* 1F */
-0x8000ull, /* 20 */
-0x8080ull, /* 21 */
-0x80008000ull, /* 22 */
-0x80008080ull, /* 23 */
-0x8080ull, /* 24 */
-0x80800080ull, /* 25 */
-0x80808000ull, /* 26 */
-0x80808080ull, /* 27 */
-0x80008000ull, /* 28 */
-0x80008080ull, /* 29 */
-0x800080008000ull, /* 2A */
-0x800080008080ull, /* 2B */
-0x80008080ull, /* 2C */
-0x800080800080ull, /* 2D */
-0x800080808000ull, /* 2E */
-0x800080808080ull, /* 2F */
-0x8080ull, /* 30 */
-0x80800080ull, /* 31 */
-0x80808000ull, /* 32 */
-0x80808080ull, /* 33 */
-0x80800080ull, /* 34 */
-0x808000800080ull, /* 35 */
-0x808000808000ull, /* 36 */
-0x808000808080ull, /* 37 */
-0x80808000ull, /* 38 */
-0x80808080ull, /* 39 */
-0x808080008000ull, /* 3A */
-0x808080008080ull, /* 3B */
-0x80808080ull, /* 3C */
-0x808080800080ull, /* 3D */
-0x808080808000ull, /* 3E */
-0x808080808080ull, /* 3F

[Qemu-devel] [PATCH v3 0/8] target/ppc: Optimize emulation of some Altivec

2019-06-21 Thread Stefan Brankovic
Optimize emulation of ten Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw and vclzd.

This series builds on and complements recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising TCG
translation implementation for selected instructions rather than using helpers.
The selected instructions are most of the time idiosyncratic to ppc platform,
so relatively complex TCG translation (without direct mapping to host
instruction that is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements are
significant in all cases.

V3:

Fixed problem during build.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (8):
  target/ppc: Optimize emulation of lvsl and lvsr instructions
  target/ppc: Optimize emulation of vsl and vsr instructions
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vgbbd instruction
  target/ppc: Optimize emulation of vclzd instruction
  target/ppc: Optimize emulation of vclzw instruction
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Refactor emulation of vmrgew and vmrgow instructions

 target/ppc/helper.h |  10 -
 target/ppc/int_helper.c | 365 
 target/ppc/translate/vmx-impl.inc.c | 642 
 3 files changed, 573 insertions(+), 444 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v3 1/8] target/ppc: Optimize emulation of lvsl and lvsr instructions

2019-06-21 Thread Stefan Brankovic
Add a simple macro that calls the TCG implementation of the appropriate
instruction if Altivec support is active.

Optimization of Altivec instruction lvsl (Load Vector for Shift Left).
Place bytes sh:sh+15 of the value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F
in the destination register. sh is calculated by adding the 2 source registers
and taking bits 60-63 of the result.

First, the low four bits of EA (bits 60-63) are placed in variable sh. After
that, the bytes are created in the following way:
bytes sh:(sh+7) of X (from the description) by multiplying sh by
0x0101010101010101 and then adding 0x0001020304050607 to the result. The value
obtained is placed in the higher doubleword element of vD.
Bytes (sh+8):(sh+15) by adding 0x08090a0b0c0d0e0f to the result of the previous
multiplication. The value obtained is placed in the lower doubleword element
of vD.

Optimization of Altivec instruction lvsr (Load Vector for Shift Right).
Place bytes 16-sh:31-sh of the value 0x00 || 0x01 || 0x02 || ... || 0x1E ||
0x1F in the destination register. sh is calculated by adding the 2 source
registers and taking bits 60-63 of the result.

First, the low four bits of EA (bits 60-63) are placed in variable sh. After
that, the bytes are created in the following way:
bytes sh:(sh+7) of X (from the description) by multiplying sh by
0x0101010101010101 and then subtracting the result from 0x1011121314151617.
The value obtained is placed in the higher doubleword element of vD.
Bytes (sh+8):(sh+15) by subtracting the result of the previous multiplication
from 0x18191a1b1c1d1e1f. The value obtained is placed in the lower doubleword
element of vD.
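
A sketch of the lvsl case following the arithmetic above (illustrative; lvsr
is the same except that the multiplication result is subtracted from
0x1011121314151617 and 0x18191a1b1c1d1e1f instead of being added):

static void trans_lvsl(DisasContext *ctx)
{
    int VT = rD(ctx->opcode);
    TCGv_i64 result = tcg_temp_new_i64();
    TCGv_i64 sh = tcg_temp_new_i64();
    TCGv EA = tcg_temp_new();

    /* Get sh, the low four bits of the effective address. */
    gen_addr_reg_index(ctx, EA);
    tcg_gen_extu_tl_i64(sh, EA);
    tcg_gen_andi_i64(sh, sh, 0xfULL);

    /* Bytes sh:(sh+7) of X go into the higher doubleword of vD. */
    tcg_gen_muli_i64(sh, sh, 0x0101010101010101ULL);
    tcg_gen_addi_i64(result, sh, 0x0001020304050607ULL);
    set_avr64(VT, result, true);

    /* Bytes (sh+8):(sh+15) of X go into the lower doubleword of vD. */
    tcg_gen_addi_i64(result, sh, 0x08090a0b0c0d0e0fULL);
    set_avr64(VT, result, false);

    tcg_temp_free_i64(result);
    tcg_temp_free_i64(sh);
    tcg_temp_free(EA);
}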

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |  18 -
 target/ppc/translate/vmx-impl.inc.c | 129 +++-
 3 files changed, 97 insertions(+), 52 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 02b67a3..c82105e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -189,8 +189,6 @@ DEF_HELPER_2(vprtybw, void, avr, avr)
 DEF_HELPER_2(vprtybd, void, avr, avr)
 DEF_HELPER_2(vprtybq, void, avr, avr)
 DEF_HELPER_3(vsubcuw, void, avr, avr, avr)
-DEF_HELPER_2(lvsl, void, avr, tl)
-DEF_HELPER_2(lvsr, void, avr, tl)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8ce89f2..9505f4c 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -457,24 +457,6 @@ SATCVT(sd, uw, int64_t, uint32_t, 0, UINT32_MAX)
 #undef SATCVT
 #undef SATCVTU
 
-void helper_lvsl(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
-void helper_lvsr(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = 0x10 - (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
 void helper_mtvscr(CPUPPCState *env, uint32_t vscr)
 {
 env->vscr = vscr & ~(1u << VSCR_SAT);
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 663275b..eba6355 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -142,38 +142,6 @@ GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
 GEN_VR_STVE(wx, 0x07, 0x06, 4);
 
-static void gen_lvsl(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsl(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
-static void gen_lvsr(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsr(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
 static void gen_mfvscr(DisasContext *ctx)
 {
 TCGv_i32 t;
@@ -316,6 +284,16 @@ static void glue(gen_, name)(DisasContext *ctx)
 \
 tcg_temp_free_ptr(rd);  \
 }
 
+#define GEN_VXFORM_TRANS(name, opc2, opc3)  \
+static void glue(gen_, name)(DisasContext *ctx) \
+{   \
+if (unlikely(!ctx->altivec_enabled)) {  \
+gen_exception(ctx, POWERPC_EXCP_VPU);   \
+return; \
+}

[Qemu-devel] [PATCH v3 3/8] target/ppc: Optimize emulation of vpkpx instruction

2019-06-21 Thread Stefan Brankovic
Optimize Altivec instruction vpkpx (Vector Pack Pixel).
It rearranges 8 pixels coded in a 6-5-5 pattern (4 from each source register)
into a contiguous array of bits in the destination register.

In each iteration of the outer loop, the instruction is to be done with
the 6-5-5 pack for 2 pixels of each doubleword element of each
source register. The first thing to be done in the outer loop is
choosing which doubleword element of which register is to be used
in the current iteration; it is placed in the avr variable. The
next step is to perform the 6-5-5 pack of pixels on the avr variable in the
inner for loop (2 iterations, 1 for each pixel) and save the result in the tmp
variable. At the end of the outer for loop, the result is merged into a
variable called result and saved in the appropriate doubleword element of vD
if the whole doubleword is finished (every second iteration). The outer loop
has 4 iterations.
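
For reference, this is the scalar 6-5-5 packing each pixel undergoes (the same
formula as in the helper_vpkpx being removed below; the function name is just
illustrative):

/* Pack one 32-bit pixel into a 16-bit 6-5-5 value. */
static inline uint16_t pack_pixel_655(uint32_t e)
{
    return ((e >> 9) & 0xfc00) |   /* 6 bits */
           ((e >> 6) & 0x03e0) |   /* 5 bits */
           ((e >> 3) & 0x001f);    /* 5 bits */
}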

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 -
 target/ppc/translate/vmx-impl.inc.c | 93 -
 3 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 33dad6a..7a3d68d 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -254,7 +254,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index a23853e..f397380 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1536,27 +1536,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];
-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
-}
-}
-*r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index cc11d21..30dd7e1 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -579,6 +579,97 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into contigous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perfrom 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perfrom 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perfrom 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perfrom 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j++) {
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+   

[Qemu-devel] [PATCH v3 2/8] target/ppc: Optimize emulation of vsl and vsr instructions

2019-06-21 Thread Stefan Brankovic
Optimization of Altivec instructions vsl and vsr (Vector Shift Left/Right).
Perform a shift operation (left and right respectively) on the 128-bit value of
register vA by the value specified in bits 125-127 of register vB. The lowest 3
bits in each byte element of register vB must be identical or the result is
undefined.

For the vsl instruction, the first step is that bits 125-127 of register vB
are saved in variable sh. Then, the highest sh bits of the lower
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is performed on
the lower doubleword element of register vA, which is the next
step. After shifting the lower doubleword element, the shift operation
is performed on the higher doubleword element of vA, with replacement of
the lowest sh bits (that are now 0) with the bits saved in shifted.

For the vsr instruction, first, bits 125-127 of register vB have
to be saved in variable sh. Then, the lowest sh bits of the higher
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is
performed on the higher doubleword element of register vA, which is
the next step. After shifting the higher doubleword element, the shift
operation is performed on the lower doubleword element of vA, with replacement
of the highest sh bits (that are now 0) with the bits saved in shifted.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  2 -
 target/ppc/int_helper.c | 35 -
 target/ppc/translate/vmx-impl.inc.c | 99 -
 3 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index c82105e..33dad6a 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -213,8 +213,6 @@ DEF_HELPER_3(vrlb, void, avr, avr, avr)
 DEF_HELPER_3(vrlh, void, avr, avr, avr)
 DEF_HELPER_3(vrlw, void, avr, avr, avr)
 DEF_HELPER_3(vrld, void, avr, avr, avr)
-DEF_HELPER_3(vsl, void, avr, avr, avr)
-DEF_HELPER_3(vsr, void, avr, avr, avr)
 DEF_HELPER_4(vsldoi, void, avr, avr, avr, i32)
 DEF_HELPER_3(vextractub, void, avr, avr, i32)
 DEF_HELPER_3(vextractuh, void, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 9505f4c..a23853e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1738,41 +1738,6 @@ VEXTU_X_DO(vextuhrx, 16, 0)
 VEXTU_X_DO(vextuwrx, 32, 0)
 #undef VEXTU_X_DO
 
-/*
- * The specification says that the results are undefined if all of the
- * shift counts are not identical.  We check to make sure that they
- * are to conform to what real hardware appears to do.
- */
-#define VSHIFT(suffix, leftp)   \
-void helper_vs##suffix(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)\
-{   \
-int shift = b->VsrB(15) & 0x7;  \
-int doit = 1;   \
-int i;  \
-\
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {   \
-doit = doit && ((b->u8[i] & 0x7) == shift); \
-}   \
-if (doit) { \
-if (shift == 0) {   \
-*r = *a;\
-} else if (leftp) { \
-uint64_t carry = a->VsrD(1) >> (64 - shift);\
-\
-r->VsrD(0) = (a->VsrD(0) << shift) | carry; \
-r->VsrD(1) = a->VsrD(1) << shift;   \
-} else {\
-uint64_t carry = a->VsrD(0) << (64 - shift);\
-\
-r->VsrD(1) = (a->VsrD(1) >> shift) | carry; \
-r->VsrD(0) = a->VsrD(0) >> shift;   \
-}   \
-}   \
-}
-VSHIFT(l, 1)
-VSHIFT(r, 0)
-#undef VSHIFT
-
 void helper_vslv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int i;
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index eba6355..cc11d21 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -578,6 +578,101 @@ static void trans_lvsr(DisasContext *

[Qemu-devel] [PATCH v3 6/8] target/ppc: Optimize emulation of vclzw instruction

2019-06-21 Thread Stefan Brankovic
Optimize Altivec instruction vclzw (Vector Count Leading Zeros Word).
This instruction counts the number of leading zeros of each word element
in source register and places result in the appropriate word element of
destination register.

Counting is performed in four iterations of a for loop (one for each
word element of source register vB). Every iteration consists of loading the
appropriate word element from the source register, counting leading zeros
with tcg_gen_clzi_i32, and saving the result in the appropriate word element
of the destination register.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 3b92e3b..595241c 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -305,7 +305,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
-DEF_HELPER_2(vclzw, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 3134798..82cb12e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1802,15 +1802,12 @@ VUPK(lsw, s64, s32, UPKLO)
 
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-#define clzw(v) clz32((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
-VGENERIC_DO(clzw, u32)
 
 #undef clzb
 #undef clzh
-#undef clzw
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 1953c65..9ed2fae 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -840,6 +840,32 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzw VRT,VRB - Vector Count Leading Zeros Word
+ *
+ * Counting the number of leading zero bits of each word element in source
+ * register and placing result in appropriate word element of destination
+ * register.
+ */
+static void trans_vclzw(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i32 tmp = tcg_temp_new_i32();
+int i;
+
+/* Perform count for every word element using tcg_gen_clzi_i32. */
+for (i = 0; i < 4; i++) {
+tcg_gen_ld_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VB].u64[0]) + i * 4);
+tcg_gen_clzi_i32(tmp, tmp, 32);
+tcg_gen_st_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VT].u64[0]) + i * 4);
+}
+
+tcg_temp_free_i32(tmp);
+}
+
+/*
  * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
  *
  * Counting the number of leading zero bits of each doubleword element in 
source
@@ -1380,7 +1406,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
-GEN_VXFORM_NOA(vclzw, 1, 30)
+GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
-- 
2.7.4




[Qemu-devel] [PATCH 6/8] target/ppc: Optimize emulation of vclzw instruction

2019-06-19 Thread Stefan Brankovic
Optimize Altivec instruction vclzw (Vector Count Leading Zeros Word).
This instruction counts the number of leading zeros of each word element
in source register and places result in the appropriate word element of
destination register.

Counting is performed in four iterations of a for loop (one for each
word element of source register vB). Every iteration consists of loading the
appropriate word element from the source register, counting leading zeros
with tcg_gen_clzi_i32, and saving the result in the appropriate word element
of the destination register.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 3b92e3b..595241c 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -305,7 +305,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
-DEF_HELPER_2(vclzw, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 3134798..82cb12e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1802,15 +1802,12 @@ VUPK(lsw, s64, s32, UPKLO)
 
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-#define clzw(v) clz32((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
-VGENERIC_DO(clzw, u32)
 
 #undef clzb
 #undef clzh
-#undef clzw
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index c5bebfb..2588a9e 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -831,6 +831,32 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzw VRT,VRB - Vector Count Leading Zeros Word
+ *
+ * Counting the number of leading zero bits of each word element in source
+ * register and placing result in appropriate word element of destination
+ * register.
+ */
+static void trans_vclzw(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i32 tmp = tcg_temp_new_i32();
+int i;
+
+/* Perform count for every word element using tcg_gen_clzi_i32. */
+for (i = 0; i < 4; i++) {
+tcg_gen_ld_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VB].u64[0]) + i * 4);
+tcg_gen_clzi_i32(tmp, tmp, 32);
+tcg_gen_st_i32(tmp, cpu_env,
+offsetof(CPUPPCState, vsr[32 + VT].u64[0]) + i * 4);
+}
+
+tcg_temp_free_i32(tmp);
+}
+
+/*
  * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
  *
  * Counting the number of leading zero bits of each doubleword element in 
source
@@ -1371,7 +1397,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
-GEN_VXFORM_NOA(vclzw, 1, 30)
+GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
-- 
2.7.4




[Qemu-devel] [PATCH 0/8] target/ppc: Optimize emulation of some Altivec instructions

2019-06-19 Thread Stefan Brankovic
Optimize emulation of ten Altivec instructions: lvsl, lvsr, vsl, vsr, vpkpx,
vgbbd, vclzb, vclzh, vclzw and vclzd.

This series builds on and complements recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising TCG
translation implementation for selected instructions rather than using helpers.
The selected instructions are most of the time idiosyncratic to ppc platform,
so relatively complex TCG translation (without direct mapping to host
instruction that is not possible in these cases) seems to be the best option,
and that approach is presented in this series. The performance improvements are
significant in all cases.

V2:

Addressed Richard Henderson's suggestions.
Fixed problem during build on patch 2/8.
Rebased series to the latest qemu code.

Stefan Brankovic (8):
  target/ppc: Optimize emulation of lvsl and lvsr instructions
  target/ppc: Optimize emulation of vsl and vsr instructions
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vgbbd instruction
  target/ppc: Optimize emulation of vclzd instruction
  target/ppc: Optimize emulation of vclzw instruction
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Refactor emulation of vmrgew and vmrgow instructions

 target/ppc/helper.h |  10 -
 target/ppc/int_helper.c | 365 -
 target/ppc/translate/vmx-impl.inc.c | 633 
 3 files changed, 564 insertions(+), 444 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH 7/8] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-06-19 Thread Stefan Brankovic
Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in source register and places result in the appropriate halfword element of
destination register.

In each iteration of the outer for loop, the count operation is performed on
one doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and counting
for every halfword element is then performed using tcg_gen_clzi_i64.
Since that counts leading zeros over a 64-bit length, the ith halfword element
has to be moved to the highest 16 bits of tmp and or-ed with a mask (in order
to get all ones in the lowest 48 bits); tcg_gen_clzi_i64 is then performed and
its result moved into the appropriate halfword element of the result. This is
done in the inner for loop. After the operation is finished, the result is
saved in the appropriate doubleword element of destination register vD. The
same sequence of steps is applied again for the lower doubleword element of vB.

Optimize Altivec instruction vclzb (Vector Count Leading Zeros Byte).
This instruction counts the number of leading zeros of each byte element
in source register and places result in the appropriate byte element of
destination register.

In each iteration of the outer for loop, the counting operation is done on one
doubleword element of source register vB. In the first iteration, the
higher doubleword element of vB is placed in variable avr, and counting
for every byte element is then performed using tcg_gen_clzi_i64. Since that
counts leading zeros over a 64-bit length, the ith byte element has to be moved
to the highest 8 bits of variable tmp and or-ed with a mask (in order to get
all ones in the lowest 56 bits); tcg_gen_clzi_i64 is then performed and its
result moved into the appropriate byte element of the result. This is done in
the inner for loop. After the operation is finished, the result is saved in the
appropriate doubleword element of destination register vD. The same sequence of
steps is applied again for the lower doubleword element of vB.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |   9 ---
 target/ppc/translate/vmx-impl.inc.c | 122 +++-
 3 files changed, 120 insertions(+), 13 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 595241c..17b4b06 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -303,8 +303,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
 DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
 DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 
-DEF_HELPER_2(vclzb, void, avr, avr)
-DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 82cb12e..264b5e7 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1800,15 +1800,6 @@ VUPK(lsw, s64, s32, UPKLO)
 }   \
 }
 
-#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
-#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
-
-VGENERIC_DO(clzb, u8)
-VGENERIC_DO(clzh, u16)
-
-#undef clzb
-#undef clzh
-
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
 #define ctzw(v) ctz32((v))
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 2588a9e..81569a8 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -831,6 +831,124 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in avr. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros on 64 bit lenght, we have to move
+ * ith byte element to highest 8 bits of tmp, or it with mask(so we get
+ * all ones in lowest 56 bits), then perform tcg_gen_clzi_i64 and move
+ * it's result in appropriate byte element of result.
+ */
+tcg_gen_shli_i64(tmp, avr, 56);
+tcg_gen_

[Qemu-devel] [PATCH 2/8] target/ppc: Optimize emulation of vsl and vsr instructions

2019-06-19 Thread Stefan Brankovic
Optimization of Altivec instructions vsl and vsr (Vector Shift Left/Right).
Perform a shift operation (left and right respectively) on the 128-bit value of
register vA by the value specified in bits 125-127 of register vB. The lowest 3
bits in each byte element of register vB must be identical or the result is
undefined.

For the vsl instruction, the first step is that bits 125-127 of register vB
are saved in variable sh. Then, the highest sh bits of the lower
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is performed on
the lower doubleword element of register vA, which is the next
step. After shifting the lower doubleword element, the shift operation
is performed on the higher doubleword element of vA, with replacement of
the lowest sh bits (that are now 0) with the bits saved in shifted.

For the vsr instruction, first, bits 125-127 of register vB have
to be saved in variable sh. Then, the lowest sh bits of the higher
doubleword element of register vA are saved in variable shifted,
in order not to lose those bits when the shift operation is
performed on the higher doubleword element of register vA, which is
the next step. After shifting the higher doubleword element, the shift
operation is performed on the lower doubleword element of vA, with replacement
of the highest sh bits (that are now 0) with the bits saved in shifted.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  2 -
 target/ppc/int_helper.c | 35 -
 target/ppc/translate/vmx-impl.inc.c | 99 -
 3 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index c82105e..33dad6a 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -213,8 +213,6 @@ DEF_HELPER_3(vrlb, void, avr, avr, avr)
 DEF_HELPER_3(vrlh, void, avr, avr, avr)
 DEF_HELPER_3(vrlw, void, avr, avr, avr)
 DEF_HELPER_3(vrld, void, avr, avr, avr)
-DEF_HELPER_3(vsl, void, avr, avr, avr)
-DEF_HELPER_3(vsr, void, avr, avr, avr)
 DEF_HELPER_4(vsldoi, void, avr, avr, avr, i32)
 DEF_HELPER_3(vextractub, void, avr, avr, i32)
 DEF_HELPER_3(vextractuh, void, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 9505f4c..a23853e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1738,41 +1738,6 @@ VEXTU_X_DO(vextuhrx, 16, 0)
 VEXTU_X_DO(vextuwrx, 32, 0)
 #undef VEXTU_X_DO
 
-/*
- * The specification says that the results are undefined if all of the
- * shift counts are not identical.  We check to make sure that they
- * are to conform to what real hardware appears to do.
- */
-#define VSHIFT(suffix, leftp)   \
-void helper_vs##suffix(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)\
-{   \
-int shift = b->VsrB(15) & 0x7;  \
-int doit = 1;   \
-int i;  \
-\
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {   \
-doit = doit && ((b->u8[i] & 0x7) == shift); \
-}   \
-if (doit) { \
-if (shift == 0) {   \
-*r = *a;\
-} else if (leftp) { \
-uint64_t carry = a->VsrD(1) >> (64 - shift);\
-\
-r->VsrD(0) = (a->VsrD(0) << shift) | carry; \
-r->VsrD(1) = a->VsrD(1) << shift;   \
-} else {\
-uint64_t carry = a->VsrD(0) << (64 - shift);\
-\
-r->VsrD(1) = (a->VsrD(1) >> shift) | carry; \
-r->VsrD(0) = a->VsrD(0) >> shift;   \
-}   \
-}   \
-}
-VSHIFT(l, 1)
-VSHIFT(r, 0)
-#undef VSHIFT
-
 void helper_vslv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int i;
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 482e2ee..e7ae979 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -569,6 +569,101 @@ static void trans_lvsr(DisasContext *

[Qemu-devel] [PATCH 4/8] target/ppc: Optimize emulation of vgbbd instruction

2019-06-19 Thread Stefan Brankovic
Optimize Altivec instruction vgbbd (Vector Gather Bits by Bytes by Doubleword).
All ith bits (i in range 1 to 8) of each byte of a doubleword element in the
source register are concatenated and placed into the ith byte of the
appropriate doubleword element in the destination register.

The following solution is done for both doubleword elements of the source
register in parallel, in order to reduce the number of instructions needed
(that's why arrays are used):
First, both doubleword elements of source register vB are placed in the
appropriate elements of array avr. Bits are gathered in 2x8 iterations (2 for
loops). In the first iteration, bit 1 of byte 1, bit 2 of byte 2, ... bit 8 of
byte 8 are in their final spots, so avr[i], i={0,1}, can be and-ed with
tcg_mask. For every following iteration, both avr[i] and the tcg_mask variable
have to be shifted right by 7 and 8 places, respectively, in order to get
bit 1 of byte 2, bit 2 of byte 3, ... bit 7 of byte 8 into their final spots,
so the shifted avr values (saved in tmp) can be and-ed with the new value of
tcg_mask. After the first 8 iterations (the first loop), all the first bits are
in their final places, all the second bits except the second bit from the
eighth byte are in their places, and so on; only one eighth bit, the one from
the eighth byte, is in its place. In the second loop we do all operations
symmetrically, in order to get the other half of the bits into their final
spots. Results for the first and second doubleword elements are saved in
result[0] and result[1] respectively. In the end those results are saved in the
appropriate doubleword element of destination register vD.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   1 -
 target/ppc/int_helper.c | 276 
 target/ppc/translate/vmx-impl.inc.c |  77 +-
 3 files changed, 76 insertions(+), 278 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 7a3d68d..0aa1e05 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -319,7 +319,6 @@ DEF_HELPER_1(vclzlsbb, tl, avr)
 DEF_HELPER_1(vctzlsbb, tl, avr)
 DEF_HELPER_3(vbpermd, void, avr, avr, avr)
 DEF_HELPER_3(vbpermq, void, avr, avr, avr)
-DEF_HELPER_2(vgbbd, void, avr, avr)
 DEF_HELPER_3(vpmsumb, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumh, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumw, void, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f397380..1e32549 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1185,282 +1185,6 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #undef VBPERMQ_INDEX
 #undef VBPERMQ_DW
 
-static const uint64_t VGBBD_MASKS[256] = {
-0x0000000000000000ull, /* 00 */
-0x0000000000000080ull, /* 01 */
-0x0000000000008000ull, /* 02 */
-0x0000000000008080ull, /* 03 */
-0x0000000000800000ull, /* 04 */
-0x0000000000800080ull, /* 05 */
-0x0000000000808000ull, /* 06 */
-0x0000000000808080ull, /* 07 */
-0x0000000080000000ull, /* 08 */
-0x0000000080000080ull, /* 09 */
-0x0000000080008000ull, /* 0A */
-0x0000000080008080ull, /* 0B */
-0x0000000080800000ull, /* 0C */
-0x0000000080800080ull, /* 0D */
-0x0000000080808000ull, /* 0E */
-0x0000000080808080ull, /* 0F */
-0x0000008000000000ull, /* 10 */
-0x0000008000000080ull, /* 11 */
-0x0000008000008000ull, /* 12 */
-0x0000008000008080ull, /* 13 */
-0x0000008000800000ull, /* 14 */
-0x0000008000800080ull, /* 15 */
-0x0000008000808000ull, /* 16 */
-0x0000008000808080ull, /* 17 */
-0x0000008080000000ull, /* 18 */
-0x0000008080000080ull, /* 19 */
-0x0000008080008000ull, /* 1A */
-0x0000008080008080ull, /* 1B */
-0x0000008080800000ull, /* 1C */
-0x0000008080800080ull, /* 1D */
-0x0000008080808000ull, /* 1E */
-0x0000008080808080ull, /* 1F */
-0x0000800000000000ull, /* 20 */
-0x0000800000000080ull, /* 21 */
-0x0000800000008000ull, /* 22 */
-0x0000800000008080ull, /* 23 */
-0x0000800000800000ull, /* 24 */
-0x0000800000800080ull, /* 25 */
-0x0000800000808000ull, /* 26 */
-0x0000800000808080ull, /* 27 */
-0x0000800080000000ull, /* 28 */
-0x0000800080000080ull, /* 29 */
-0x0000800080008000ull, /* 2A */
-0x0000800080008080ull, /* 2B */
-0x0000800080800000ull, /* 2C */
-0x0000800080800080ull, /* 2D */
-0x0000800080808000ull, /* 2E */
-0x0000800080808080ull, /* 2F */
-0x0000808000000000ull, /* 30 */
-0x0000808000000080ull, /* 31 */
-0x0000808000008000ull, /* 32 */
-0x0000808000008080ull, /* 33 */
-0x0000808000800000ull, /* 34 */
-0x0000808000800080ull, /* 35 */
-0x0000808000808000ull, /* 36 */
-0x0000808000808080ull, /* 37 */
-0x0000808080000000ull, /* 38 */
-0x0000808080000080ull, /* 39 */
-0x0000808080008000ull, /* 3A */
-0x0000808080008080ull, /* 3B */
-0x0000808080800000ull, /* 3C */
-0x0000808080800080ull, /* 3D */
-0x0000808080808000ull, /* 3E */
-0x0000808080808080ull, /* 3F

[Qemu-devel] [PATCH 3/8] target/ppc: Optimize emulation of vpkpx instruction

2019-06-19 Thread Stefan Brankovic
Optimize altivec instruction vpkpx (Vector Pack Pixel).
Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
into a contiguous array of bits in the destination register.

In each iteration of the outer loop, the 6-5-5 pack is performed for 2 pixels
of one doubleword element of one source register. The first thing done in the
outer loop is choosing which doubleword element of which register is used in
the current iteration and placing it in the avr variable. The next step is to
perform the 6-5-5 pack of pixels on the avr variable in the inner for loop
(2 iterations, 1 for each pixel) and save the result in the tmp variable. At
the end of the outer loop, the result is merged into the variable called
result and saved in the appropriate doubleword element of vD if the whole
doubleword is finished (every second iteration). The outer loop has 4
iterations.
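
For reference, the per-pixel 6-5-5 packing that the shifts and masks below
reproduce on 64-bit halves is the same computation the removed helper did for
each 32-bit source pixel:

static uint16_t pack_pixel_655(uint32_t e)
{
    /* keep 6 bits, then 5 bits, then 5 bits of the source pixel */
    return ((e >> 9) & 0xfc00) |
           ((e >> 6) & 0x03e0) |
           ((e >> 3) & 0x001f);
}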

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c | 21 -
 target/ppc/translate/vmx-impl.inc.c | 93 -
 3 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 33dad6a..7a3d68d 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -254,7 +254,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
-DEF_HELPER_3(vpkpx, void, avr, avr, avr)
 DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index a23853e..f397380 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1536,27 +1536,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, 
ppc_avr_t *b)
 #else
 #define PKBIG 0
 #endif
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i, j;
-ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-const ppc_avr_t *x[2] = { a, b };
-#else
-const ppc_avr_t *x[2] = { b, a };
-#endif
-
-VECTOR_FOR_INORDER_I(i, u64) {
-VECTOR_FOR_INORDER_I(j, u32) {
-uint32_t e = x[i]->u32[j];
-
-result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
- ((e >> 6) & 0x3e0) |
- ((e >> 3) & 0x1f));
-}
-}
-*r = result;
-}
 
 #define VPK(suffix, from, to, cvt, dosat)   \
 void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index e7ae979..7d0c824 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -570,6 +570,97 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into a contiguous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j++) {
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+   

[Qemu-devel] [PATCH 8/8] target/ppc: Refactor emulation of vmrgew and vmrgow instructions

2019-06-19 Thread Stefan Brankovic
Since I found these two instructions already implemented with tcg, I
refactored them so they are consistent with the other similar implementations
introduced in this series.

Also, a new dual macro GEN_VXFORM_TRANS_DUAL is added. This macro is
used if one instruction is realized with direct translation, and the second
one with a helper.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 66 +
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 81569a8..f052dcb 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -350,6 +350,28 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) 
\
 }  \
 }
 
+/*
+ * We use this macro if one instruction is realized with direct
+ * translation, and second one with helper.
+ */
+#define GEN_VXFORM_TRANS_DUAL(name0, flg0, flg2_0, name1, flg1, flg2_1)\
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{  \
+if ((Rc(ctx->opcode) == 0) &&  \
+((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0))) { \
+if (unlikely(!ctx->altivec_enabled)) { \
+gen_exception(ctx, POWERPC_EXCP_VPU);  \
+return;\
+}  \
+trans_##name0(ctx);\
+} else if ((Rc(ctx->opcode) == 1) &&   \
+((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1))) { \
+gen_##name1(ctx);  \
+} else {   \
+gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL);\
+}  \
+}
+
 /* Adds support to provide invalid mask */
 #define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0,\
 name1, flg1, flg2_1, inval1)\
@@ -431,20 +453,13 @@ GEN_VXFORM(vmrglb, 6, 4);
 GEN_VXFORM(vmrglh, 6, 5);
 GEN_VXFORM(vmrglw, 6, 6);
 
-static void gen_vmrgew(DisasContext *ctx)
+static void trans_vmrgew(DisasContext *ctx)
 {
-TCGv_i64 tmp;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-tmp = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(avr, VB, true);
 tcg_gen_shri_i64(tmp, avr, 32);
@@ -462,21 +477,14 @@ static void gen_vmrgew(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
-static void gen_vmrgow(DisasContext *ctx)
+static void trans_vmrgow(DisasContext *ctx)
 {
-TCGv_i64 t0, t1;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-t0 = tcg_temp_new_i64();
-t1 = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(t0, VB, true);
 get_avr64(t1, VA, true);
@@ -1142,14 +1150,14 @@ GEN_VXFORM_ENV(vminfp, 5, 17);
 GEN_VXFORM_HETRO(vextublx, 6, 24)
 GEN_VXFORM_HETRO(vextuhlx, 6, 25)
 GEN_VXFORM_HETRO(vextuwlx, 6, 26)
-GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+GEN_VXFORM_TRANS_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwlx, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_HETRO(vextubrx, 6, 28)
 GEN_VXFORM_HETRO(vextuhrx, 6, 29)
 GEN_VXFORM_HETRO(vextuwrx, 6, 30)
 GEN_VXFORM_TRANS(lvsl, 6, 31)
 GEN_VXFORM_TRANS(lvsr, 6, 32)
-GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \
+GEN_VXFORM_TRANS_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwrx, PPC_NONE, PPC2_ISA300)
 
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3) \
-- 
2.7.4




[Qemu-devel] [PATCH 1/8] target/ppc: Optimize emulation of lvsl and lvsr instructions

2019-06-19 Thread Stefan Brankovic
Add a simple macro that calls the tcg implementation of the appropriate
instruction if altivec support is active.

Optimization of altivec instruction lvsl (Load Vector for Shift Left).
Place bytes sh:sh+15 of the value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F
in the destination register. Sh is calculated by adding the 2 source registers
and taking bits 60-63 of the result.

First, bits [28-31] of EA are placed in variable sh. After that,
the bytes are created in the following way:
bytes sh:(sh+7) of X (from the description) by multiplying sh by
0x0101010101010101 and then adding 0x0001020304050607 to the result. The value
obtained is placed in the higher doubleword element of vD;
bytes (sh+8):(sh+15) by adding 0x08090a0b0c0d0e0f to the result of the same
multiplication. The value obtained is placed in the lower doubleword element
of vD.

Optimization of altivec instruction lvsr (Load Vector for Shift Right).
Place bytes 16-sh:31-sh of the value 0x00 || 0x01 || 0x02 || ... || 0x1E ||
0x1F in the destination register. Sh is calculated by adding the 2 source
registers and taking bits 60-63 of the result.

First, bits [28-31] of EA are placed in variable sh. After that,
the bytes are created in the following way:
bytes (16-sh):(23-sh) of X (from the description) by multiplying sh by
0x0101010101010101 and then subtracting the result from 0x1011121314151617.
The value obtained is placed in the higher doubleword element of vD;
bytes (24-sh):(31-sh) by subtracting the result of the same multiplication
from 0x18191a1b1c1d1e1f. The value obtained is placed in the lower doubleword
element of vD.
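
A plain-C sketch of the multiply-and-add trick described above (lvsl case;
for lvsr the two additions become subtractions from 0x1011121314151617 and
0x18191a1b1c1d1e1f):

static void lvsl_dwords(uint64_t sh, uint64_t *hi, uint64_t *lo)
{
    uint64_t rep = sh * 0x0101010101010101ULL;  /* sh replicated in all bytes */
    *hi = rep + 0x0001020304050607ULL;          /* bytes sh .. sh+7    */
    *lo = rep + 0x08090a0b0c0d0e0fULL;          /* bytes sh+8 .. sh+15 */
}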

Signed-off-by: Stefan Brankovic 
---
 target/ppc/helper.h |   2 -
 target/ppc/int_helper.c |  18 --
 target/ppc/translate/vmx-impl.inc.c | 120 ++--
 3 files changed, 88 insertions(+), 52 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 02b67a3..c82105e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -189,8 +189,6 @@ DEF_HELPER_2(vprtybw, void, avr, avr)
 DEF_HELPER_2(vprtybd, void, avr, avr)
 DEF_HELPER_2(vprtybq, void, avr, avr)
 DEF_HELPER_3(vsubcuw, void, avr, avr, avr)
-DEF_HELPER_2(lvsl, void, avr, tl)
-DEF_HELPER_2(lvsr, void, avr, tl)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8ce89f2..9505f4c 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -457,24 +457,6 @@ SATCVT(sd, uw, int64_t, uint32_t, 0, UINT32_MAX)
 #undef SATCVT
 #undef SATCVTU
 
-void helper_lvsl(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
-void helper_lvsr(ppc_avr_t *r, target_ulong sh)
-{
-int i, j = 0x10 - (sh & 0xf);
-
-for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
-r->VsrB(i) = j++;
-}
-}
-
 void helper_mtvscr(CPUPPCState *env, uint32_t vscr)
 {
 env->vscr = vscr & ~(1u << VSCR_SAT);
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 663275b..482e2ee 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -142,38 +142,6 @@ GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
 GEN_VR_STVE(wx, 0x07, 0x06, 4);
 
-static void gen_lvsl(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsl(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
-static void gen_lvsr(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsr(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
 static void gen_mfvscr(DisasContext *ctx)
 {
 TCGv_i32 t;
@@ -316,6 +284,16 @@ static void glue(gen_, name)(DisasContext *ctx)
 \
 tcg_temp_free_ptr(rd);  \
 }
 
+#define GEN_VXFORM_TRANS(name, opc2, opc3)  \
+static void glue(gen_, name)(DisasContext *ctx) \
+{   \
+if (unlikely(!ctx->altivec_enabled)) {  \
+gen_exception(ctx, POWERPC_EXCP_VPU);   \
+return; \
+}

[Qemu-devel] [PATCH 5/8] target/ppc: Optimize emulation of vclzd instruction

2019-06-19 Thread Stefan Brankovic
Optimize Altivec instruction vclzd (Vector Count Leading Zeros Doubleword).
This instruction counts the number of leading zeros of each doubleword element
in the source register and places the result in the appropriate doubleword
element of the destination register.

We use tcg's count leading zeros operation two times (once for each
doubleword element of source register vB) and place the result in the
appropriate doubleword element of destination register vD.

Signed-off-by: Stefan Brankovic 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  1 -
 target/ppc/int_helper.c |  3 ---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 0aa1e05..3b92e3b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -306,7 +306,6 @@ DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
 DEF_HELPER_2(vclzb, void, avr, avr)
 DEF_HELPER_2(vclzh, void, avr, avr)
 DEF_HELPER_2(vclzw, void, avr, avr)
-DEF_HELPER_2(vclzd, void, avr, avr)
 DEF_HELPER_2(vctzb, void, avr, avr)
 DEF_HELPER_2(vctzh, void, avr, avr)
 DEF_HELPER_2(vctzw, void, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 1e32549..3134798 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1803,17 +1803,14 @@ VUPK(lsw, s64, s32, UPKLO)
 #define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
 #define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
 #define clzw(v) clz32((v))
-#define clzd(v) clz64((v))
 
 VGENERIC_DO(clzb, u8)
 VGENERIC_DO(clzh, u16)
 VGENERIC_DO(clzw, u32)
-VGENERIC_DO(clzd, u64)
 
 #undef clzb
 #undef clzh
 #undef clzw
-#undef clzd
 
 #define ctzb(v) ((v) ? ctz32(v) : 8)
 #define ctzh(v) ((v) ? ctz32(v) : 16)
diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index a3467a5..c5bebfb 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -830,6 +830,32 @@ static void trans_vgbbd(DisasContext *ctx)
 tcg_temp_free_i64(avr[1]);
 }
 
+/*
+ * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
+ *
+ * Counting the number of leading zero bits of each doubleword element in 
source
+ * register and placing result in appropriate doubleword element of destination
+ * register.
+ */
+static void trans_vclzd(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+
+/* high doubleword */
+get_avr64(avr, VB, true);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, true);
+
+/* low doubleword */
+get_avr64(avr, VB, false);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, false);
+
+tcg_temp_free_i64(avr);
+}
+
 GEN_VXFORM(vmuloub, 4, 0);
 GEN_VXFORM(vmulouh, 4, 1);
 GEN_VXFORM(vmulouw, 4, 2);
@@ -1346,7 +1372,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_NOA(vclzw, 1, 30)
-GEN_VXFORM_NOA(vclzd, 1, 31)
+GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
 GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16)
-- 
2.7.4




Re: [Qemu-devel] [PATCH 4/8] target/ppc: Optimize emulation of vgbbd instruction

2019-06-17 Thread Stefan Brankovic



On 6.6.19. 20:19, Richard Henderson wrote:

On 6/6/19 5:15 AM, Stefan Brankovic wrote:

Optimize altivec instruction vgbbd (Vector Gather Bits by Bytes by Doubleword)
All ith bits (i in range 1 to 8) of each byte of doubleword element in
source register are concatenated and placed into ith byte of appropriate
doubleword element in destination register.

Following solution is done for every doubleword element of source register
(placed in shifted variable):
We gather bits in 2x8 iterations.
In first iteration bit 1 of byte 1, bit 2 of byte 2,... bit 8 of byte 8 are
in their final spots so we just and avr with mask. For every next iteration,
we have to shift right both shifted(7 places) and mask(8 places), so we get
bit 1 of byte 2, bit 2 of byte 3.. bit 7 of byte 8 in right places so we and
shifted with the new value of mask... After the first 8 iterations (first for
loop) we have all the first bits in their final places, all second bits but the
second bit from the eighth byte in their places, ... only one eighth bit, from
the eighth byte, is in its place, so we and result1 with mask1 to save those
bits that are in the right place and save them in result1. In the second loop
we do all operations symmetrically, so we get the other half of the bits in
their final spots, and save the result in result2. The or of result1 and
result2 is placed in the appropriate doubleword element of vD. We repeat this
2 times.

Signed-off-by: Stefan Brankovic 
---
  target/ppc/translate/vmx-impl.inc.c | 99 -
  1 file changed, 98 insertions(+), 1 deletion(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 87f69dc..010f337 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -780,6 +780,103 @@ static void trans_vsr(DisasContext *ctx)
  tcg_temp_free_i64(tmp);
  }
  
+/*

+ * vgbbd VRT,VRB - Vector Gather Bits by Bytes by Doubleword
+ *
+ * All ith bits (i in range 1 to 8) of each byte of doubleword element in 
source
+ * register are concatenated and placed into ith byte of appropriate doubleword
+ * element in destination register.
+ *
+ * Following solution is done for every doubleword element of source register
+ * (placed in shifted variable):
+ * We gather bits in 2x8 iterations.
+ * In first iteration bit 1 of byte 1, bit 2 of byte 2,... bit 8 of byte 8 are
+ * in their final spots so we just and avr with mask. For every next iteration,
+ * we have to shift right both shifted(7 places) and mask(8 places), so we get
+ * bit 1 of byte 2, bit 2 of byte 3.. bit 7 of byte 8 in right places so we and
+ * shifted with the new value of mask... After the first 8 iterations (first
+ * for loop) we have all the first bits in their final places, all second bits
+ * but the second bit from the eighth byte in their places, ... only one
+ * eighth bit, from the eighth byte, is in its place, so we and result1 with
+ * mask1 to save those bits that are in the right place and save them in
+ * result1. In the second loop we do all operations symmetrically, so we get
+ * the other half of the bits in their final spots, and save the result in
+ * result2. The or of result1 and result2 is placed in the appropriate
+ * doubleword element of vD. We repeat this 2 times.
+ */
+static void trans_vgbbd(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+uint64_t mask = 0x8040201008040201ULL;
+uint64_t mask1 = 0x80c0e0f0f8fcfeffULL;
+uint64_t mask2 = 0x7f3f1f0f07030100ULL;
+int i;
+
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result1, 0x0ULL);
+tcg_gen_mov_i64(shifted, avr);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(tmp, shifted, mask);
+tcg_gen_or_i64(result1, result1, tmp);
+
+tcg_gen_shri_i64(shifted, shifted, 7);
+mask = mask >> 8;
+}
+tcg_gen_andi_i64(result1, result1, mask1);

This masking appears to be redundant with the masking within the loop.


+
+mask = 0x8040201008040201ULL;
+tcg_gen_movi_i64(result2, 0x0ULL);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(tmp, avr, mask);
+tcg_gen_or_i64(result2, result2, tmp);
+
+tcg_gen_shli_i64(avr, avr, 7);
+mask = mask << 8;
+}
+tcg_gen_andi_i64(result2, result2, mask2);

Similarly.

Also, the first iteration of the second loop is redundant with the first
iteration of the first loop.

I will also note that these are large constants, not easily constructable.
Therefore it would be best to avoid needing to construct them twice.  You can
do this by processing the two doublewords simultaneously.  e.g.

TCGv_i64 avr[2], out[2], tmp, tcg_mask;

identity_mask = 0x8040201008040201ull;
tcg_gen_movi_i64(tcg_mask, identity_mask);
for (j = 0; j < 2; j++) {
get_avr(avr[j], VB, j);

Re: [Qemu-devel] [PATCH 6/8] target/ppc: Optimize emulation of vclzw instruction

2019-06-17 Thread Stefan Brankovic



On 6.6.19. 20:34, Richard Henderson wrote:

On 6/6/19 5:15 AM, Stefan Brankovic wrote:

+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword element of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword element of vB in avr. */
+get_avr64(avr, VB, false);
+}

Better as simply get_avr64(avr, VB, i);

Definitely a shorter way to do this.



+/*
+ * Perform count for every word element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros over a 64-bit length, we have to move the
+ * ith word element to the highest 32 bits of tmp, or it with mask (so we get
+ * all ones in the lowest 32 bits), then perform tcg_gen_clzi_i64 and move
+ * its result to the appropriate word element of result.
+ */
+tcg_gen_shli_i64(tmp, avr, 32);
+tcg_gen_or_i64(tmp, tmp, mask);
+tcg_gen_clzi_i64(result, tmp, 64);
+
+tcg_gen_or_i64(tmp, avr, mask);
+tcg_gen_clzi_i64(tmp, tmp, 64);

s/64/32.


+tcg_gen_deposit_i64(result, result, tmp, 32, 32);

That said, it's probably better to treat this as 4 words, not 2 doublewords.

for (i = 0; i < 4; i++) {
tcg_gen_ld_i32(tmp, cpu_env, avr_full_offset(VB) + i * 4);
tcg_gen_clzi_i32(tmp, tmp, 32);
tcg_gen_st_i32(tmp, cpu_env, avr_full_offset(VT) + i * 4);
}


I will use this approach in v2.

Kind Regards,

Stefan


r~




Re: [Qemu-devel] [PATCH 8/8] target/ppc: Refactor emulation of vmrgew and vmrgow instructions

2019-06-17 Thread Stefan Brankovic



On 6.6.19. 22:43, Richard Henderson wrote:

On 6/6/19 5:15 AM, Stefan Brankovic wrote:

+/*
+ * We use this macro if one instruction is realized with direct
+ * translation, and second one with helper.
+ */
+#define GEN_VXFORM_TRANS_DUAL(name0, flg0, flg2_0, name1, flg1, flg2_1)\
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{  \
+if ((Rc(ctx->opcode) == 0) &&  \
+((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0))) { \
+trans_##name0(ctx);\
+} else if ((Rc(ctx->opcode) == 1) &&   \
+((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1))) { \
+gen_##name1(ctx);  \
+} else {   \
+gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL);\
+}  \
+}
+
  /* Adds support to provide invalid mask */
  #define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0,\
  name1, flg1, flg2_1, inval1)\
@@ -431,20 +449,13 @@ GEN_VXFORM(vmrglb, 6, 4);
  GEN_VXFORM(vmrglh, 6, 5);
  GEN_VXFORM(vmrglw, 6, 6);
  
-static void gen_vmrgew(DisasContext *ctx)

+static void trans_vmrgew(DisasContext *ctx)
  {
-TCGv_i64 tmp;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}

This appears to drop the check for altivec_enabled.


Thank you for spotting this, I will fix this bug in v2.

Kind Regards,

Stefan


r~




Re: [Qemu-devel] [PATCH 7/8] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-06-17 Thread Stefan Brankovic



On 6.6.19. 22:38, Richard Henderson wrote:

On 6/6/19 5:15 AM, Stefan Brankovic wrote:

Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in source register and places result in the appropriate halfword element of
destination register.

For halfword, you're generating 32 operations.  A loop over the halfwords,
similar to the word loop I suggested for the last patch, does not reduce this
total, since one has to adjust the clz32 result.

For byte, you're generating 64 operations.

These expansions are so big that without host vector support it's probably best
to leave them out-of-line.

I can imagine a byte clz expansion like

t0 = input >> 4;
t1 = input << 4;
cmp = input == 0 ? -1 : 0;
input = cmp ? t1 : input;
output = cmp & 4;

t0 = input >> 6;
t1 = input << 2;
cmp = input == 0 ? -1 : 0;
input = cmp ? t1 : input;
t0 = cmp & 2;
output += t0;

t1 = input << 1;
cmp = input >= 0 ? -1 : 0;
output -= cmp;

cmp = input == 0 ? -1 : 0;
output -= cmp;

which would expand to 20 x86_64 vector instructions.  A halfword expansion
would require one more round and thus 25 instructions.
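
(For reference, a scalar, branchy rendering of the same shift-and-test idea,
per byte; illustrative only, not the vector expansion itself:

static int clz8_scalar(uint8_t x)
{
    int n = 0;
    if ((x & 0xf0) == 0) { n += 4; x <<= 4; }   /* top nibble empty    */
    if ((x & 0xc0) == 0) { n += 2; x <<= 2; }   /* top two bits empty  */
    if ((x & 0x80) == 0) { n += 1; x <<= 1; }   /* top bit empty       */
    if (x == 0)          { n += 1; }            /* input was 0: report 8 */
    return n;
}
)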


I based this patch on performance results, and my measurements say that the
tcg implementation is still significantly superior to the helper
implementation, despite the somewhat large number of instructions.

I can attach both the performance measurement results and the disassembly of
both the helper and tcg implementations, if you want me to do this.




I'll also note that ARM, Power8, and S390 all support this as a native vector
operation; only x86_64 would require the above expansion.  It probably makes
sense to add this operation to tcg.


I agree with this, but currently we don't have this implemented in tcg, 
so I worked with what I have.


Kind Regards,

Stefan


r~




Re: [Qemu-devel] [PATCH 2/8] target/ppc: Optimize emulation of vsl and vsr instructions

2019-06-17 Thread Stefan Brankovic



On 6.6.19. 19:03, Richard Henderson wrote:

On 6/6/19 5:15 AM, Stefan Brankovic wrote:

+tcg_gen_subi_i64(tmp, sh, 64);
+tcg_gen_neg_i64(tmp, tmp);

Better as

 tcg_gen_subfi_i64(tmp, 64, sh);

I was aware there must be a way of doing it in a single tcg invocation,
but couldn't find the right tcg instruction. I will apply this in v2.


Kind Regards,

Stefan


r~




Re: [Qemu-devel] [PATCH 0/8] Optimize emulation of ten Altivec instructions: lvsl,

2019-06-17 Thread Stefan Brankovic



On 6.6.19. 19:13, Richard Henderson wrote:

On 6/6/19 5:15 AM, Stefan Brankovic wrote:

Stefan Brankovic (8):
   target/ppc: Optimize emulation of lvsl and lvsr instructions
   target/ppc: Optimize emulation of vsl and vsr instructions
   target/ppc: Optimize emulation of vpkpx instruction
   target/ppc: Optimize emulation of vgbbd instruction
   target/ppc: Optimize emulation of vclzd instruction
   target/ppc: Optimize emulation of vclzw instruction
   target/ppc: Optimize emulation of vclzh and vclzb instructions
   target/ppc: Refactor emulation of vmrgew and vmrgow instructions

  target/ppc/translate/vmx-impl.inc.c | 705 
  1 file changed, 636 insertions(+), 69 deletions(-)

You should be removing the out-of-line helpers that are no longer used.


I agree. I will remove them in v2.

Kind Regards,

Stefan


r~




Re: [Qemu-devel] [PATCH 1/8] target/ppc: Optimize emulation of lvsl and lvsr instructions

2019-06-17 Thread Stefan Brankovic



On 6.6.19. 18:46, Richard Henderson wrote:

On 6/6/19 5:15 AM, Stefan Brankovic wrote:

+tcg_gen_addi_i64(result, sh, 7);
+for (i = 7; i >= 1; i--) {
+tcg_gen_shli_i64(tmp, sh, i * 8);
+tcg_gen_or_i64(result, result, tmp);
+tcg_gen_addi_i64(sh, sh, 1);
+}

Better to replicate sh into the 8 positions and then use one add.

 tcg_gen_muli_i64(sh, sh, 0x0101010101010101ull);
 tcg_gen_addi_i64(hi_result, sh, 0x0001020304050607ull);
 tcg_gen_addi_i64(lo_result, sh, 0x08090a0b0c0d0e0full);

and

 tcg_gen_subfi_i64(hi_result, 0x1011121314151617ull, sh);
 tcg_gen_subfi_i64(lo_result, 0x18191a1b1c1d1e1full, sh);

for lvsr.

I think you are right, this is definitely a better way of implementing it.
I will adopt your approach in v2.


Kind Regards,

Stefan


r~




Re: [Qemu-devel] [PATCH 0/8] Optimize emulation of ten Altivec instructions: lvsl,

2019-06-12 Thread Stefan Brankovic


>
>
>  Original Message 
> Subject: Re: [Qemu-devel] [PATCH 0/8] Optimize emulation of ten Altivec 
> instructions: lvsl,
> Date: Thursday, June 6, 2019 19:13 CEST
> From: Richard Henderson 
> To: Stefan Brankovic , qemu-devel@nongnu.org
> CC: da...@gibson.dropbear.id.au
> References: <1559816130-17113-1-git-send-email-stefan.branko...@rt-rk.com>
>
>
>
> > On 6/6/19 5:15 AM, Stefan Brankovic wrote:
> > > Stefan Brankovic (8):
> > > target/ppc: Optimize emulation of lvsl and lvsr instructions
> > > target/ppc: Optimize emulation of vsl and vsr instructions
> > > target/ppc: Optimize emulation of vpkpx instruction
> > > target/ppc: Optimize emulation of vgbbd instruction
> > > target/ppc: Optimize emulation of vclzd instruction
> > > target/ppc: Optimize emulation of vclzw instruction
> > > target/ppc: Optimize emulation of vclzh and vclzb instructions
> > > target/ppc: Refactor emulation of vmrgew and vmrgow instructions
> > >
> > > target/ppc/translate/vmx-impl.inc.c | 705 
> > > 
> > > 1 file changed, 636 insertions(+), 69 deletions(-)
> >
> > You should be removing the out-of-line helpers that are no longer used.
> >

Thank you for taking the time to review my code. I think that your suggestions
are all constructive and very useful. However, I was on a short leave this
week and could not respond promptly. I will respond with more details in the
next few days.

Kind Regards,
Stefan

> >
> > r~
>
>
>


[Qemu-devel] [PATCH 3/8] target/ppc: Optimize emulation of vpkpx instruction

2019-06-06 Thread Stefan Brankovic
Optimize altivec instruction vpkpx (Vector Pack Pixel).
Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
into contigous array of bits in the destination register.

In each iteration of outer loop we do the 6-5-5 pack for 2 pixels
of each doubleword element of each source register. The first thing
we do in outer loop is choosing which doubleword element of which
register are we using in current iteration and we place it in avr
variable. Then we perform 6-5-5 pack of pixels on avr variable
in inner for loop(2 iterations, 1 for each pixel) and save result
in tmp variable. In the end of outer for loop, we merge result in
variable called result and save it in appropriate doubleword element
of vD if whole doubleword is finished(every second iteration). Outer
loop has 4 iterations.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 93 -
 1 file changed, 92 insertions(+), 1 deletion(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 6bd072a..87f69dc 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -593,6 +593,97 @@ static void trans_lvsr(DisasContext *ctx)
 }
 
 /*
+ * vpkpx VRT,VRA,VRB - Vector Pack Pixel
+ *
+ * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register)
+ * into a contiguous array of bits in the destination register.
+ */
+static void trans_vpkpx(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+int64_t mask1 = 0x1fULL;
+int64_t mask2 = 0x1fULL << 5;
+int64_t mask3 = 0x3fULL << 10;
+int i, j;
+/*
+ * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword
+ * element of each source register.
+ */
+for (i = 0; i < 4; i++) {
+switch (i) {
+case 0:
+/*
+ * Get high doubleword of vA to perform 6-5-5 pack of pixels
+ * 1 and 2.
+ */
+get_avr64(avr, VA, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 1:
+/*
+ * Get low doubleword of vA to perform 6-5-5 pack of pixels
+ * 3 and 4.
+ */
+get_avr64(avr, VA, false);
+break;
+case 2:
+/*
+ * Get high doubleword of vB to perform 6-5-5 pack of pixels
+ * 5 and 6.
+ */
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result, 0x0ULL);
+break;
+case 3:
+/*
+ * Get low doubleword of vB to perform 6-5-5 pack of pixels
+ * 7 and 8.
+ */
+get_avr64(avr, VB, false);
+break;
+}
+/* Perform the packing for 2 pixels(each iteration for 1). */
+tcg_gen_movi_i64(tmp, 0x0ULL);
+for (j = 0; j < 2; j++) {
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
+tcg_gen_or_i64(tmp, tmp, shifted);
+
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
+tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
+tcg_gen_or_i64(tmp, tmp, shifted);
+
+tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
+tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
+tcg_gen_or_i64(tmp, tmp, shifted);
+}
+if ((i == 0) || (i == 2)) {
+tcg_gen_shli_i64(tmp, tmp, 32);
+}
+tcg_gen_or_i64(result, result, tmp);
+if (i == 1) {
+/* Place packed pixels 1:4 to high doubleword of vD. */
+set_avr64(VT, result, true);
+}
+if (i == 3) {
+/* Place packed pixels 5:8 to low doubleword of vD. */
+set_avr64(VT, result, false);
+}
+}
+
+tcg_temp_free_i64(tmp);
+tcg_temp_free_i64(shifted);
+tcg_temp_free_i64(avr);
+tcg_temp_free_i64(result);
+}
+
+/*
  * vsl VRT,VRA,VRB - Vector Shift Left
  *
  * Shifting left 128 bit value of vA by value specified in bits 125-127 of vB.
@@ -813,7 +904,7 @@ GEN_VXFORM_ENV(vpksdus, 7, 21);
 GEN_VXFORM_ENV(vpkshss, 7, 6);
 GEN_VXFORM_ENV(vpkswss, 7, 7);
 GEN_VXFORM_ENV(vpksdss, 7, 23);
-GEN_VXFORM(vpkpx, 7, 12);
+GEN_VXFORM_TRANS(vpkpx, 7, 12);
 GEN_VXFORM_ENV(vsum4ubs, 4, 24);
 GEN_VXFORM_ENV(vsum4sbs, 4, 28);
 GEN_VXFORM_ENV(vsum4shs, 4, 25);
-- 
2.7.4




[Qemu-devel] [PATCH 8/8] target/ppc: Refactor emulation of vmrgew and vmrgow instructions

2019-06-06 Thread Stefan Brankovic
Since I found these two instructions already implemented with tcg, I
refactored them so they are consistent with the other similar implementations
introduced in this series.

I also had to add a new dual macro, GEN_VXFORM_TRANS_DUAL. We use this macro
if one instruction is realized with direct translation, and the second one
with a helper.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 62 -
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 8535a31..46c6f34 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -350,6 +350,24 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) 
\
 }  \
 }
 
+/*
+ * We use this macro if one instruction is realized with direct
+ * translation, and second one with helper.
+ */
+#define GEN_VXFORM_TRANS_DUAL(name0, flg0, flg2_0, name1, flg1, flg2_1)\
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{  \
+if ((Rc(ctx->opcode) == 0) &&  \
+((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0))) { \
+trans_##name0(ctx);\
+} else if ((Rc(ctx->opcode) == 1) &&   \
+((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1))) { \
+gen_##name1(ctx);  \
+} else {   \
+gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL);\
+}  \
+}
+
 /* Adds support to provide invalid mask */
 #define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0,\
 name1, flg1, flg2_1, inval1)\
@@ -431,20 +449,13 @@ GEN_VXFORM(vmrglb, 6, 4);
 GEN_VXFORM(vmrglh, 6, 5);
 GEN_VXFORM(vmrglw, 6, 6);
 
-static void gen_vmrgew(DisasContext *ctx)
+static void trans_vmrgew(DisasContext *ctx)
 {
-TCGv_i64 tmp;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-tmp = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(avr, VB, true);
 tcg_gen_shri_i64(tmp, avr, 32);
@@ -462,21 +473,14 @@ static void gen_vmrgew(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
-static void gen_vmrgow(DisasContext *ctx)
+static void trans_vmrgow(DisasContext *ctx)
 {
-TCGv_i64 t0, t1;
-TCGv_i64 avr;
-int VT, VA, VB;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-VT = rD(ctx->opcode);
-VA = rA(ctx->opcode);
-VB = rB(ctx->opcode);
-t0 = tcg_temp_new_i64();
-t1 = tcg_temp_new_i64();
-avr = tcg_temp_new_i64();
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
 
 get_avr64(t0, VB, true);
 get_avr64(t1, VA, true);
@@ -1213,14 +1217,14 @@ GEN_VXFORM_ENV(vminfp, 5, 17);
 GEN_VXFORM_HETRO(vextublx, 6, 24)
 GEN_VXFORM_HETRO(vextuhlx, 6, 25)
 GEN_VXFORM_HETRO(vextuwlx, 6, 26)
-GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+GEN_VXFORM_TRANS_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwlx, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_HETRO(vextubrx, 6, 28)
 GEN_VXFORM_HETRO(vextuhrx, 6, 29)
 GEN_VXFORM_HETRO(vextuwrx, 6, 30)
 GEN_VXFORM_TRANS(lvsl, 6, 31)
 GEN_VXFORM_TRANS(lvsr, 6, 32)
-GEN_VXFORM_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207, \
+GEN_VXFORM_TRANS_DUAL(vmrgew, PPC_NONE, PPC2_ALTIVEC_207,
 vextuwrx, PPC_NONE, PPC2_ISA300)
 
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3) \
-- 
2.7.4




[Qemu-devel] [PATCH 4/8] target/ppc: Optimize emulation of vgbbd instruction

2019-06-06 Thread Stefan Brankovic
Optimize altivec instruction vgbbd (Vector Gather Bits by Bytes by Doubleword)
All ith bits (i in range 1 to 8) of each byte of doubleword element in
source register are concatenated and placed into ith byte of appropriate
doubleword element in destination register.

Following solution is done for every doubleword element of source register
(placed in shifted variable):
We gather bits in 2x8 iterations.
In first iteration bit 1 of byte 1, bit 2 of byte 2,... bit 8 of byte 8 are
in their final spots so we just and avr with mask. For every next iteration,
we have to shift right both shifted(7 places) and mask(8 places), so we get
bit 1 of byte 2, bit 2 of byte 3.. bit 7 of byte 8 in right places so we and
shifted with the new value of mask... After the first 8 iterations (first for
loop) we have all the first bits in their final places, all second bits but the
second bit from the eighth byte in their places, ... only one eighth bit, from
the eighth byte, is in its place, so we and result1 with mask1 to save those
bits that are in the right place and save them in result1. In the second loop
we do all operations symmetrically, so we get the other half of the bits in
their final spots, and save the result in result2. The or of result1 and
result2 is placed in the appropriate doubleword element of vD. We repeat this
2 times.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 99 -
 1 file changed, 98 insertions(+), 1 deletion(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 87f69dc..010f337 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -780,6 +780,103 @@ static void trans_vsr(DisasContext *ctx)
 tcg_temp_free_i64(tmp);
 }
 
+/*
+ * vgbbd VRT,VRB - Vector Gather Bits by Bytes by Doubleword
+ *
+ * All ith bits (i in range 1 to 8) of each byte of doubleword element in 
source
+ * register are concatenated and placed into ith byte of appropriate doubleword
+ * element in destination register.
+ *
+ * Following solution is done for every doubleword element of source register
+ * (placed in shifted variable):
+ * We gather bits in 2x8 iterations.
+ * In first iteration bit 1 of byte 1, bit 2 of byte 2,... bit 8 of byte 8 are
+ * in their final spots so we just and avr with mask. For every next iteration,
+ * we have to shift right both shifted(7 places) and mask(8 places), so we get
+ * bit 1 of byte 2, bit 2 of byte 3.. bit 7 of byte 8 in right places so we and
+ * shifted with the new value of mask... After the first 8 iterations (first
+ * for loop) we have all the first bits in their final places, all second bits
+ * but the second bit from the eighth byte in their places, ... only one
+ * eighth bit, from the eighth byte, is in its place, so we and result1 with
+ * mask1 to save those bits that are in the right place and save them in
+ * result1. In the second loop we do all operations symmetrically, so we get
+ * the other half of the bits in their final spots, and save the result in
+ * result2. The or of result1 and result2 is placed in the appropriate
+ * doubleword element of vD. We repeat this 2 times.
+ */
+static void trans_vgbbd(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 result1 = tcg_temp_new_i64();
+TCGv_i64 result2 = tcg_temp_new_i64();
+uint64_t mask = 0x8040201008040201ULL;
+uint64_t mask1 = 0x80c0e0f0f8fcfeffULL;
+uint64_t mask2 = 0x7f3f1f0f07030100ULL;
+int i;
+
+get_avr64(avr, VB, true);
+tcg_gen_movi_i64(result1, 0x0ULL);
+tcg_gen_mov_i64(shifted, avr);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(tmp, shifted, mask);
+tcg_gen_or_i64(result1, result1, tmp);
+
+tcg_gen_shri_i64(shifted, shifted, 7);
+mask = mask >> 8;
+}
+tcg_gen_andi_i64(result1, result1, mask1);
+
+mask = 0x8040201008040201ULL;
+tcg_gen_movi_i64(result2, 0x0ULL);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(tmp, avr, mask);
+tcg_gen_or_i64(result2, result2, tmp);
+
+tcg_gen_shli_i64(avr, avr, 7);
+mask = mask << 8;
+}
+tcg_gen_andi_i64(result2, result2, mask2);
+
+tcg_gen_or_i64(result2, result2, result1);
+set_avr64(VT, result2, true);
+
+mask = 0x8040201008040201ULL;
+get_avr64(avr, VB, false);
+tcg_gen_movi_i64(result1, 0x0ULL);
+tcg_gen_mov_i64(shifted, avr);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(tmp, shifted, mask);
+tcg_gen_or_i64(result1, result1, tmp);
+
+tcg_gen_shri_i64(shifted, shifted, 7);
+mask = mask >> 8;
+}
+tcg_gen_andi_i64(result1, result1, mask1);
+
+mask = 0x8040201008040201ULL;
+tcg_gen_movi_i64(result2, 0x0ULL);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(tmp, avr, mask);
+tcg_gen_or_i64(result2, result2, tmp);
+
+tcg_gen_shli_i6

[Qemu-devel] [PATCH 6/8] target/ppc: Optimize emulation of vclzw instruction

2019-06-06 Thread Stefan Brankovic
Optimize Altivec instruction vclzw (Vector Count Leading Zeros Word).
This instruction counts the number of leading zeros of each word element
in source register and places result in the appropriate word element of
destination register.

We perform counting in two iterations of for loop(one for each
doubleword element of source register vB). First thing we do in
loop is placing appropriate doubleword element of vB in variable
avr. Then we perform counting using tcg-s count leading zeros
function. Since it counts leading zeros on 64 bit lenght, we have to
move ith word element to highest 32 bits of variable tmp, or it with
mask(so we get all ones in lowest 32 bits), then perform
tcg_gen_clzi_i64 and move it's result in appropriate word element of
variable result. In the end of each loop iteration we save variable
result to appropriate doubleword element of destination register vD.
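
In plain C, the per-word trick described above amounts to roughly this
(an illustrative sketch only; clz64() stands for a 64-bit count-leading-zeros
such as QEMU's clz64):

static uint32_t clzw_via_clz64(uint32_t w)
{
    /* park the word in the top half, force the low half to all ones */
    uint64_t t = ((uint64_t)w << 32) | 0xffffffffULL;
    return clz64(t);   /* equals clz32(w); result is in the range 0..32 */
}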

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 57 -
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 1c34908..7689739 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -878,6 +878,61 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzw VRT,VRB - Vector Count Leading Zeros Word
+ *
+ * Counting the number of leading zero bits of each word element in source
+ * register and placing result in appropriate word element of destination
+ * register.
+ */
+static void trans_vclzw(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffffffffULL);
+int i;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword element of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword element of vB in avr. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every word element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros over a 64-bit length, we have to move the
+ * ith word element to the highest 32 bits of tmp, or it with mask (so we get
+ * all ones in the lowest 32 bits), then perform tcg_gen_clzi_i64 and move
+ * its result to the appropriate word element of result.
+ */
+tcg_gen_shli_i64(tmp, avr, 32);
+tcg_gen_or_i64(tmp, tmp, mask);
+tcg_gen_clzi_i64(result, tmp, 64);
+
+tcg_gen_or_i64(tmp, avr, mask);
+tcg_gen_clzi_i64(tmp, tmp, 64);
+tcg_gen_deposit_i64(result, result, tmp, 32, 32);
+
+if (i == 0) {
+/* Place result in high doubleword element of vD. */
+set_avr64(VT, result, true);
+} else {
+/* Place result in low doubleword element of vD. */
+set_avr64(VT, result, false);
+}
+}
+
+tcg_temp_free_i64(avr);
+tcg_temp_free_i64(result);
+tcg_temp_free_i64(tmp);
+tcg_temp_free_i64(mask);
+}
+
+/*
  * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
  *
  * Counting the number of leading zero bits of each doubleword element in 
source
@@ -1413,7 +1468,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
-GEN_VXFORM_NOA(vclzw, 1, 30)
+GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
-- 
2.7.4




[Qemu-devel] [PATCH 5/8] target/ppc: Optimize emulation of vclzd instruction

2019-06-06 Thread Stefan Brankovic
Optimize Altivec instruction vclzd (Vector Count Leading Zeros Doubleword).
This instruction counts the number of leading zeros of each doubleword element
in the source register and places the result in the appropriate doubleword
element of the destination register.

We use tcg's count leading zeros operation two times (once for each
doubleword element of source register vB) and place the result in the
appropriate doubleword element of destination register vD.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 28 +++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 010f337..1c34908 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -877,6 +877,32 @@ static void trans_vgbbd(DisasContext *ctx)
 tcg_temp_free_i64(result2);
 }
 
+/*
+ * vclzd VRT,VRB - Vector Count Leading Zeros Doubleword
+ *
+ * Counting the number of leading zero bits of each doubleword element in 
source
+ * register and placing result in appropriate doubleword element of destination
+ * register.
+ */
+static void trans_vclzd(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+
+/* high doubleword */
+get_avr64(avr, VB, true);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, true);
+
+/* low doubleword */
+get_avr64(avr, VB, false);
+tcg_gen_clzi_i64(avr, avr, 64);
+set_avr64(VT, avr, false);
+
+tcg_temp_free_i64(avr);
+}
+
 GEN_VXFORM(vmuloub, 4, 0);
 GEN_VXFORM(vmulouh, 4, 1);
 GEN_VXFORM(vmulouw, 4, 2);
@@ -1388,7 +1414,7 @@ GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_NOA(vclzw, 1, 30)
-GEN_VXFORM_NOA(vclzd, 1, 31)
+GEN_VXFORM_TRANS(vclzd, 1, 31)
 GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
 GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
 GEN_VXFORM_NOA_2(vextsb2w, 1, 24, 16)
-- 
2.7.4




[Qemu-devel] [PATCH 0/8] Optimize emulation of ten Altivec instructions: lvsl,

2019-06-06 Thread Stefan Brankovic
This series builds on and complements the recent work of Thomas Murta, Mark
Cave-Ayland and Richard Henderson in the same area. It is based on devising TCG
translation implementations for selected instructions rather than using
helpers. The selected instructions are mostly idiosyncratic to the ppc
platform, so a relatively complex TCG translation (a direct mapping to a host
instruction is not possible in these cases) seems to be the best option, and
that approach is presented in this series. The performance improvements are
significant in all cases.

Stefan Brankovic (8):
  target/ppc: Optimize emulation of lvsl and lvsr instructions
  target/ppc: Optimize emulation of vsl and vsr instructions
  target/ppc: Optimize emulation of vpkpx instruction
  target/ppc: Optimize emulation of vgbbd instruction
  target/ppc: Optimize emulation of vclzd instruction
  target/ppc: Optimize emulation of vclzw instruction
  target/ppc: Optimize emulation of vclzh and vclzb instructions
  target/ppc: Refactor emulation of vmrgew and vmrgow instructions

 target/ppc/translate/vmx-impl.inc.c | 705 
 1 file changed, 636 insertions(+), 69 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH 1/8] target/ppc: Optimize emulation of lvsl and lvsr instructions

2019-06-06 Thread Stefan Brankovic
Adding simple macro that is calling tcg implementation of appropriate
instruction if altivec support is active.

Optimization of altivec instruction lvsl (Load Vector for Shift Left).
Place bytes sh:sh+15 of value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F
in destination register. Sh is calculated by adding 2 source registers and
getting bits 60-63 of result.

First we place bits [28-31] of EA to variable sh. After that
we create bytes sh:(sh+7) of X(from description) in for loop
(by incrementing sh in each iteration and placing it in
appropriate byte of variable result) and save them in higher
doubleword element of vD. We repeat this once again for lower
doubleword element of vD by creating bytes (sh+8):(sh+15) in
a for loop and saving result.

Optimization of altivec instruction lvsr (Load Vector for Shift Right).
Place bytes 16-sh:31-sh of value 0x00 || 0x01 || 0x02 || ... || 0x1E ||
0x1F in destination register. Sh is calculated by adding 2 source
registers and getting bits 60-63 of result.

First we place bits [28-31] of EA to variable sh. After that
we create bytes (16-sh):(23-sh) of X(from description) in for loop
(by incrementing sh in each iteration and placing it in
appropriate byte of variable result) and save them in higher
doubleword element of vD. We repeat this once again for lower
doubleword element of vD by creating bytes (24-sh):(32-sh) in
a for loop and saving result.

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 143 
 1 file changed, 111 insertions(+), 32 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index bd3ff40..140bb05 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -142,38 +142,6 @@ GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
 GEN_VR_STVE(wx, 0x07, 0x06, 4);
 
-static void gen_lvsl(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsl(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
-static void gen_lvsr(DisasContext *ctx)
-{
-TCGv_ptr rd;
-TCGv EA;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-EA = tcg_temp_new();
-gen_addr_reg_index(ctx, EA);
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_lvsr(rd, EA);
-tcg_temp_free(EA);
-tcg_temp_free_ptr(rd);
-}
-
 static void gen_mfvscr(DisasContext *ctx)
 {
 TCGv_i32 t;
@@ -316,6 +284,16 @@ static void glue(gen_, name)(DisasContext *ctx)
 \
 tcg_temp_free_ptr(rd);  \
 }
 
+#define GEN_VXFORM_TRANS(name, opc2, opc3)  \
+static void glue(gen_, name)(DisasContext *ctx) \
+{   \
+if (unlikely(!ctx->altivec_enabled)) {  \
+gen_exception(ctx, POWERPC_EXCP_VPU);   \
+return; \
+}   \
+trans_##name(ctx);  \
+}
+
 #define GEN_VXFORM_ENV(name, opc2, opc3)\
 static void glue(gen_, name)(DisasContext *ctx) \
 {   \
@@ -515,6 +493,105 @@ static void gen_vmrgow(DisasContext *ctx)
 tcg_temp_free_i64(avr);
 }
 
+/*
+ * lvsl VRT,RA,RB - Load Vector for Shift Left
+ *
+ * Let the EA be the sum (rA|0)+(rB). Let sh=EA[28-31].
+ * Let X be the 32-byte value 0x00 || 0x01 || 0x02 || ... || 0x1E || 0x1F.
+ * Bytes sh:sh+15 of X are placed into vD.
+ */
+static void trans_lvsl(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 sh = tcg_temp_new_i64();
+TCGv_i64 EA = tcg_temp_new();
+int i;
+
+/* Get sh(from description) by anding EA with 0xf. */
+gen_addr_reg_index(ctx, EA);
+tcg_gen_andi_i64(sh, EA, 0xfULL);
+/*
+ * Create bytes sh:sh+7 of X(from description) and place them in
+ * higher doubleword of vD.
+ */
+tcg_gen_addi_i64(result, sh, 7);
+for (i = 7; i >= 1; i--) {
+tcg_gen_shli_i64(tmp, sh, i * 8);
+tcg_gen_or_i64(result, result, tmp);
+tcg_gen_addi_i64(sh, sh, 1);
+}
+set_avr64(VT, result, true);
+/*
+ * Create bytes sh+8:sh+15 of X(from description) and place them in
+ * lower doubleword of vD.
+ */
+tcg_gen_addi_i64(result, sh, 8);
+

[Qemu-devel] [PATCH 7/8] target/ppc: Optimize emulation of vclzh and vclzb instructions

2019-06-06 Thread Stefan Brankovic
Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
This instruction counts the number of leading zeros of each halfword element
in source register and places result in the appropriate halfword element of
destination register.

In each iteration of outer for loop we perform count operation on one
doubleword elements of source register vB. In first iteration we place
higher doubleword element of vB in variable avr, then we perform count
for every halfword element using tcg_gen_clzi_i64. Since it counts
leading zeros on 64 bit lenght, we have to move ith byte element to
highest 16 bits of tmp, or it with mask(so we get all ones in lowest
48 bits), then perform tcg_gen_clzi_i64 and move it's result in
appropriate halfword element of result. We do this in inner for loop.
After operation is finished we save result in appropriate doubleword
element of destination register vD. We repeat this once again for
lower doubleword element of vB.

Optimize the Altivec instruction vclzb (Vector Count Leading Zeros Byte).
This instruction counts the number of leading zeros of each byte element
in the source register and places the result in the appropriate byte element
of the destination register.

In each iteration of the outer for loop we perform the count operation on one
doubleword element of source register vB. In the first iteration we place the
higher doubleword element of vB in variable avr, then we perform the count
for every byte element using tcg_gen_clzi_i64. Since it counts leading zeros
over a 64-bit length, we have to move the i-th byte element to the highest 8
bits of tmp, OR it with a mask (so we get all ones in the lowest 56 bits),
then perform tcg_gen_clzi_i64 and move its result into the appropriate byte
element of result. We do this in the inner for loop. After the operation is
finished we save the result in the appropriate doubleword element of
destination register vD. We repeat this once again for the lower doubleword
element of vB.
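
A minimal, hypothetical reference model of the per-element semantics described
above, written in plain C (it is not part of the patch): it holds the 128-bit
vector as two uint64_t doublewords, the same layout used by
get_avr64()/set_avr64(), and elem_bits selects byte (8) or halfword (16)
granularity.

#include <stdint.h>

/* Count leading zeros of the low 'width' bits of v (returns width if v == 0). */
static int clz_bits(uint64_t v, int width)
{
    int n = 0;
    int bit;

    for (bit = width - 1; bit >= 0 && !((v >> bit) & 1); bit--) {
        n++;
    }
    return n;
}

/* dst/src hold the two doublewords of a 128-bit register. */
static void ref_vclz(uint64_t dst[2], const uint64_t src[2], int elem_bits)
{
    int per_dword = 64 / elem_bits;
    uint64_t mask = (1ULL << elem_bits) - 1;
    int d, e;

    for (d = 0; d < 2; d++) {
        uint64_t out = 0;

        for (e = 0; e < per_dword; e++) {
            uint64_t elem = (src[d] >> (e * elem_bits)) & mask;

            out |= (uint64_t)clz_bits(elem, elem_bits) << (e * elem_bits);
        }
        dst[d] = out;
    }
}

ref_vclz(vd, vb, 8) then models vclzb and ref_vclz(vd, vb, 16) models vclzh,
element by element.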

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 122 +++-
 1 file changed, 120 insertions(+), 2 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 7689739..8535a31 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -878,6 +878,124 @@ static void trans_vgbbd(DisasContext *ctx)
 }
 
 /*
+ * vclzb VRT,VRB - Vector Count Leading Zeros Byte
+ *
+ * Counting the number of leading zero bits of each byte element in source
+ * register and placing result in appropriate byte element of destination
+ * register.
+ */
+static void trans_vclzb(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 mask = tcg_const_i64(0xffffffffffffffULL);
+int i, j;
+
+for (i = 0; i < 2; i++) {
+if (i == 0) {
+/* Get high doubleword of vB in avr. */
+get_avr64(avr, VB, true);
+} else {
+/* Get low doubleword of vB in avr. */
+get_avr64(avr, VB, false);
+}
+/*
+ * Perform count for every byte element using tcg_gen_clzi_i64.
+ * Since it counts leading zeros on a 64-bit length, we have to move the
+ * i-th byte element to the highest 8 bits of tmp, OR it with a mask (so we
+ * get all ones in the lowest 56 bits), then perform tcg_gen_clzi_i64 and
+ * move its result into the appropriate byte element of result.
+ */
+tcg_gen_shli_i64(tmp, avr, 56);
+tcg_gen_or_i64(tmp, tmp, mask);
+tcg_gen_clzi_i64(result, tmp, 64);
+for (j = 1; j < 7; j++) {
+tcg_gen_shli_i64(tmp, avr, (7 - j) * 8);
+tcg_gen_or_i64(tmp, tmp, mask);
+tcg_gen_clzi_i64(tmp, tmp, 64);
+tcg_gen_deposit_i64(result, result, tmp, j * 8, 8);
+}
+tcg_gen_or_i64(tmp, avr, mask);
+tcg_gen_clzi_i64(tmp, tmp, 64);
+tcg_gen_deposit_i64(result, result, tmp, 56, 8);
+if (i == 0) {
+/* Place result in high doubleword element of vD. */
+set_avr64(VT, result, true);
+} else {
+/* Place result in low doubleword element of vD. */
+set_avr64(VT, result, false);
+}
+}
+
+tcg_temp_free_i64(avr);
+tcg_temp_free_i64(result);
+tcg_temp_free_i64(tmp);
+tcg_temp_free_i64(mask);
+}
+
+/*
+ * vclzh VRT,VRB - Vector Count Leading Zeros Halfword
+ *
+ * Counting the number of leading zero bits of each halfword element in source
+ * register and placing result in appropriate halfword element of destination
+ * register.
+ */
+static void trans_vclzh(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avr = tcg_temp_new_i64();
+TCGv_i64 result = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+TCGv_i64 ma

[Qemu-devel] [PATCH 2/8] target/ppc: Optimize emulation of vsl and vsr instructions

2019-06-06 Thread Stefan Brankovic
Optimization of the Altivec instructions vsl and vsr (Vector Shift Left/Right).
Perform the shift operation (left and right respectively) on the 128-bit value
of register vA by the amount specified in bits 125-127 of register vB. The
lowest 3 bits in each byte element of register vB must be identical or the
result is undefined.

For the vsl instruction we do this by first saving bits 125-127
of register vB in variable sh. Then we save the highest sh bits
of the lower doubleword element of register vA in variable shifted,
so we don't lose those bits when we perform the shift operation on
the lower doubleword element of register vA, which is our next
step. After shifting the lower doubleword element we perform the shift
operation on the higher doubleword element of vA and replace the
lowest sh bits (that are now 0) with the bits saved in shifted.

For the vsr instruction we do this by first saving bits 125-127
of register vB in variable sh. Then we save the lowest sh bits
of the higher doubleword element of register vA in variable shifted,
so we don't lose those bits when we perform the shift operation on
the higher doubleword element of register vA, which is our next step.
After shifting the higher doubleword element we perform the shift
operation on the lower doubleword element of vA and replace the
highest sh bits (that are now 0) with the bits saved in shifted.
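
A minimal, hypothetical sketch of the same two shifts in plain C (not part of
the patch), assuming the 128-bit value is already split into its higher and
lower uint64_t doublewords and that sh is the common value of bits 125-127:

#include <stdint.h>

/* hi/lo are the higher/lower doubleword of the 128-bit value; sh is 0-7. */
static void ref_vsl(uint64_t *hi, uint64_t *lo, unsigned sh)
{
    uint64_t carried = sh ? *lo >> (64 - sh) : 0;   /* highest sh bits of lo */

    *hi = (*hi << sh) | carried;
    *lo <<= sh;
}

static void ref_vsr(uint64_t *hi, uint64_t *lo, unsigned sh)
{
    uint64_t carried = sh ? *hi << (64 - sh) : 0;   /* lowest sh bits of hi */

    *lo = (*lo >> sh) | carried;
    *hi >>= sh;
}

The TCG version below moves the same carried bits across the doubleword
boundary by shifting by (64 - sh); the sh == 0 special case here only avoids
shifting a 64-bit C integer by 64, which is undefined in C.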

Signed-off-by: Stefan Brankovic 
---
 target/ppc/translate/vmx-impl.inc.c | 101 +++-
 1 file changed, 99 insertions(+), 2 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 140bb05..6bd072a 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -592,6 +592,103 @@ static void trans_lvsr(DisasContext *ctx)
 tcg_temp_free(EA);
 }
 
+/*
+ * vsl VRT,VRA,VRB - Vector Shift Left
+ *
+ * Shifting left the 128-bit value of vA by the value specified in bits 125-127 of vB.
+ * Lowest 3 bits in each byte element of register vB must be identical or
+ * result is undefined.
+ */
+static void trans_vsl(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avrA = tcg_temp_new_i64();
+TCGv_i64 avrB = tcg_temp_new_i64();
+TCGv_i64 sh = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+
+/* Place bits 125-127 of vB in sh. */
+get_avr64(avrB, VB, false);
+tcg_gen_andi_i64(sh, avrB, 0x07ULL);
+
+/*
+ * Save highest sh bits of lower doubleword element of vA in variable
+ * shifted and perform shift on lower doubleword.
+ */
+get_avr64(avrA, VA, false);
+tcg_gen_subi_i64(tmp, sh, 64);
+tcg_gen_neg_i64(tmp, tmp);
+tcg_gen_shr_i64(shifted, avrA, tmp);
+tcg_gen_shl_i64(avrA, avrA, sh);
+set_avr64(VT, avrA, false);
+
+/*
+ * Perform shift on higher doubleword element of vA and replace lowest
+ * sh bits with shifted.
+ */
+get_avr64(avrA, VA, true);
+tcg_gen_shl_i64(avrA, avrA, sh);
+tcg_gen_or_i64(avrA, avrA, shifted);
+set_avr64(VT, avrA, true);
+
+tcg_temp_free_i64(avrA);
+tcg_temp_free_i64(avrB);
+tcg_temp_free_i64(sh);
+tcg_temp_free_i64(shifted);
+tcg_temp_free_i64(tmp);
+}
+
+/*
+ * vsr VRT,VRA,VRB - Vector Shift Right
+ *
+ * Shifting right the 128-bit value of vA by the value specified in bits 125-127 of vB.
+ * Lowest 3 bits in each byte element of register vB must be identical or
+ * result is undefined.
+ */
+static void trans_vsr(DisasContext *ctx)
+{
+int VT = rD(ctx->opcode);
+int VA = rA(ctx->opcode);
+int VB = rB(ctx->opcode);
+TCGv_i64 avrA = tcg_temp_new_i64();
+TCGv_i64 avrB = tcg_temp_new_i64();
+TCGv_i64 sh = tcg_temp_new_i64();
+TCGv_i64 shifted = tcg_temp_new_i64();
+TCGv_i64 tmp = tcg_temp_new_i64();
+
+/* Place bits 125-127 of vB in sh. */
+get_avr64(avrB, VB, false);
+tcg_gen_andi_i64(sh, avrB, 0x07ULL);
+
+/*
+ * Save lowest sh bits of higher doubleword element of vA in variable
+ * shifted and perform shift on higher doubleword.
+ */
+get_avr64(avrA, VA, true);
+tcg_gen_subi_i64(tmp, sh, 64);
+tcg_gen_neg_i64(tmp, tmp);
+tcg_gen_shl_i64(shifted, avrA, tmp);
+tcg_gen_shr_i64(avrA, avrA, sh);
+set_avr64(VT, avrA, true);
+/*
+ * Perform shift on lower doubleword element of vA and replace highest
+ * sh bits with shifted.
+ */
+get_avr64(avrA, VA, false);
+tcg_gen_shr_i64(avrA, avrA, sh);
+tcg_gen_or_i64(avrA, avrA, shifted);
+set_avr64(VT, avrA, false);
+
+tcg_temp_free_i64(avrA);
+tcg_temp_free_i64(avrB);
+tcg_temp_free_i64(sh);
+tcg_temp_free_i64(shifted);
+tcg_temp_free_i64(tmp);
+}
+
 GEN_VXFORM(vmuloub, 4, 0);
 GEN_VXFORM(vmulouh, 4, 1);
 GEN_VXFORM(vmulouw, 4, 2);
@@ -699,11 +796,11 @@ GEN_VXFORM(vrld, 2, 3);
 GEN_VXFORM(vrldmi, 2, 3);
 GEN_VXFORM_DUAL(vrld, PPC_NONE, PPC2_ALTIVEC_207, \
 vrldmi, PPC_NONE, PPC2_ISA300)
-GE

[Qemu-devel] [PATCH 1/2] target/tilegx: Implement emulation of TILEGX instructions V1CMPLEU and V1CMPLTU

2019-03-11 Thread Stefan Brankovic
Implement emulation of the TILEGX instructions V1CMPLEU and V1CMPLTU
using TCG front-end operations.
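
A rough, hypothetical reference for the intended per-byte semantics, in plain C
rather than TCG (not part of the patch): each unsigned byte of srca is compared
with the corresponding byte of srcb and the 0/1 outcome is written into the
matching byte of the destination. The V2CMPLEU/V2CMPLTU patch later in this
series applies the same scheme to four 16-bit halfwords.

#include <stdint.h>

/* less_than != 0 models V1CMPLTU, otherwise V1CMPLEU. */
static uint64_t ref_v1cmpu(uint64_t srca, uint64_t srcb, int less_than)
{
    uint64_t dest = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint8_t a = srca >> (i * 8);
        uint8_t b = srcb >> (i * 8);
        int r = less_than ? (a < b) : (a <= b);

        dest |= (uint64_t)r << (i * 8);
    }
    return dest;
}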

Signed-off-by: Stefan Brankovic 
---
 target/tilegx/translate.c | 62 ---
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/target/tilegx/translate.c b/target/tilegx/translate.c
index f201150..396c33e 100644
--- a/target/tilegx/translate.c
+++ b/target/tilegx/translate.c
@@ -491,6 +491,56 @@ static TileExcp gen_specill(DisasContext *dc, unsigned 
dest, unsigned srca,
 return gen_signal(dc, signo, sigcode, mnemonic);
 }
 
+static void gen_v1cmpleu(TCGv tdest, TCGv tsrca, TCGv tsrcb)
+{
+TCGv_64 t_sa = tcg_temp_new();
+TCGv_64 t_sb = tcg_temp_new();
+TCGv_64 t_d = tcg_temp_new();
+int64_t mask = 0xffULL;
+int64_t mask1 = 0x1ULL;
+int i;
+
+tcg_gen_movi_i64(tdest, 0x0ULL);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(t_sa, tsrca, mask);
+tcg_gen_andi_i64(t_sb, tsrcb, mask);
+tcg_gen_setcond_i64(TCG_COND_LEU, t_d, t_sa, t_sb);
+tcg_gen_andi_i64(t_d, t_d, mask1);
+tcg_gen_or_i64(tdest, tdest, t_d);
+mask = mask << 8;
+mask1 = mask1 << 8;
+}
+
+tcg_temp_free(t_sa);
+tcg_temp_free(t_sb);
+tcg_temp_free(t_d);
+}
+
+static void gen_v1cmpltu(TCGv tdest, TCGv tsrca, TCGv tsrcb)
+{
+TCGv_64 t_sa = tcg_temp_new();
+TCGv_64 t_sb = tcg_temp_new();
+TCGv_64 t_d = tcg_temp_new();
+int64_t mask = 0xffULL;
+int64_t mask1 = 0x1ULL;
+int i;
+
+tcg_gen_movi_i64(tdest, 0x0ULL);
+for (i = 0; i < 8; i++) {
+tcg_gen_andi_i64(t_sa, tsrca, mask);
+tcg_gen_andi_i64(t_sb, tsrcb, mask);
+tcg_gen_setcond_i64(TCG_COND_LTU, t_d, t_sa, t_sb);
+tcg_gen_andi_i64(t_d, t_d, mask1);
+tcg_gen_or_i64(tdest, tdest, t_d);
+mask = mask << 8;
+mask1 = mask1 << 8;
+}
+
+tcg_temp_free(t_sa);
+tcg_temp_free(t_sb);
+tcg_temp_free(t_d);
+}
+
 static TileExcp gen_rr_opcode(DisasContext *dc, unsigned opext,
   unsigned dest, unsigned srca, uint64_t bundle)
 {
@@ -1247,12 +1297,8 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, 
unsigned opext,
 break;
 case OE_RRR(V1CMPLES, 0, X0):
 case OE_RRR(V1CMPLES, 0, X1):
-case OE_RRR(V1CMPLEU, 0, X0):
-case OE_RRR(V1CMPLEU, 0, X1):
 case OE_RRR(V1CMPLTS, 0, X0):
 case OE_RRR(V1CMPLTS, 0, X1):
-case OE_RRR(V1CMPLTU, 0, X0):
-case OE_RRR(V1CMPLTU, 0, X1):
 return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
 case OE_RRR(V1CMPNE, 0, X0):
 case OE_RRR(V1CMPNE, 0, X1):
@@ -1260,6 +1306,14 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, 
unsigned opext,
 gen_v1cmpne0(tdest);
 mnemonic = "v1cmpne";
 break;
+case OE_RRR(V1CMPLEU, 0, X0):
+case OE_RRR(V1CMPLEU, 0, X1):
+gen_v1cmpleu(tdest, tsrca, tsrcb);
+break;
+case OE_RRR(V1CMPLTU, 0, X0):
+case OE_RRR(V1CMPLTU, 0, X1):
+gen_v1cmpltu(tdest, tsrca, tsrcb);
+break;
 case OE_RRR(V1DDOTPUA, 0, X0):
 case OE_RRR(V1DDOTPUSA, 0, X0):
 case OE_RRR(V1DDOTPUS, 0, X0):
-- 
2.7.4




[Qemu-devel] [PATCH 0/2] Add support for some comparison instructions

2019-03-11 Thread Stefan Brankovic
Implement emulation of TILE-Gx instructions V1CMPLEU, V1CMPLTU,
V2CMPLEU, and V2CMPLTU.

Stefan Brankovic (2):
  target/tilegx: Implement emulation of TILEGX instructions V1CMPLEU and
V1CMPLTU
  target/tilegx: Implement emulation of TILEGX instructions V2CMPLEU and
V2CMPLTU

 target/tilegx/translate.c | 124 +++---
 1 file changed, 116 insertions(+), 8 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH 2/2] target/tilegx: Implement emulation of TILEGX instructions V2CMPLEU and V2CMPLTU

2019-03-11 Thread Stefan Brankovic
Implement emulation of the TILEGX instructions V2CMPLEU and V2CMPLTU
using TCG front-end operations.

Signed-off-by: Stefan Brankovic 
---
 target/tilegx/translate.c | 62 ---
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/target/tilegx/translate.c b/target/tilegx/translate.c
index 396c33e..6e0dc44 100644
--- a/target/tilegx/translate.c
+++ b/target/tilegx/translate.c
@@ -541,6 +541,56 @@ static void gen_v1cmpltu(TCGv tdest, TCGv tsrca, TCGv 
tsrcb)
 tcg_temp_free(t_d);
 }
 
+static void gen_v2cmpleu(TCGv tdest, TCGv tsrca, TCGv tsrcb)
+{
+TCGv_64 t_sa = tcg_temp_new();
+TCGv_64 t_sb = tcg_temp_new();
+TCGv_64 t_d = tcg_temp_new();
+int64_t mask = 0xffffULL;
+int64_t mask1 = 0x1ULL;
+int i;
+
+tcg_gen_movi_i64(tdest, 0x0ULL);
+for (i = 0; i < 4; i++) {
+tcg_gen_andi_i64(t_sa, tsrca, mask);
+tcg_gen_andi_i64(t_sb, tsrcb, mask);
+tcg_gen_setcond_i64(TCG_COND_LEU, t_d, t_sa, t_sb);
+tcg_gen_andi_i64(t_d, t_d, mask1);
+tcg_gen_or_i64(tdest, tdest, t_d);
+mask = mask << 16;
+mask1 = mask1 << 16;
+}
+
+tcg_temp_free(t_sa);
+tcg_temp_free(t_sb);
+tcg_temp_free(t_d);
+}
+
+static void gen_v2cmpltu(TCGv tdest, TCGv tsrca, TCGv tsrcb)
+{
+TCGv_64 t_sa = tcg_temp_new();
+TCGv_64 t_sb = tcg_temp_new();
+TCGv_64 t_d = tcg_temp_new();
+int64_t mask = 0xffffULL;
+int64_t mask1 = 0x1ULL;
+int i;
+
+tcg_gen_movi_i64(tdest, 0x0ULL);
+for (i = 0; i < 4; i++) {
+tcg_gen_andi_i64(t_sa, tsrca, mask);
+tcg_gen_andi_i64(t_sb, tsrcb, mask);
+tcg_gen_setcond_i64(TCG_COND_LTU, t_d, t_sa, t_sb);
+tcg_gen_andi_i64(t_d, t_d, mask1);
+tcg_gen_or_i64(tdest, tdest, t_d);
+mask = mask << 16;
+mask1 = mask1 << 16;
+}
+
+tcg_temp_free(t_sa);
+tcg_temp_free(t_sb);
+tcg_temp_free(t_d);
+}
+
 static TileExcp gen_rr_opcode(DisasContext *dc, unsigned opext,
   unsigned dest, unsigned srca, uint64_t bundle)
 {
@@ -1390,17 +1440,21 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, 
unsigned opext,
 case OE_RRR(V2CMPEQ, 0, X1):
 case OE_RRR(V2CMPLES, 0, X0):
 case OE_RRR(V2CMPLES, 0, X1):
-case OE_RRR(V2CMPLEU, 0, X0):
-case OE_RRR(V2CMPLEU, 0, X1):
 case OE_RRR(V2CMPLTS, 0, X0):
 case OE_RRR(V2CMPLTS, 0, X1):
-case OE_RRR(V2CMPLTU, 0, X0):
-case OE_RRR(V2CMPLTU, 0, X1):
 case OE_RRR(V2CMPNE, 0, X0):
 case OE_RRR(V2CMPNE, 0, X1):
 case OE_RRR(V2DOTPA, 0, X0):
 case OE_RRR(V2DOTP, 0, X0):
 return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+case OE_RRR(V2CMPLEU, 0, X0):
+case OE_RRR(V2CMPLEU, 0, X1):
+gen_v2cmpleu(tdest, tsrca, tsrcb);
+break;
+case OE_RRR(V2CMPLTU, 0, X0):
+case OE_RRR(V2CMPLTU, 0, X1):
+gen_v2cmpltu(tdest, tsrca, tsrcb);
+break;
 case OE_RRR(V2INT_H, 0, X0):
 case OE_RRR(V2INT_H, 0, X1):
 gen_helper_v2int_h(tdest, tsrca, tsrcb);
-- 
2.7.4