Re: [PATCH v2 2/3] target/ppc: Implement Vector Extract Mask

2021-12-03 Thread Richard Henderson

On 12/3/21 5:00 AM, Richard Henderson wrote:

On 11/12/21 6:14 AM, matheus.fe...@eldorado.org.br wrote:

From: Matheus Ferst 

Implement the following PowerISA v3.1 instructions:
vextractbm: Vector Extract Byte Mask
vextracthm: Vector Extract Halfword Mask
vextractwm: Vector Extract Word Mask
vextractdm: Vector Extract Doubleword Mask
vextractqm: Vector Extract Quadword Mask

Suggested-by: Richard Henderson 
Signed-off-by: Matheus Ferst 
---
v2:
- Applied rth suggestion to do_vextractm
---
  target/ppc/insn32.decode    |  6 +++
  target/ppc/translate/vmx-impl.c.inc | 60 +
  2 files changed, 66 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 9a28f1d266..639ac22bf0 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -419,6 +419,12 @@ VEXPANDWM       000100 ..... 00010 ..... 11001000010    @VX_tb
  VEXPANDDM       000100 ..... 00011 ..... 11001000010    @VX_tb
  VEXPANDQM       000100 ..... 00100 ..... 11001000010    @VX_tb
+VEXTRACTBM      000100 ..... 01000 ..... 11001000010    @VX_tb
+VEXTRACTHM      000100 ..... 01001 ..... 11001000010    @VX_tb
+VEXTRACTWM      000100 ..... 01010 ..... 11001000010    @VX_tb
+VEXTRACTDM      000100 ..... 01011 ..... 11001000010    @VX_tb
+VEXTRACTQM      000100 ..... 01100 ..... 11001000010    @VX_tb
+
  # VSX Load/Store Instructions
  LXV             111101 ..... ..... ............ . 001   @DQ_TSX
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 58aca58f0f..dd7337c2f2 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1539,6 +1539,66 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb 
*a)
  return true;
  }
+static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
+{
+    const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece;
+    TCGv_i64 t, b, tmp;
+
+    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+    REQUIRE_VECTOR(ctx);
+
+    t = tcg_const_i64(0);
+    b = tcg_temp_new_i64();
+    tmp = tcg_temp_new_i64();
+
+    for (int w = 0; w < 2; w++) {
+    get_avr64(b, a->vrb, w);
+
+    for (int i = 0; i < elem_count_half; i++) {
+    int in_bit = (i + 1) * elem_width - 1;
+    int out_bit = w * elem_count_half + i;
+
+    if (in_bit > out_bit) {
+    tcg_gen_shri_i64(tmp, b, in_bit - out_bit);
+    } else {
+    tcg_gen_shli_i64(tmp, b, out_bit - in_bit);
+    }
+    tcg_gen_andi_i64(tmp, tmp, 1 << out_bit);
+    tcg_gen_or_i64(t, t, tmp);
+    }
+    }
+    tcg_gen_trunc_i64_tl(cpu_gpr[a->vrt], t);


Pardon me.  I realized after the fact that we can run the same algorithm as for mtvsrm (in 
the next patch) in reverse.


   & dup(1)
...a...b...c...d...e...f...g...h
   >> 32 - 4
...a...b...c...d
   |
...a...b...c...d...a...e...b...f...c...g...d...h
   >> 16 - 2
.a...b...c...d...a...e...b...f..
   |
...a...b.a.c.b.d...a.c.e...b.d.f.a.c.e.g.b.d.f.h
   >> 8 - 1
..a...b.a.c.b.d...a.c.e...b.d.f.a.c.e.g.
   |
...a..ab.abcabcd...abcde..abcdef.abcdefgabcdefgh
   & 0xff
abcdefgh

where one of the two final masks can be done via deposit:

     tcg_gen_andi_i64(hi, hi, 0xff);
     tcg_gen_deposit_i64(lo, lo, hi, 8, 56);

Which will reduce the instruction count of this implementation by half.


Oops, ENOCOFFEE.  Of course the input bit comes from the msb of the element, not the lsb. 
 Three different options:


(1) Begin with a shift of elem_count_half - 1, then do the above,

(2) Change the initial mask to the msb, then extract from elem_count_half - 1.

(3) Do left shifts so that we collect the bits at the msb of
the word.  This probably results in the easiest concatenation
in the end:

tcg_gen_shri_i64(hi, hi, 64 - elem_count_half);
tcg_gen_extract2_i64(lo, lo, hi, 64 - 2 * elem_count_half);


r~



Re: [PATCH v2 2/3] target/ppc: Implement Vector Extract Mask

2021-12-03 Thread Richard Henderson

On 11/12/21 6:14 AM, matheus.fe...@eldorado.org.br wrote:

From: Matheus Ferst 

Implement the following PowerISA v3.1 instructions:
vextractbm: Vector Extract Byte Mask
vextracthm: Vector Extract Halfword Mask
vextractwm: Vector Extract Word Mask
vextractdm: Vector Extract Doubleword Mask
vextractqm: Vector Extract Quadword Mask

Suggested-by: Richard Henderson 
Signed-off-by: Matheus Ferst 
---
v2:
- Applied rth suggestion to do_vextractm
---
  target/ppc/insn32.decode|  6 +++
  target/ppc/translate/vmx-impl.c.inc | 60 +
  2 files changed, 66 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 9a28f1d266..639ac22bf0 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -419,6 +419,12 @@ VEXPANDWM       000100 ..... 00010 ..... 11001000010    @VX_tb
  VEXPANDDM       000100 ..... 00011 ..... 11001000010    @VX_tb
  VEXPANDQM       000100 ..... 00100 ..... 11001000010    @VX_tb
  
+VEXTRACTBM      000100 ..... 01000 ..... 11001000010    @VX_tb
+VEXTRACTHM      000100 ..... 01001 ..... 11001000010    @VX_tb
+VEXTRACTWM      000100 ..... 01010 ..... 11001000010    @VX_tb
+VEXTRACTDM      000100 ..... 01011 ..... 11001000010    @VX_tb
+VEXTRACTQM      000100 ..... 01100 ..... 11001000010    @VX_tb
+
  # VSX Load/Store Instructions
  
  LXV             111101 ..... ..... ............ . 001   @DQ_TSX

diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 58aca58f0f..dd7337c2f2 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1539,6 +1539,66 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb 
*a)
  return true;
  }
  
+static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece)

+{
+const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece;
+TCGv_i64 t, b, tmp;
+
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VECTOR(ctx);
+
+t = tcg_const_i64(0);
+b = tcg_temp_new_i64();
+tmp = tcg_temp_new_i64();
+
+for (int w = 0; w < 2; w++) {
+get_avr64(b, a->vrb, w);
+
+for (int i = 0; i < elem_count_half; i++) {
+int in_bit = (i + 1) * elem_width - 1;
+int out_bit = w * elem_count_half + i;
+
+if (in_bit > out_bit) {
+tcg_gen_shri_i64(tmp, b, in_bit - out_bit);
+} else {
+tcg_gen_shli_i64(tmp, b, out_bit - in_bit);
+}
+tcg_gen_andi_i64(tmp, tmp, 1 << out_bit);
+tcg_gen_or_i64(t, t, tmp);
+}
+}
+tcg_gen_trunc_i64_tl(cpu_gpr[a->vrt], t);


Pardon me.  I realized after the fact that we can run the same algorithm as for mtvsrm (in 
the next patch) in reverse.


  & dup(1)
...a...b...c...d...e...f...g...h
  >> 32 - 4
...a...b...c...d
  |
...a...b...c...d...a...e...b...f...c...g...d...h
  >> 16 - 2
.a...b...c...d...a...e...b...f..
  |
...a...b.a.c.b.d...a.c.e...b.d.f.a.c.e.g.b.d.f.h
  >> 8 - 1
..a...b.a.c.b.d...a.c.e...b.d.f.a.c.e.g.
  |
...a..ab.abcabcd...abcde..abcdef.abcdefgabcdefgh
  & 0xff
abcdefgh

where one of the two final masks can be done via deposit:

tcg_gen_andi_i64(hi, hi, 0xff);
tcg_gen_deposit_i64(lo, lo, hi, 8, 56);

Which will reduce the instruction count of this implementation by half.


r~



[PATCH v2 2/3] target/ppc: Implement Vector Extract Mask

2021-11-12 Thread matheus . ferst
From: Matheus Ferst 

Implement the following PowerISA v3.1 instructions:
vextractbm: Vector Extract Byte Mask
vextracthm: Vector Extract Halfword Mask
vextractwm: Vector Extract Word Mask
vextractdm: Vector Extract Doubleword Mask
vextractqm: Vector Extract Quadword Mask

Suggested-by: Richard Henderson 
Signed-off-by: Matheus Ferst 
---
v2:
- Applied rth suggestion to do_vextractm
---
 target/ppc/insn32.decode|  6 +++
 target/ppc/translate/vmx-impl.c.inc | 60 +
 2 files changed, 66 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 9a28f1d266..639ac22bf0 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -419,6 +419,12 @@ VEXPANDWM       000100 ..... 00010 ..... 11001000010    @VX_tb
 VEXPANDDM       000100 ..... 00011 ..... 11001000010    @VX_tb
 VEXPANDQM       000100 ..... 00100 ..... 11001000010    @VX_tb
 
+VEXTRACTBM      000100 ..... 01000 ..... 11001000010    @VX_tb
+VEXTRACTHM      000100 ..... 01001 ..... 11001000010    @VX_tb
+VEXTRACTWM      000100 ..... 01010 ..... 11001000010    @VX_tb
+VEXTRACTDM      000100 ..... 01011 ..... 11001000010    @VX_tb
+VEXTRACTQM      000100 ..... 01100 ..... 11001000010    @VX_tb
+
 # VSX Load/Store Instructions
 
 LXV             111101 ..... ..... ............ . 001   @DQ_TSX
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index 58aca58f0f..dd7337c2f2 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1539,6 +1539,66 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb *a)
 return true;
 }
 
+/*
+ * Translate vextract{b,h,w,d}m (Vector Extract Byte/Halfword/Word/Doubleword
+ * Mask): collect the most significant bit of every element of VRB into the
+ * low-order bits of GPR[RT], one result bit per element.
+ *
+ * @ctx:  translation context, consumed by the REQUIRE_* checks.
+ * @a:    decoded VX_tb arguments; a->vrb is the source vector register and
+ *        a->vrt the destination GPR.
+ * @vece: log2 of the element size in bytes (MO_8 .. MO_64 from the TRANS
+ *        instantiations below).
+ *
+ * Returns true once the TCG ops have been emitted.  The REQUIRE_* macros
+ * are defined elsewhere and presumably return early when the instruction
+ * or the vector facility is unavailable.
+ *
+ * Three TCG ops (shift, and, or) are emitted per element.
+ */
+static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
+{
+    const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece;
+    TCGv_i64 t, b, tmp;
+
+    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+    REQUIRE_VECTOR(ctx);
+
+    t = tcg_const_i64(0);       /* accumulates the extracted mask */
+    b = tcg_temp_new_i64();     /* current 64-bit half of VRB */
+    tmp = tcg_temp_new_i64();
+
+    /*
+     * w is handed to get_avr64() as its third argument.  The half fetched
+     * with w == 1 supplies the upper result bits, so it is presumably the
+     * most significant doubleword -- confirm against get_avr64().
+     */
+    for (unsigned w = 0; w < 2; w++) {
+        get_avr64(b, a->vrb, w);
+
+        for (unsigned i = 0; i < elem_count_half; i++) {
+            /* msb of element i inside this 64-bit half */
+            const unsigned in_bit = (i + 1) * elem_width - 1;
+            /* position of that bit in the result */
+            const unsigned out_bit = w * elem_count_half + i;
+
+            /* Move in_bit onto out_bit; the direction depends on order. */
+            if (in_bit > out_bit) {
+                tcg_gen_shri_i64(tmp, b, in_bit - out_bit);
+            } else {
+                tcg_gen_shli_i64(tmp, b, out_bit - in_bit);
+            }
+            /* Build the mask in 64 bits: it is a 64-bit immediate. */
+            tcg_gen_andi_i64(tmp, tmp, 1ULL << out_bit);
+            tcg_gen_or_i64(t, t, tmp);
+        }
+    }
+    tcg_gen_trunc_i64_tl(cpu_gpr[a->vrt], t);
+
+    tcg_temp_free_i64(t);
+    tcg_temp_free_i64(b);
+    tcg_temp_free_i64(tmp);
+
+    return true;
+}
+
+TRANS(VEXTRACTBM, do_vextractm, MO_8)
+TRANS(VEXTRACTHM, do_vextractm, MO_16)
+TRANS(VEXTRACTWM, do_vextractm, MO_32)
+TRANS(VEXTRACTDM, do_vextractm, MO_64)
+
+/*
+ * Translate vextractqm (Vector Extract Quadword Mask).
+ *
+ * With a single 128-bit element the mask is one bit: bit 63 of the
+ * doubleword fetched with get_avr64(..., true) -- presumably the most
+ * significant half of VRB -- is moved to bit 0 and written to GPR[RT].
+ *
+ * Returns true once the TCG ops have been emitted.
+ */
+static bool trans_VEXTRACTQM(DisasContext *ctx, arg_VX_tb *a)
+{
+    TCGv_i64 msb;
+
+    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+    REQUIRE_VECTOR(ctx);
+
+    msb = tcg_temp_new_i64();
+    get_avr64(msb, a->vrb, true);
+    /* Keep only the top bit, shifted down to bit 0. */
+    tcg_gen_shri_i64(msb, msb, 63);
+    tcg_gen_trunc_i64_tl(cpu_gpr[a->vrt], msb);
+    tcg_temp_free_i64(msb);
+
+    return true;
+}
+
 #define GEN_VAFORM_PAIRED(name0, name1, opc2)   \
 static void glue(gen_, name0##_##name1)(DisasContext *ctx)  \
 {   \
-- 
2.25.1