Hi David,

Thanks for the review!

> The patch looks fine to me, but I'll let Segher decide if it addresses
> his requested changes.
> 
> I'm trying to be stricter about the test cases.
> 
> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-O2" } */
> 
> Why does this test has_arch_pwr9 instead of adding -mdejagnu-cpu=power9?
> 

I thought using -mdejagnu-cpu=power9 would force the case run with
power9 cpu all the time, while using has_arch_pwr9 seems to be more
flexible, it can be compiled with power9 or later (like -mcpu=power10),
we can check whether we generate unexpected code on power10 or later.
Does it sound good?

> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile { target powerpc_p8vector_ok } } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
> 
> Please place powerpc_p8vector_ok on a separate dg-require-effective-target 
> line.
> 

Done.

> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target p8vector_hw } */
> +/* { dg-options "-O2" } */
> 
> Doesn't this need -mdejagnu-cpu=power8?

Thanks for catching!  Yes, it needs.  I was thinking to use one
case for both Power8 and Power9 runs, it passed the testings on
both machines.  But your question made me realize that it's
incorrect when we are doing testing on Power8 but pass some
external option like -mcpu=power9, it can generate power9 insns
which are illegal on the machine.

The new version leaves this case for Power8 run test and add one
more pr96933-4.c for Power9 run test.

BR,
Kewen
-----
gcc/ChangeLog:

        PR target/96933
        * config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
        instructions for vector construction with char/short types.
        * config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
        (p8_mtvsrd_v16qidi2): Likewise. 

gcc/testsuite/ChangeLog:

        PR target/96933
        * gcc.target/powerpc/pr96933-1.c: New test.
        * gcc.target/powerpc/pr96933-2.c: New test.
        * gcc.target/powerpc/pr96933-3.c: New test.
        * gcc.target/powerpc/pr96933-4.c: New test.
        * gcc.target/powerpc/pr96933.h: New test.
        * gcc.target/powerpc/pr96933-run.h: New test.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index ca5b71ecdd3..b641ee49a49 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6411,11 +6411,11 @@ rs6000_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
-  int n_elts = GET_MODE_NUNITS (mode);
+  unsigned int n_elts = GET_MODE_NUNITS (mode);
   int n_var = 0, one_var = -1;
   bool all_same = true, all_const_zero = true;
   rtx x, mem;
-  int i;
+  unsigned int i;
 
   for (i = 0; i < n_elts; ++i)
     {
@@ -6681,6 +6681,181 @@ rs6000_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
+    {
+      rtx op[16];
+      /* Force the values into word_mode registers.  */
+      for (i = 0; i < n_elts; i++)
+       {
+         rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+         if (TARGET_POWERPC64)
+           {
+             op[i] = gen_reg_rtx (DImode);
+             emit_insn (gen_zero_extendqidi2 (op[i], tmp));
+           }
+         else
+           {
+             op[i] = gen_reg_rtx (SImode);
+             emit_insn (gen_zero_extendqisi2 (op[i], tmp));
+           }
+       }
+
+      /* Take unsigned char big endianness on 64bit as example for below
+        construction, the input values are: A, B, C, D, ..., O, P.  */
+
+      if (TARGET_DIRECT_MOVE_128)
+       {
+         /* Move to VSX register with vec_concat, each has 2 values.
+            eg: vr1[0] = { xxxxxxxA, xxxxxxxB };
+                vr1[1] = { xxxxxxxC, xxxxxxxD };
+                ...
+                vr1[7] = { xxxxxxxO, xxxxxxxP };  */
+         rtx vr1[8];
+         for (i = 0; i < n_elts / 2; i++)
+           {
+             vr1[i] = gen_reg_rtx (V2DImode);
+             emit_insn (gen_vsx_concat_v2di (vr1[i], op[i * 2],
+                                             op[i * 2 + 1]));
+           }
+
+         /* Pack vectors with 2 values into vectors with 4 values.
+            eg: vr2[0] = { xxxAxxxB, xxxCxxxD };
+                vr2[1] = { xxxExxxF, xxxGxxxH };
+                vr2[1] = { xxxIxxxJ, xxxKxxxL };
+                vr2[3] = { xxxMxxxN, xxxOxxxP };  */
+         rtx vr2[4];
+         for (i = 0; i < n_elts / 4; i++)
+           {
+             vr2[i] = gen_reg_rtx (V4SImode);
+             emit_insn (gen_altivec_vpkudum (vr2[i], vr1[i * 2],
+                                             vr1[i * 2 + 1]));
+           }
+
+         /* Pack vectors with 4 values into vectors with 8 values.
+            eg: vr3[0] = { xAxBxCxD, xExFxGxH };
+                vr3[1] = { xIxJxKxL, xMxNxOxP };  */
+         rtx vr3[2];
+         for (i = 0; i < n_elts / 8; i++)
+           {
+             vr3[i] = gen_reg_rtx (V8HImode);
+             emit_insn (gen_altivec_vpkuwum (vr3[i], vr2[i * 2],
+                                             vr2[i * 2 + 1]));
+           }
+
+         /* If it's V8HImode, it's done and return it. */
+         if (mode == V8HImode)
+           {
+             emit_insn (gen_rtx_SET (target, vr3[0]));
+             return;
+           }
+
+         /* Pack vectors with 8 values into 16 values.  */
+         rtx res = gen_reg_rtx (V16QImode);
+         emit_insn (gen_altivec_vpkuhum (res, vr3[0], vr3[1]));
+         emit_insn (gen_rtx_SET (target, res));
+       }
+      else
+       {
+         rtx (*merge_v16qi) (rtx, rtx, rtx) = NULL;
+         rtx (*merge_v8hi) (rtx, rtx, rtx) = NULL;
+         rtx (*merge_v4si) (rtx, rtx, rtx) = NULL;
+         rtx perm_idx;
+
+         /* Set up some common gen routines and values.  */
+         if (BYTES_BIG_ENDIAN)
+           {
+             if (mode == V16QImode)
+               {
+                 merge_v16qi = gen_altivec_vmrghb;
+                 merge_v8hi = gen_altivec_vmrglh;
+               }
+             else
+               merge_v8hi = gen_altivec_vmrghh;
+
+             merge_v4si = gen_altivec_vmrglw;
+             perm_idx = GEN_INT (3);
+           }
+         else
+           {
+             if (mode == V16QImode)
+               {
+                 merge_v16qi = gen_altivec_vmrglb;
+                 merge_v8hi = gen_altivec_vmrghh;
+               }
+             else
+               merge_v8hi = gen_altivec_vmrglh;
+
+             merge_v4si = gen_altivec_vmrghw;
+             perm_idx = GEN_INT (0);
+           }
+
+         /* Move to VSX register with direct move.
+            eg: vr_qi[0] = { xxxxxxxA, xxxxxxxx };
+                vr_qi[1] = { xxxxxxxB, xxxxxxxx };
+                ...
+                vr_qi[15] = { xxxxxxxP, xxxxxxxx };  */
+         rtx vr_qi[16];
+         for (i = 0; i < n_elts; i++)
+           {
+             vr_qi[i] = gen_reg_rtx (V16QImode);
+             if (TARGET_POWERPC64)
+               emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
+             else
+               emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
+           }
+
+         /* Merge/move to vector short.
+            eg: vr_hi[0] = { xxxxxxxx, xxxxxxAB };
+                vr_hi[1] = { xxxxxxxx, xxxxxxCD };
+                ...
+                vr_hi[7] = { xxxxxxxx, xxxxxxOP };  */
+         rtx vr_hi[8];
+         for (i = 0; i < 8; i++)
+           {
+             rtx tmp = vr_qi[i];
+             if (mode == V16QImode)
+               {
+                 tmp = gen_reg_rtx (V16QImode);
+                 emit_insn (merge_v16qi (tmp, vr_qi[2 * i], vr_qi[2 * i + 1]));
+               }
+             vr_hi[i] = gen_reg_rtx (V8HImode);
+             emit_move_insn (vr_hi[i], gen_lowpart (V8HImode, tmp));
+           }
+
+         /* Merge vector short to vector int.
+            eg: vr_si[0] = { xxxxxxxx, xxxxABCD };
+                vr_si[1] = { xxxxxxxx, xxxxEFGH };
+                ...
+                vr_si[3] = { xxxxxxxx, xxxxMNOP };  */
+         rtx vr_si[4];
+         for (i = 0; i < 4; i++)
+           {
+             rtx tmp = gen_reg_rtx (V8HImode);
+             emit_insn (merge_v8hi (tmp, vr_hi[2 * i], vr_hi[2 * i + 1]));
+             vr_si[i] = gen_reg_rtx (V4SImode);
+             emit_move_insn (vr_si[i], gen_lowpart (V4SImode, tmp));
+           }
+
+         /* Merge vector int to vector long.
+            eg: vr_di[0] = { xxxxxxxx, ABCDEFGH };
+                vr_di[1] = { xxxxxxxx, IJKLMNOP };  */
+         rtx vr_di[2];
+         for (i = 0; i < 2; i++)
+           {
+             rtx tmp = gen_reg_rtx (V4SImode);
+             emit_insn (merge_v4si (tmp, vr_si[2 * i], vr_si[2 * i + 1]));
+             vr_di[i] = gen_reg_rtx (V2DImode);
+             emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+           }
+
+         rtx res = gen_reg_rtx (V2DImode);
+         emit_insn (gen_vsx_xxpermdi_v2di (res, vr_di[0], vr_di[1], perm_idx));
+         emit_insn (gen_rtx_SET (target, gen_lowpart (mode, res)));
+       }
+
+      return;
+    }
+
   /* Construct the vector in memory one field at a time
      and load the whole vector.  */
   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 43b620ae1c0..4d6bd193a10 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8713,6 +8713,22 @@
   "mtvsrwz %x0,%1"
   [(set_attr "type" "mftgpr")])
 
+(define_insn "p8_mtvsrwz_v16qisi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
+                 UNSPEC_P8V_MTVSRWZ))]
+  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrwz %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
+(define_insn "p8_mtvsrd_v16qidi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
+                 UNSPEC_P8V_MTVSRD))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrd %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
 (define_insn_and_split "reload_fpr_from_gpr<mode>"
   [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
        (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-1.c 
b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
new file mode 100644
index 00000000000..27ff251e28c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short type values whether use 128bit
+   direct move instructions mtvsrdd on Power9 or later, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 24 } } */
+/* { dg-final { scan-assembler-times {\mvpkudum\M} 12 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-2.c 
b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
new file mode 100644
index 00000000000..cef8fbd4f35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short type values whether use direct
+   move instructions like mtvsrd/mtvsrwz on Power8, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 48 {target lp64 } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 48 {target {! lp64 } } } } 
*/
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-3.c 
b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
new file mode 100644
index 00000000000..3e5709ab0d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
@@ -0,0 +1,10 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short run successfully on Power8.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+#include "pr96933-run.h"
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-4.c 
b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c
new file mode 100644
index 00000000000..5a1c3d0c857
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c
@@ -0,0 +1,10 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p9vector_hw } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+/* Test vector constructions with char/short run successfully on Power9.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+#include "pr96933-run.h"
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-run.h 
b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h
new file mode 100644
index 00000000000..7fa8dac639c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h
@@ -0,0 +1,56 @@
+/* Test function for pr96933-{3,4}.c run result verification.  */
+
+int
+main ()
+{
+  unsigned char uc[16];
+  signed char sc[16];
+
+  for (int i = 0; i < 16; i++)
+    {
+      uc[i] = (unsigned char) (i * 2 + 1);
+      sc[i] = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned char ucv
+    = test_uchar (uc[0], uc[1], uc[2], uc[3], uc[4], uc[5], uc[6], uc[7], 
uc[8],
+                 uc[9], uc[10], uc[11], uc[12], uc[13], uc[14], uc[15]);
+  vector signed char scv
+    = test_schar (sc[0], sc[1], sc[2], sc[3], sc[4], sc[5], sc[6], sc[7], 
sc[8],
+                 sc[9], sc[10], sc[11], sc[12], sc[13], sc[14], sc[15]);
+
+  for (int i = 0; i < 16; i++)
+    {
+      unsigned char uexp = (unsigned char) (i * 2 + 1);
+      signed char sexp = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+      if (ucv[i] != uexp)
+       abort ();
+      if (scv[i] != sexp)
+       abort ();
+    }
+
+  unsigned short us[8];
+  signed short ss[8];
+  for (int i = 0; i < 8; i++)
+    {
+      us[i] = (unsigned short) (i * 2 + 1);
+      ss[i] = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned short usv
+    = test_ushort (us[0], us[1], us[2], us[3], us[4], us[5], us[6], us[7]);
+  vector signed short ssv
+    = test_sshort (ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7]);
+
+  for (int i = 0; i < 8; i++)
+    {
+      unsigned short uexp = (unsigned short) (i * 2 + 1);
+      signed short sexp = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+      if (usv[i] != uexp)
+       abort ();
+      if (ssv[i] != sexp)
+       abort ();
+    }
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933.h 
b/gcc/testsuite/gcc.target/powerpc/pr96933.h
new file mode 100644
index 00000000000..4bc2b941e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933.h
@@ -0,0 +1,50 @@
+/* Source file for pr96933-*.c testing, this mainly contains 4
+   functions as below:
+
+     - test_uchar  // vector unsigned char
+     - test_schar  // vector signed char
+     - test_ushort // vector unsigned short
+     - test_sshort // vector signed short
+*/
+
+__attribute__ ((noipa)) vector unsigned char
+test_uchar (unsigned char c1, unsigned char c2, unsigned char c3,
+           unsigned char c4, unsigned char c5, unsigned char c6,
+           unsigned char c7, unsigned char c8, unsigned char c9,
+           unsigned char c10, unsigned char c11, unsigned char c12,
+           unsigned char c13, unsigned char c14, unsigned char c15,
+           unsigned char c16)
+{
+  vector unsigned char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed char
+test_schar (signed char c1, signed char c2, signed char c3, signed char c4,
+           signed char c5, signed char c6, signed char c7, signed char c8,
+           signed char c9, signed char c10, signed char c11, signed char c12,
+           signed char c13, signed char c14, signed char c15, signed char c16)
+{
+  vector signed char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector unsigned short
+test_ushort (unsigned short c1, unsigned short c2, unsigned short c3,
+            unsigned short c4, unsigned short c5, unsigned short c6,
+            unsigned short c7, unsigned short c8)
+{
+  vector unsigned short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed short
+test_sshort (signed short c1, signed short c2, signed short c3,
+            signed short c4, signed short c5, signed short c6,
+            signed short c7, signed short c8)
+{
+  vector signed short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}

Reply via email to