This patch makes use of the vector permute doubleword immediate
(vpdi) instruction for constant permute vectors.

gcc/ChangeLog:

        * config/s390/s390.c (expand_perm_with_vpdi): New function.
        (vectorize_vec_perm_const_1): Call expand_perm_with_vpdi.
        * config/s390/vector.md (*vpdi1<mode>, @vpdi1<mode>): Enable a
        parameterized expander.
        (*vpdi4<mode>, @vpdi4<mode>): Likewise.

gcc/testsuite/ChangeLog:

        * gcc.target/s390/vector/perm-vpdi.c: New test.
---
 gcc/config/s390/s390.c                        | 47 ++++++++++++++++++
 gcc/config/s390/vector.md                     |  5 +-
 .../gcc.target/s390/vector/perm-vpdi.c        | 49 +++++++++++++++++++
 3 files changed, 98 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 684241b00b8..20c52c83c72 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -16981,6 +16981,50 @@ expand_perm_with_merge (const struct expand_vec_perm_d &d)
   return merge_lo_p || merge_hi_p;
 }
 
+/* Try to expand the constant vector permute described by D with a
+   single vector permute doubleword immediate (vpdi) instruction.
+   Return true if D matches; insns are emitted unless D.testing_p.
+
+   VPDI allows 4 different immediate values (0, 1, 4, 5). The 0 and 5
+   cases are covered by vmrhg and vmrlg already.  So we only care
+   about the 1, 4 cases here.
+   1 - perm {0, 3}: first element of op0 and second element of op1
+   4 - perm {1, 2}: second element of op0 and first element of op1  */
+static bool
+expand_perm_with_vpdi (const struct expand_vec_perm_d &d)
+{
+  bool vpdi1_p = false;
+  bool vpdi4_p = false;
+  rtx op0_reg, op1_reg;
+
+  /* Only two-element vectors (V2DI and V2DF) are supported here.  */
+  if (d.nelt != 2)
+    return false;
+
+  if (d.perm[0] == 0 && d.perm[1] == 3)
+    vpdi1_p = true;
+
+  if (d.perm[0] == 1 && d.perm[1] == 2)
+    vpdi4_p = true;
+
+  if (!vpdi1_p && !vpdi4_p)
+    return false;
+
+  if (d.testing_p)
+    return true;
+
+  op0_reg = force_reg (GET_MODE (d.op0), d.op0);
+  op1_reg = force_reg (GET_MODE (d.op1), d.op1);
+
+  if (vpdi1_p)
+    emit_insn (gen_vpdi1 (d.vmode, d.target, op0_reg, op1_reg));
+
+  if (vpdi4_p)
+    emit_insn (gen_vpdi4 (d.vmode, d.target, op0_reg, op1_reg));
+
+  return true;
+}
+
 /* Try to find the best sequence for the vector permute operation
    described by D.  Return true if the operation could be
    expanded.  */
@@ -16990,6 +17034,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d)
   if (expand_perm_with_merge (d))
     return true;
 
+  if (expand_perm_with_vpdi (d))
+    return true;
+
   return false;
 }
 
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index b372bf171f7..1b0ae47ab49 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -768,7 +768,7 @@ (define_insn "*vec_perm<mode>"
 
 
 ; First DW of op1 and second DW of op2
-(define_insn "*vpdi1<mode>"
+(define_insn "@vpdi1<mode>"
   [(set (match_operand:V_HW_2   0 "register_operand" "=v")
        (vec_select:V_HW_2
         (vec_concat:<vec_2x_nelts>
@@ -780,7 +780,7 @@ (define_insn "*vpdi1<mode>"
   [(set_attr "op_type" "VRR")])
 
 ; Second DW of op1 and first of op2
-(define_insn "*vpdi4<mode>"
+(define_insn "@vpdi4<mode>"
   [(set (match_operand:V_HW_2   0 "register_operand" "=v")
        (vec_select:V_HW_2
         (vec_concat:<vec_2x_nelts>
@@ -926,7 +926,6 @@ (define_insn_and_split "tf_to_fprx2"
   operands[5] = simplify_gen_subreg (DFmode, operands[1], TFmode, 8);
 })
 
-; vec_perm_const for V2DI using vpdi?
 
 ;;
 ;; Vector integer arithmetic instructions
diff --git a/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c b/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c
new file mode 100644
index 00000000000..cc925315b37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target { s390*-*-* } } } */
+/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */
+
+/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvpdi\t" 6 } } */
+
+#include "vec-types.h"
+#include <vecintrin.h>
+/* permi_BITS_TYPE (a, b) yields { a[(BITS & 2) >> 1], b[BITS & 1] }.  */
+#define GEN_PERMI_BITS(VEC_TYPE, BITS)                         \
+  VEC_TYPE __attribute__((noinline))                           \
+  permi_##BITS##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) {          \
+    return (VEC_TYPE){a[((BITS) & 2) >> 1], b[(BITS) & 1] }; }
+/* Instantiate the permute function for BITS = 0 .. 3.  */
+#define GEN_PERMI(VEC_TYPE)                    \
+  GEN_PERMI_BITS(VEC_TYPE, 0);                 \
+  GEN_PERMI_BITS(VEC_TYPE, 1);                 \
+  GEN_PERMI_BITS(VEC_TYPE, 2);                 \
+  GEN_PERMI_BITS(VEC_TYPE, 3);                 \
+
+GEN_PERMI(v2di)
+GEN_PERMI(uv2di)
+GEN_PERMI(v2df)
+
+/* Abort unless the result is { (BITS & 2) >> 1, (BITS & 1) + 2 }.  */
+#define CHECK_PERMI_BITS(VEC_TYPE, BITS)               \
+  VEC_TYPE r##BITS = permi_##BITS##_##VEC_TYPE (a, b); \
+  if (r##BITS[0] != ((BITS) & 2) >> 1                  \
+      || r##BITS[1] != ((BITS) & 1) + 2)               \
+    __builtin_abort();
+/* GEN_SEQ_VEC is defined elsewhere; presumably a = {0, 1}, b = {2, 3}.  */
+#define CHECK_PERMI(VEC_TYPE)                  \
+  {                                            \
+    VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0);    \
+    VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, 2);    \
+    CHECK_PERMI_BITS (VEC_TYPE, 0);            \
+    CHECK_PERMI_BITS (VEC_TYPE, 1);            \
+    CHECK_PERMI_BITS (VEC_TYPE, 2);            \
+    CHECK_PERMI_BITS (VEC_TYPE, 3);            \
+  }
+/* Run the checks for every vector type instantiated above.  */
+int
+main ()
+{
+  CHECK_PERMI (v2di);
+  CHECK_PERMI (uv2di);
+  CHECK_PERMI (v2df);
+}
-- 
2.31.1

Reply via email to