Hi,

after discussing off-list, here is v2 of the patch.  We now recognize if
the permutation mask only refers to the first or the second operand and
use this later when emitting vpdi.

Regtested and bootstrapped, no regressions.

Is it OK?

Regards
 Robin

>From 1f11a6b89c9b0ad64b480229cd4db06eaaaa887a Mon Sep 17 00:00:00 2001
From: Robin Dapp <rd...@linux.ibm.com>
Date: Fri, 24 Jun 2022 15:17:08 +0200
Subject: [PATCH v2] s390: Recognize reverse/element swap permute patterns.

This adds functions to recognize reverse/element swap permute patterns
for vler, vster as well as vpdi and rotate.

gcc/ChangeLog:

        * config/s390/s390.cc (expand_perm_with_vpdi): Recognize swap pattern.
        (is_reverse_perm_mask): New function.
        (expand_perm_with_rot): Recognize reverse pattern.
        (expand_perm_with_vstbrq): New function.
        (expand_perm_with_vster): Use vler/vster for element reversal on z15.
        (vectorize_vec_perm_const_1): Use.
        (s390_vectorize_vec_perm_const): Add expand functions.
        * config/s390/vx-builtins.md: Prefer vster over vler.

gcc/testsuite/ChangeLog:

        * gcc.target/s390/vector/vperm-rev-z14.c: New test.
        * gcc.target/s390/vector/vperm-rev-z15.c: New test.
        * gcc.target/s390/zvector/vec-reve-store-byte.c: Adjust test
        expectation.
---
 gcc/config/s390/s390.cc                       | 119 +++++++++++++++++-
 gcc/config/s390/vx-builtins.md                |  21 ++++
 .../gcc.target/s390/vector/vperm-rev-z14.c    |  87 +++++++++++++
 .../gcc.target/s390/vector/vperm-rev-z15.c    | 118 +++++++++++++++++
 .../s390/zvector/vec-reve-store-byte.c        |   6 +-
 5 files changed, 345 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 528cd8c7f0f6..5e853e60f1a9 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17161,6 +17161,8 @@ struct expand_vec_perm_d
   machine_mode vmode;
   unsigned char nelt;
   bool testing_p;
+  bool only_op0;
+  bool only_op1;
 };

 /* Try to expand the vector permute operation described by D using the
@@ -17228,7 +17230,9 @@ expand_perm_with_vpdi (const struct
expand_vec_perm_d &d)
   if (d.perm[0] == 0 && d.perm[1] == 3)
     vpdi1_p = true;

-  if (d.perm[0] == 1 && d.perm[1] == 2)
+  if ((d.perm[0] == 1 && d.perm[1] == 2)
+      || (d.perm[0] == 1 && d.perm[1] == 0)
+      || (d.perm[0] == 3 && d.perm[1] == 2))
     vpdi4_p = true;

   if (!vpdi1_p && !vpdi4_p)
@@ -17240,15 +17244,107 @@ expand_perm_with_vpdi (const struct
expand_vec_perm_d &d)
   op0_reg = force_reg (GET_MODE (d.op0), d.op0);
   op1_reg = force_reg (GET_MODE (d.op1), d.op1);

+  /* If we only reference either of the operands in
+     the permute mask, just use one of them.  */
+  if (d.only_op0)
+    op1_reg = op0_reg;
+  else if (d.only_op1)
+    op0_reg = op1_reg;
+
   if (vpdi1_p)
     emit_insn (gen_vpdi1 (d.vmode, d.target, op0_reg, op1_reg));
-
   if (vpdi4_p)
     emit_insn (gen_vpdi4 (d.vmode, d.target, op0_reg, op1_reg));

   return true;
 }

+/* Helper that checks if a vector permutation mask D
+   represents a reversal of the vector's elements.  */
+static inline bool
+is_reverse_perm_mask (const struct expand_vec_perm_d &d)
+{
+  for (int i = 0; i < d.nelt; i++)
+    if (d.perm[i] != d.nelt - i - 1)
+      return false;
+  return true;
+}
+
+/* The case of reversing a four-element vector [0, 1, 2, 3]
+   can be handled by first permuting the doublewords
+   [2, 3, 0, 1] and subsequently rotating them by 32 bits.  */
+static bool
+expand_perm_with_rot (const struct expand_vec_perm_d &d)
+{
+  if (d.nelt != 4)
+    return false;
+
+  if (d.op0 == d.op1 && is_reverse_perm_mask (d))
+    {
+      if (d.testing_p)
+       return true;
+
+      rtx tmp = gen_reg_rtx (d.vmode);
+      rtx op0_reg = force_reg (GET_MODE (d.op0), d.op0);
+
+      emit_insn (gen_vpdi4_2 (d.vmode, tmp, op0_reg, op0_reg));
+      if (d.vmode == V4SImode)
+       emit_insn (gen_rotlv4si3_di (d.target, tmp));
+      else if (d.vmode == V4SFmode)
+       emit_insn (gen_rotlv4sf3_di (d.target, tmp));
+
+      return true;
+    }
+
+  return false;
+}
+
+/* If we just reverse the elements, emit an eltswap if we have
+   vler/vster.  */
+static bool
+expand_perm_with_vster (const struct expand_vec_perm_d &d)
+{
+  if (TARGET_VXE2 && d.op0 == d.op1 && is_reverse_perm_mask (d)
+      && (d.vmode == V2DImode || d.vmode == V2DFmode
+         || d.vmode == V4SImode || d.vmode == V4SFmode
+         || d.vmode == V8HImode))
+    {
+      if (d.testing_p)
+       return true;
+
+      if (d.vmode == V2DImode)
+       emit_insn (gen_eltswapv2di (d.target, d.op0));
+      else if (d.vmode == V2DFmode)
+       emit_insn (gen_eltswapv2df (d.target, d.op0));
+      else if (d.vmode == V4SImode)
+       emit_insn (gen_eltswapv4si (d.target, d.op0));
+      else if (d.vmode == V4SFmode)
+       emit_insn (gen_eltswapv4sf (d.target, d.op0));
+      else if (d.vmode == V8HImode)
+       emit_insn (gen_eltswapv8hi (d.target, d.op0));
+      return true;
+    }
+  return false;
+}
+
+/* If we reverse a byte-vector this is the same as
+   byte reversing it which can be done with vstbrq.  */
+static bool
+expand_perm_with_vstbrq (const struct expand_vec_perm_d &d)
+{
+  if (TARGET_VXE2 && d.op0 == d.op1 && is_reverse_perm_mask (d)
+      && d.vmode == V16QImode)
+    {
+      if (d.testing_p)
+       return true;
+
+      emit_insn (gen_eltswapv16qi (d.target, d.op0));
+      return true;
+    }
+  return false;
+}
+
+
 /* Try to find the best sequence for the vector permute operation
    described by D.  Return true if the operation could be
    expanded.  */
@@ -17258,9 +17354,18 @@ vectorize_vec_perm_const_1 (const struct
expand_vec_perm_d &d)
   if (expand_perm_with_merge (d))
     return true;

+  if (expand_perm_with_vster (d))
+    return true;
+
+  if (expand_perm_with_vstbrq (d))
+    return true;
+
   if (expand_perm_with_vpdi (d))
     return true;

+  if (expand_perm_with_rot (d))
+    return true;
+
   return false;
 }

@@ -17290,17 +17395,27 @@ s390_vectorize_vec_perm_const (machine_mode
vmode, machine_mode op_mode,
   gcc_assert (VECTOR_MODE_P (d.vmode));
   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   d.testing_p = target == NULL_RTX;
+  d.only_op0 = false;
+  d.only_op1 = false;

   gcc_assert (target == NULL_RTX || REG_P (target));
   gcc_assert (sel.length () == nelt);

+  unsigned int highest = 0, lowest = 2 * nelt - 1;
   for (i = 0; i < nelt; i++)
     {
       unsigned char e = sel[i];
+      lowest = MIN (lowest, e);
+      highest = MAX (highest, e);
       gcc_assert (e < 2 * nelt);
       d.perm[i] = e;
     }

+  if (lowest < nelt && highest < nelt)
+    d.only_op0 = true;
+  else if (lowest >= nelt && highest >= nelt)
+    d.only_op1 = true;
+
   return vectorize_vec_perm_const_1 (d);
 }

diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 99c4c037b49a..22d0355ec219 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -2184,6 +2184,27 @@ (define_insn "*eltswap<mode>"
    vster<bhfgq>\t%v1,%v0"
   [(set_attr "op_type" "*,VRX,VRX")])

+; The emulation pattern below will also accept
+;  vst (eltswap (vl))
+; i.e. both operands in memory, which reload needs to fix.
+; Split into
+;  vl
+;  vster (=vst (eltswap))
+; since we prefer vster over vler as long as the latter
+; does not support alignment hints.
+(define_split
+  [(set (match_operand:VEC_HW                 0 "memory_operand" "")
+       (unspec:VEC_HW [(match_operand:VEC_HW 1 "memory_operand" "")]
+                      UNSPEC_VEC_ELTSWAP))]
+  "TARGET_VXE2 && can_create_pseudo_p ()"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0)
+       (unspec:VEC_HW [(match_dup 2)] UNSPEC_VEC_ELTSWAP))]
+{
+  operands[2] = gen_reg_rtx (<MODE>mode);
+})
+
+
 ; Swapping v2df/v2di can be done via vpdi on z13 and z14.
 (define_split
   [(set (match_operand:V_HW_2                 0 "register_operand" "")
diff --git a/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
new file mode 100644
index 000000000000..5c64fac4646c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z14.c
@@ -0,0 +1,87 @@
+/* Make sure that the reverse permute patterns are optimized
+   correctly.  */
+/* { dg-do run { target { s390*-*-* } } } */
+/* { dg-options "-O2 -march=z14 -mzarch -fno-unroll-loops" } */
+
+/* { dg-final { scan-assembler-times "vpdi\t" 4 } } */
+/* { dg-final { scan-assembler-times "verllg\t" 2 } } */
+
+#include <assert.h>
+
+__attribute__((noipa))
+void reversel (long long *restrict a, long long *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 2)
+    {
+      a[i + 1] = b[i + 0];
+      a[i + 0] = b[i + 1];
+    }
+}
+
+__attribute__((noipa))
+void reversed (double *restrict a, double *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 2)
+    {
+      a[i + 1] = b[i + 0];
+      a[i + 0] = b[i + 1];
+    }
+}
+
+__attribute__((noipa))
+void reversei (unsigned int *restrict a, unsigned int *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      a[i + 3] = b[i + 0];
+      a[i + 2] = b[i + 1];
+      a[i + 1] = b[i + 2];
+      a[i + 0] = b[i + 3];
+    }
+}
+
+__attribute__((noipa))
+void reversef (float *restrict a, float *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      a[i + 3] = b[i + 0];
+      a[i + 2] = b[i + 1];
+      a[i + 1] = b[i + 2];
+      a[i + 0] = b[i + 3];
+    }
+}
+
+int main()
+{
+  const int n = 1024;
+  unsigned int u[n], u2[n];
+  long long l[n], l2[n];
+  double d[n], d2[n];
+  float f[n], f2[n];
+
+  for (int i = 0; i < n; i++)
+    {
+      u[i] = i;
+      l[i] = i;
+      d[i] = i;
+      f[i] = i;
+      u2[i] = i;
+      l2[i] = i;
+      d2[i] = i;
+      f2[i] = i;
+    }
+
+  reversei (u2, u, n);
+  reversel (l2, l, n);
+  reversed (d2, d, n);
+  reversef (f2, f, n);
+
+  for (int i = 0; i < n - 16; i++)
+    {
+      assert (u[i] == u2[i / (16 / sizeof (u[0])) * (16 / sizeof
(u[0])) + 16 / sizeof (u[0]) - 1 - i % (16 / sizeof (u[0]))]);
+      assert (l[i] == l2[i / (16 / sizeof (l[0])) * (16 / sizeof
(l[0])) + 16 / sizeof (l[0]) - 1 - i % (16 / sizeof (l[0]))]);
+      assert (d[i] == d2[i / (16 / sizeof (d[0])) * (16 / sizeof
(d[0])) + 16 / sizeof (d[0]) - 1 - i % (16 / sizeof (d[0]))]);
+      assert (f[i] == f2[i / (16 / sizeof (f[0])) * (16 / sizeof
(f[0])) + 16 / sizeof (f[0]) - 1 - i % (16 / sizeof (f[0]))]);
+    }
+}
diff --git a/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
new file mode 100644
index 000000000000..bff52406fa9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vperm-rev-z15.c
@@ -0,0 +1,118 @@
+/* Make sure that the reverse permute patterns are optimized
+   correctly.  */
+/* { dg-do run { target { s390*-*-* } } } */
+/* { dg-options "-O2 -march=z15 -mzarch -fno-unroll-loops" } */
+
+/* { dg-final { scan-assembler-times "vsterg\t" 2 } } */
+/* { dg-final { scan-assembler-times "vsterf" 2 } } */
+/* { dg-final { scan-assembler-times "vstbrq\t" 1 } } */
+/* { dg-final { scan-assembler-times "vperm" 0 } } */
+
+#include <assert.h>
+
+__attribute__((noipa))
+void reversec (char *restrict a, char *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 16)
+    {
+      a[i + 0] = b[i + 15];
+      a[i + 1] = b[i + 14];
+      a[i + 2] = b[i + 13];
+      a[i + 3] = b[i + 12];
+      a[i + 4] = b[i + 11];
+      a[i + 5] = b[i + 10];
+      a[i + 6] = b[i + 9];
+      a[i + 7] = b[i + 8];
+      a[i + 8] = b[i + 7];
+      a[i + 9] = b[i + 6];
+      a[i + 10] = b[i + 5];
+      a[i + 11] = b[i + 4];
+      a[i + 12] = b[i + 3];
+      a[i + 13] = b[i + 2];
+      a[i + 14] = b[i + 1];
+      a[i + 15] = b[i + 0];
+    }
+}
+
+__attribute__((noipa))
+void reversel (long long *restrict a, long long *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 2)
+    {
+      a[i + 1] = b[i + 0];
+      a[i + 0] = b[i + 1];
+    }
+}
+
+__attribute__((noipa))
+void reversed (double *restrict a, double *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 2)
+    {
+      a[i + 1] = b[i + 0];
+      a[i + 0] = b[i + 1];
+    }
+}
+
+__attribute__((noipa))
+void reversei (unsigned int *restrict a, unsigned int *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      a[i + 3] = b[i + 0];
+      a[i + 2] = b[i + 1];
+      a[i + 1] = b[i + 2];
+      a[i + 0] = b[i + 3];
+    }
+}
+
+__attribute__((noipa))
+void reversef (float *restrict a, float *restrict b, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      a[i + 3] = b[i + 0];
+      a[i + 2] = b[i + 1];
+      a[i + 1] = b[i + 2];
+      a[i + 0] = b[i + 3];
+    }
+}
+
+int main()
+{
+  const int n = 1024;
+  char c[n], c2[n];
+  unsigned int u[n], u2[n];
+  long long l[n], l2[n];
+  double d[n], d2[n];
+  float f[n], f2[n];
+
+  for (int i = 0; i < n; i++)
+    {
+      c[i] = i;
+      u[i] = i;
+      l[i] = i;
+      d[i] = i;
+      f[i] = i;
+      c2[i] = i;
+      u2[i] = i;
+      l2[i] = i;
+      d2[i] = i;
+      f2[i] = i;
+    }
+
+  reversec (c2, c, n);
+  reversei (u2, u, n);
+  reversel (l2, l, n);
+  reversed (d2, d, n);
+  reversef (f2, f, n);
+
+  for (int i = 0; i < n - 16; i++)
+    {
+      assert (c[i] == c2[i / (16 / sizeof (c[0])) * (16 / sizeof
(c[0])) + 16 / sizeof (c[0]) - 1 - i % (16 / sizeof (c[0]))]);
+      assert (u[i] == u2[i / (16 / sizeof (u[0])) * (16 / sizeof
(u[0])) + 16 / sizeof (u[0]) - 1 - i % (16 / sizeof (u[0]))]);
+      assert (l[i] == l2[i / (16 / sizeof (l[0])) * (16 / sizeof
(l[0])) + 16 / sizeof (l[0]) - 1 - i % (16 / sizeof (l[0]))]);
+      assert (d[i] == d2[i / (16 / sizeof (d[0])) * (16 / sizeof
(d[0])) + 16 / sizeof (d[0]) - 1 - i % (16 / sizeof (d[0]))]);
+      assert (f[i] == f2[i / (16 / sizeof (f[0])) * (16 / sizeof
(f[0])) + 16 / sizeof (f[0]) - 1 - i % (16 / sizeof (f[0]))]);
+    }
+}
diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
b/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
index db8284b1f8ff..6c061c69fea0 100644
--- a/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
+++ b/gcc/testsuite/gcc.target/s390/zvector/vec-reve-store-byte.c
@@ -16,13 +16,11 @@ bar (signed char *target, vector signed char x)
   vec_xst (vec_reve (x), 0, target);
 }

-/* { dg-final { scan-assembler-times "vstbrq\t" 2 } } */
-
-/* mem -> mem: This becomes vlbrq + vst */
+/* mem -> mem: This becomes vl + vstbrq */
 void
 baz (vector signed char *target, vector signed char *x)
 {
   *target = vec_reve (*x);
 }

-/* { dg-final { scan-assembler-times "vlbrq\t" 1 } } */
+/* { dg-final { scan-assembler-times "vstbrq\t" 3 } } */
-- 
2.31.1

Reply via email to