This patch implements the TARGET_VECTORIZE_VEC_PERM_CONST target hook in the IBM Z backend. The initial implementation only exploits the vector merge instructions, but there is more to come.
gcc/ChangeLog: * config/s390/s390.c (MAX_VECT_LEN): Define macro. (struct expand_vec_perm_d): Define struct. (expand_perm_with_merge): New function. (vectorize_vec_perm_const_1): New function. (s390_vectorize_vec_perm_const): New function. (TARGET_VECTORIZE_VEC_PERM_CONST): Define target macro. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/perm-merge.c: New test. * gcc.target/s390/vector/vec-types.h: New test. --- gcc/config/s390/s390.c | 108 ++++++++++++++++++ .../gcc.target/s390/vector/perm-merge.c | 104 +++++++++++++++++ .../gcc.target/s390/vector/vec-types.h | 35 ++++++ 3 files changed, 247 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/vector/perm-merge.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-types.h diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index b1a9ca9d8aa..684241b00b8 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16928,6 +16928,110 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, return after_md_seq; } +#define MAX_VECT_LEN 16 +/* Describes one constant vector permutation. TARGET is the destination or NULL_RTX if the permutation is only tested, OP0 and OP1 are the input operands, PERM holds the NELT selector indices (at most MAX_VECT_LEN, each below 2 * NELT), VMODE is the vector mode and TESTING_P is set if no TARGET was given. */ +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + machine_mode vmode; + unsigned char nelt; + bool testing_p; +}; + +/* Try to expand the vector permute operation described by D using the + vector merge instructions vmrh and vmrl. Return true if vector merge + could be used. 
*/ +static bool +expand_perm_with_merge (const struct expand_vec_perm_d &d) +{ + bool merge_lo_p = true; + bool merge_hi_p = true; + + if (d.nelt % 2) + return false; + + // For V4SI this checks for: { 0, 4, 1, 5 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != telt / 2 + (telt % 2) * d.nelt) + { + merge_hi_p = false; + break; + } + + if (!merge_hi_p) + { + // For V4SI this checks for: { 2, 6, 3, 7 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != (telt + d.nelt) / 2 + (telt % 2) * d.nelt) + { + merge_lo_p = false; + break; + } + } + else + merge_lo_p = false; + + if (d.testing_p) + return merge_lo_p || merge_hi_p; + + if (merge_lo_p || merge_hi_p) + s390_expand_merge (d.target, d.op0, d.op1, merge_hi_p); + + return merge_lo_p || merge_hi_p; +} + +/* Try to find the best sequence for the vector permute operation + described by D. Return true if the operation could be + expanded. */ +static bool +vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) +{ + if (expand_perm_with_merge (d)) + return true; + + return false; +} + +/* Return true if we can emit instructions for the constant + permutation vector in SEL. If OUTPUT, IN0, IN1 are non-null the + hook is supposed to emit the required INSNs. 
*/ + +bool +s390_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) +{ + struct expand_vec_perm_d d; + unsigned char perm[MAX_VECT_LEN]; + unsigned int i, nelt; + + if (!s390_vector_mode_supported_p (vmode) || GET_MODE_SIZE (vmode) != 16) + return false; + + d.target = target; + d.op0 = op0; + d.op1 = op1; + + d.vmode = vmode; + gcc_assert (VECTOR_MODE_P (d.vmode)); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = target == NULL_RTX; + + gcc_assert (target == NULL_RTX || REG_P (target)); + gcc_assert (sel.length () == nelt); + gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); + + for (i = 0; i < nelt; i++) + { + unsigned char e = sel[i]; + gcc_assert (e < 2 * nelt); + d.perm[i] = e; + perm[i] = e; + } + + return vectorize_vec_perm_const_1 (d); +} + /* Initialize GCC target structure. */ #undef TARGET_ASM_ALIGNED_HI_OP @@ -17238,6 +17342,10 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, #undef TARGET_MD_ASM_ADJUST #define TARGET_MD_ASM_ADJUST s390_md_asm_adjust +#undef TARGET_VECTORIZE_VEC_PERM_CONST +#define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const + + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-s390.h" diff --git a/gcc/testsuite/gcc.target/s390/vector/perm-merge.c b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c new file mode 100644 index 00000000000..51b23ddd886 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c @@ -0,0 +1,104 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z14_hw } } } */ + +/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */ 
+/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ +/* The expected counts match the functions generated below: per merge direction two for the vector types with 16 and with 8 elements, and three for those with 4 and with 2 elements (signed, unsigned and a floating point type). */ +#include "vec-types.h" +/* GEN_MERGE_<n> defines the function merge_<HILO>_<VEC_TYPE>, which returns an <n>-element vector interleaving <n>/2 consecutive elements of a and b, beginning with element A of each. */ +#define GEN_MERGE_2(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A] }; } + +#define GEN_MERGE_4(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A] }; } + +#define GEN_MERGE_8(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A] }; } + +#define GEN_MERGE_16(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A], \ + a[4+A], b[4+A], a[5+A], b[5+A], a[6+A], b[6+A], a[7+A], b[7+A]}; } + +/* One low (l) and one high (h) merge function per vector type. The high variants start at element 0, the low variants in the middle of the input vectors. */ +GEN_MERGE_16(v16qi, l, 8) +GEN_MERGE_16(v16qi, h, 0) +GEN_MERGE_16(uv16qi, l, 8) +GEN_MERGE_16(uv16qi, h, 0) + +GEN_MERGE_8(v8hi, l, 4) +GEN_MERGE_8(v8hi, h, 0) +GEN_MERGE_8(uv8hi, l, 4) +GEN_MERGE_8(uv8hi, h, 0) + +GEN_MERGE_4(v4si, l, 2) +GEN_MERGE_4(v4si, h, 0) +GEN_MERGE_4(uv4si, l, 2) +GEN_MERGE_4(uv4si, h, 0) + +GEN_MERGE_4(v4sf, l, 2) +GEN_MERGE_4(v4sf, h, 0) + +GEN_MERGE_2(v2di, l, 1) +GEN_MERGE_2(v2di, h, 0) +GEN_MERGE_2(uv2di, l, 1) +GEN_MERGE_2(uv2di, h, 0) + +GEN_MERGE_2(v2df, l, 1) +GEN_MERGE_2(v2df, h, 0) + +/* CHECK_MERGE_LO and CHECK_MERGE_HI call the low resp. high merge function for VEC_TYPE and abort unless every result element equals the index expected for that merge pattern. This relies on the input vectors set up by CHECK_MERGE. */ +#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != (i + elts) / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int 
i = 0; i < elts; i++) \ + if (v[i] != i / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } +/* Check both merge directions for VEC_TYPE with a holding the values 0 .. n-1 and b holding n .. 2n-1, n being the number of elements. */ +#define CHECK_MERGE(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \ + CHECK_MERGE_LO (VEC_TYPE, a, b); \ + CHECK_MERGE_HI (VEC_TYPE, a, b); \ + } +/* Run the checks for every vector type that has merge functions above. */ +int +main () +{ + CHECK_MERGE(v16qi); + CHECK_MERGE(uv16qi); + CHECK_MERGE(v8hi); + CHECK_MERGE(uv8hi); + CHECK_MERGE(v4si); + CHECK_MERGE(uv4si); + CHECK_MERGE(v4sf); + CHECK_MERGE(v2di); + CHECK_MERGE(uv2di); + CHECK_MERGE(v2df); +} diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-types.h b/gcc/testsuite/gcc.target/s390/vector/vec-types.h new file mode 100644 index 00000000000..b7ffbe73321 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-types.h @@ -0,0 +1,35 @@ +#ifndef VEC_TYPES_H +#define VEC_TYPES_H 1 +/* 16 byte GNU vector types; the names with a u prefix have unsigned element types. */ +typedef __attribute__((vector_size(16))) signed char v16qi; +typedef __attribute__((vector_size(16))) unsigned char uv16qi; + +typedef __attribute__((vector_size(16))) signed short v8hi; +typedef __attribute__((vector_size(16))) unsigned short uv8hi; + +typedef __attribute__((vector_size(16))) signed int v4si; +typedef __attribute__((vector_size(16))) unsigned int uv4si; + +typedef __attribute__((vector_size(16))) signed long long v2di; +typedef __attribute__((vector_size(16))) unsigned long long uv2di; + +#if __SIZEOF_INT128__ == 16 +typedef __attribute__((vector_size(16))) __int128_t v1ti; +#endif + +typedef __attribute__((vector_size(16))) double v2df; +typedef __attribute__((vector_size(16))) long double v1tf; + +#if __ARCH__ >= 12 +typedef __attribute__((vector_size(16))) float v4sf; +#endif +/* Statement expression yielding a VEC_TYPE vector whose element i has the value i + ADDEND converted to the element type. */ +#define GEN_SEQ_VEC(VEC_TYPE, ADDEND) \ + ({ VEC_TYPE dummy; \ + const int elts = sizeof(VEC_TYPE) / sizeof(dummy[0]); \ + typeof(dummy[0]) __attribute__((aligned(8))) ar[elts]; \ + for (int i = 0; i < elts; i++) \ + ar[i] = (typeof(dummy[0]))(i + (ADDEND)); \ + *(VEC_TYPE*)ar;}) + +#endif -- 2.31.1