On 1/26/19, Jakub Jelinek <ja...@redhat.com> wrote: > Hi! > > The following 4 define_insn shuffle patterns don't have sufficient > conditions. As can be seen even from the way they transform the > RTL representation into the mask, e.g.: > mask = INTVAL (operands[3]) / 2; > mask |= INTVAL (operands[5]) / 2 << 2; > mask |= (INTVAL (operands[7]) - 8) / 2 << 4; > mask |= (INTVAL (operands[9]) - 8) / 2 << 6; > operands[3] = GEN_INT (mask); > or how the corresponding expander constructs the RTL representation from the > mask, > e.g.: > emit_insn (gen_avx512f_shuf_<shuffletype>64x2_1_mask > (operands[0], operands[1], operands[2], > GEN_INT (((mask >> 0) & 3) * 2), > GEN_INT (((mask >> 0) & 3) * 2 + 1), > GEN_INT (((mask >> 2) & 3) * 2), > GEN_INT (((mask >> 2) & 3) * 2 + 1), > GEN_INT (((mask >> 4) & 3) * 2 + 8), > GEN_INT (((mask >> 4) & 3) * 2 + 9), > GEN_INT (((mask >> 6) & 3) * 2 + 8), > GEN_INT (((mask >> 6) & 3) * 2 + 9), > they really require not just that there are 2 (or 4) consecutive numbers > from a certain range (in the predicate), but also that the first of these > numbers is a multiple of 2 (or 4) - the least significant 1 (or 2) bits > are ignored when creating the mask for the hw instruction. > Rather than including a huge set of new predicates like > const_0_or_2_operand, const_0_2_4_or_6_operand etc., this patch just > verifies the least significant 1 (or 2) bits are zero where needed, > plus some formatting fixes. > > Bootstrapped/regtested on x86_64-linux and i686-linux (on skylake-avx512), > verified both testcases FAIL without the patch, including for the second > one > every single subtest in there (all those are where at least one set of > pairs > or quadruples starts with a number that is not a multiple of 2 or 4). > Ok for trunk and release branches after a while? 
> > 2019-01-26 Jakub Jelinek <ja...@redhat.com> > > PR target/87214 > * config/i386/sse.md > (<mask_codefor>avx512dq_shuf_<shuffletype>64x2_1<mask_name>, > avx512f_shuf_<shuffletype>64x2_1<mask_name>): Ensure the > first constants in pairs are multiples of 2. Formatting fixes. > (avx512vl_shuf_<shuffletype>32x4_1<mask_name>, > avx512f_shuf_<shuffletype>32x4_1<mask_name>): Ensure the > first constants in each quadruple are multiples of 4. Formatting fixes. > > * gcc.target/i386/avx512vl-pr87214-1.c: New test. > * gcc.target/i386/avx512vl-pr87214-2.c: New test.
OK. Thanks, Uros. > --- gcc/config/i386/sse.md.jj 2019-01-25 23:46:02.156263173 +0100 > +++ gcc/config/i386/sse.md 2019-01-26 00:01:24.510168638 +0100 > @@ -13372,13 +13372,15 @@ (define_insn "<mask_codefor>avx512dq_shu > (vec_concat:<ssedoublemode> > (match_operand:VI8F_256 1 "register_operand" "v") > (match_operand:VI8F_256 2 "nonimmediate_operand" "vm")) > - (parallel [(match_operand 3 "const_0_to_3_operand") > - (match_operand 4 "const_0_to_3_operand") > - (match_operand 5 "const_4_to_7_operand") > - (match_operand 6 "const_4_to_7_operand")])))] > + (parallel [(match_operand 3 "const_0_to_3_operand") > + (match_operand 4 "const_0_to_3_operand") > + (match_operand 5 "const_4_to_7_operand") > + (match_operand 6 "const_4_to_7_operand")])))] > "TARGET_AVX512VL > - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) > - && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1))" > + && (INTVAL (operands[3]) & 1) == 0 > + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 > + && (INTVAL (operands[5]) & 1) == 0 > + && INTVAL (operands[5]) == INTVAL (operands[6]) - 1" > { > int mask; > mask = INTVAL (operands[3]) / 2; > @@ -13421,19 +13423,23 @@ (define_insn "avx512f_shuf_<shuffletype> > (vec_concat:<ssedoublemode> > (match_operand:V8FI 1 "register_operand" "v") > (match_operand:V8FI 2 "nonimmediate_operand" "vm")) > - (parallel [(match_operand 3 "const_0_to_7_operand") > - (match_operand 4 "const_0_to_7_operand") > - (match_operand 5 "const_0_to_7_operand") > - (match_operand 6 "const_0_to_7_operand") > - (match_operand 7 "const_8_to_15_operand") > - (match_operand 8 "const_8_to_15_operand") > - (match_operand 9 "const_8_to_15_operand") > - (match_operand 10 "const_8_to_15_operand")])))] > + (parallel [(match_operand 3 "const_0_to_7_operand") > + (match_operand 4 "const_0_to_7_operand") > + (match_operand 5 "const_0_to_7_operand") > + (match_operand 6 "const_0_to_7_operand") > + (match_operand 7 "const_8_to_15_operand") > + (match_operand 8 "const_8_to_15_operand") > 
+ (match_operand 9 "const_8_to_15_operand") > + (match_operand 10 "const_8_to_15_operand")])))] > "TARGET_AVX512F > - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) > - && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1) > - && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) > - && INTVAL (operands[9]) == (INTVAL (operands[10]) - 1))" > + && (INTVAL (operands[3]) & 1) == 0 > + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 > + && (INTVAL (operands[5]) & 1) == 0 > + && INTVAL (operands[5]) == INTVAL (operands[6]) - 1 > + && (INTVAL (operands[7]) & 1) == 0 > + && INTVAL (operands[7]) == INTVAL (operands[8]) - 1 > + && (INTVAL (operands[9]) & 1) == 0 > + && INTVAL (operands[9]) == INTVAL (operands[10]) - 1" > { > int mask; > mask = INTVAL (operands[3]) / 2; > @@ -13479,21 +13485,23 @@ (define_insn "avx512vl_shuf_<shuffletype > (vec_concat:<ssedoublemode> > (match_operand:VI4F_256 1 "register_operand" "v") > (match_operand:VI4F_256 2 "nonimmediate_operand" "vm")) > - (parallel [(match_operand 3 "const_0_to_7_operand") > - (match_operand 4 "const_0_to_7_operand") > - (match_operand 5 "const_0_to_7_operand") > - (match_operand 6 "const_0_to_7_operand") > - (match_operand 7 "const_8_to_15_operand") > - (match_operand 8 "const_8_to_15_operand") > - (match_operand 9 "const_8_to_15_operand") > + (parallel [(match_operand 3 "const_0_to_7_operand") > + (match_operand 4 "const_0_to_7_operand") > + (match_operand 5 "const_0_to_7_operand") > + (match_operand 6 "const_0_to_7_operand") > + (match_operand 7 "const_8_to_15_operand") > + (match_operand 8 "const_8_to_15_operand") > + (match_operand 9 "const_8_to_15_operand") > (match_operand 10 "const_8_to_15_operand")])))] > "TARGET_AVX512VL > - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) > - && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2) > - && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3) > - && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) > - && INTVAL (operands[7]) == (INTVAL 
(operands[9]) - 2) > - && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3))" > + && (INTVAL (operands[3]) & 3) == 0 > + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 > + && INTVAL (operands[3]) == INTVAL (operands[5]) - 2 > + && INTVAL (operands[3]) == INTVAL (operands[6]) - 3 > + && (INTVAL (operands[7]) & 3) == 0 > + && INTVAL (operands[7]) == INTVAL (operands[8]) - 1 > + && INTVAL (operands[7]) == INTVAL (operands[9]) - 2 > + && INTVAL (operands[7]) == INTVAL (operands[10]) - 3" > { > int mask; > mask = INTVAL (operands[3]) / 4; > @@ -13545,35 +13553,39 @@ (define_insn "avx512f_shuf_<shuffletype> > (vec_concat:<ssedoublemode> > (match_operand:V16FI 1 "register_operand" "v") > (match_operand:V16FI 2 "nonimmediate_operand" "vm")) > - (parallel [(match_operand 3 "const_0_to_15_operand") > - (match_operand 4 "const_0_to_15_operand") > - (match_operand 5 "const_0_to_15_operand") > - (match_operand 6 "const_0_to_15_operand") > - (match_operand 7 "const_0_to_15_operand") > - (match_operand 8 "const_0_to_15_operand") > - (match_operand 9 "const_0_to_15_operand") > - (match_operand 10 "const_0_to_15_operand") > - (match_operand 11 "const_16_to_31_operand") > - (match_operand 12 "const_16_to_31_operand") > - (match_operand 13 "const_16_to_31_operand") > - (match_operand 14 "const_16_to_31_operand") > - (match_operand 15 "const_16_to_31_operand") > - (match_operand 16 "const_16_to_31_operand") > - (match_operand 17 "const_16_to_31_operand") > - (match_operand 18 "const_16_to_31_operand")])))] > + (parallel [(match_operand 3 "const_0_to_15_operand") > + (match_operand 4 "const_0_to_15_operand") > + (match_operand 5 "const_0_to_15_operand") > + (match_operand 6 "const_0_to_15_operand") > + (match_operand 7 "const_0_to_15_operand") > + (match_operand 8 "const_0_to_15_operand") > + (match_operand 9 "const_0_to_15_operand") > + (match_operand 10 "const_0_to_15_operand") > + (match_operand 11 "const_16_to_31_operand") > + (match_operand 12 "const_16_to_31_operand") > + 
(match_operand 13 "const_16_to_31_operand") > + (match_operand 14 "const_16_to_31_operand") > + (match_operand 15 "const_16_to_31_operand") > + (match_operand 16 "const_16_to_31_operand") > + (match_operand 17 "const_16_to_31_operand") > + (match_operand 18 "const_16_to_31_operand")])))] > "TARGET_AVX512F > - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) > - && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2) > - && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3) > - && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) > - && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2) > - && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3) > - && INTVAL (operands[11]) == (INTVAL (operands[12]) - 1) > - && INTVAL (operands[11]) == (INTVAL (operands[13]) - 2) > - && INTVAL (operands[11]) == (INTVAL (operands[14]) - 3) > - && INTVAL (operands[15]) == (INTVAL (operands[16]) - 1) > - && INTVAL (operands[15]) == (INTVAL (operands[17]) - 2) > - && INTVAL (operands[15]) == (INTVAL (operands[18]) - 3))" > + && (INTVAL (operands[3]) & 3) == 0 > + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 > + && INTVAL (operands[3]) == INTVAL (operands[5]) - 2 > + && INTVAL (operands[3]) == INTVAL (operands[6]) - 3 > + && (INTVAL (operands[7]) & 3) == 0 > + && INTVAL (operands[7]) == INTVAL (operands[8]) - 1 > + && INTVAL (operands[7]) == INTVAL (operands[9]) - 2 > + && INTVAL (operands[7]) == INTVAL (operands[10]) - 3 > + && (INTVAL (operands[11]) & 3) == 0 > + && INTVAL (operands[11]) == INTVAL (operands[12]) - 1 > + && INTVAL (operands[11]) == INTVAL (operands[13]) - 2 > + && INTVAL (operands[11]) == INTVAL (operands[14]) - 3 > + && (INTVAL (operands[15]) & 3) == 0 > + && INTVAL (operands[15]) == INTVAL (operands[16]) - 1 > + && INTVAL (operands[15]) == INTVAL (operands[17]) - 2 > + && INTVAL (operands[15]) == INTVAL (operands[18]) - 3" > { > int mask; > mask = INTVAL (operands[3]) / 4; > --- gcc/testsuite/gcc.target/i386/avx512vl-pr87214-1.c.jj 2019-01-26 > 
00:01:24.511168621 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr87214-1.c 2019-01-26 > 00:13:39.730135406 +0100 > @@ -0,0 +1,44 @@ > +/* PR target/87214 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { dg-options "-O3 -mavx512vl -mtune=skylake-avx512" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > +#include "avx512f-check.h" > + > +struct s { unsigned long a, b, c; }; > + > +void __attribute__ ((noipa)) > +foo (struct s *restrict s1, struct s *restrict s2, int n) > +{ > + for (int i = 0; i < n; ++i) > + { > + s1[i].b = s2[i].b; > + s1[i].c = s2[i].c; > + s2[i].c = 0; > + } > +} > + > +#define N 12 > + > +static void > +test_256 (void) > +{ > + struct s s1[N], s2[N]; > + for (unsigned int j = 0; j < N; ++j) > + { > + s2[j].a = j * 5; > + s2[j].b = j * 5 + 2; > + s2[j].c = j * 5 + 4; > + } > + foo (s1, s2, N); > + for (unsigned int j = 0; j < N; ++j) > + if (s1[j].b != j * 5 + 2) > + __builtin_abort (); > +} > + > +static void > +test_128 (void) > +{ > +} > --- gcc/testsuite/gcc.target/i386/avx512vl-pr87214-2.c.jj 2019-01-26 > 00:01:24.511168621 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr87214-2.c 2019-01-26 > 00:17:19.941530293 +0100 > @@ -0,0 +1,128 @@ > +/* PR target/87214 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { dg-options "-O2 -mavx512vl" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > +#include "avx512f-check.h" > + > +typedef long long int v4di __attribute__((vector_size (4 * sizeof (long > long int)))); > +typedef double v4df __attribute__((vector_size (4 * sizeof (double)))); > +typedef long long int v8di __attribute__((vector_size (8 * sizeof (long > long int)))); > +typedef double v8df __attribute__((vector_size (8 * sizeof (double)))); > +typedef int v8si __attribute__((vector_size (8 * sizeof (int)))); > +typedef float v8sf __attribute__((vector_size (8 * sizeof (float)))); > +typedef int v16si __attribute__((vector_size (16 * 
sizeof (int)))); > +typedef float v16sf __attribute__((vector_size (16 * sizeof (float)))); > + > +__attribute__((noipa)) void > +f1 (v4di *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v4di) { 2, 3, 5, 6 }); > +} > + > +__attribute__((noipa)) void > +f2 (v4df *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v4di) { 1, 2, 6, 7 }); > +} > + > +__attribute__((noipa)) void > +f3 (v8di *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v8di) { 2, 3, 5, 6, 8, 9, 11, 12 > }); > +} > + > +__attribute__((noipa)) void > +f4 (v8df *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v8di) { 1, 2, 6, 7, 9, 10, 12, 13 > }); > +} > + > +__attribute__((noipa)) void > +f5 (v8si *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v8si) { 2, 3, 4, 5, 9, 10, 11, 12 > }); > +} > + > +__attribute__((noipa)) void > +f6 (v8sf *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v8si) { 1, 2, 3, 4, 12, 13, 14, 15 > }); > +} > + > +__attribute__((noipa)) void > +f7 (v16si *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v16si) { 0, 1, 2, 3, 1, 2, 3, 4, > 16, 17, 18, 19, 25, 26, 27, 28 }); > +} > + > +__attribute__((noipa)) void > +f8 (v16sf *p) > +{ > + p[0] = __builtin_shuffle (p[1], p[2], (v16si) { 1, 2, 3, 4, 4, 5, 6, 7, > 17, 18, 19, 20, 18, 19, 20, 21 }); > +} > + > +static void > +test_256 (void) > +{ > + v4di a[3] = { { 0, 0, 0, 0 }, { 10, 11, 12, 13 }, { 14, 15, 16, 17 } }; > + f1 (a); > + if (a[0][0] != 12 || a[0][1] != 13 || a[0][2] != 15 || a[0][3] != 16) > + __builtin_abort (); > + v4df b[3] = { { 0.0, 0.0, 0.0, 0.0 }, { 10.0, 11.0, 12.0, 13.0 }, { 14.0, > 15.0, 16.0, 17.0 } }; > + f2 (b); > + if (b[0][0] != 11.0 || b[0][1] != 12.0 || b[0][2] != 16.0 || b[0][3] != > 17.0) > + __builtin_abort (); > + v8di c[3] = { { 0, 0, 0, 0, 0, 0, 0, 0 }, { 10, 11, 12, 13, 14, 15, 16, > 17 }, { 18, 19, 20, 21, 22, 23, 24, 25 } }; > + f3 (c); > + if (c[0][0] != 12 || c[0][1] != 13 || c[0][2] != 15 || c[0][3] != 16 > + || c[0][4] != 18 || c[0][5] != 19 || c[0][6] != 21 || c[0][7] != 
22) > + __builtin_abort (); > + v8df d[3] = { { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }, > + { 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0 }, > + { 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0 } }; > + f4 (d); > + if (d[0][0] != 11.0 || d[0][1] != 12.0 || d[0][2] != 16.0 || d[0][3] != > 17.0 > + || d[0][4] != 19.0 || d[0][5] != 20.0 || d[0][6] != 22.0 || d[0][7] > != 23.0) > + __builtin_abort (); > + v8si e[3] = { { 0, 0, 0, 0, 0, 0, 0, 0 }, { 10, 11, 12, 13, 14, 15, 16, > 17 }, { 18, 19, 20, 21, 22, 23, 24, 25 } }; > + f5 (e); > + if (e[0][0] != 12 || e[0][1] != 13 || e[0][2] != 14 || e[0][3] != 15 > + || e[0][4] != 19 || e[0][5] != 20 || e[0][6] != 21 || e[0][7] != 22) > + __builtin_abort (); > + v8sf f[3] = { { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, > + { 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f }, > + { 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f } }; > + f6 (f); > + if (f[0][0] != 11.0f || f[0][1] != 12.0f || f[0][2] != 13.0f || f[0][3] > != 14.0f > + || f[0][4] != 22.0f || f[0][5] != 23.0f || f[0][6] != 24.0f || > f[0][7] != 25.0f) > + __builtin_abort (); > + v16si g[3] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, > + { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, > 25 }, > + { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, > 41 } }; > + f7 (g); > + if (g[0][0] != 10 || g[0][1] != 11 || g[0][2] != 12 || g[0][3] != 13 > + || g[0][4] != 11 || g[0][5] != 12 || g[0][6] != 13 || g[0][7] != 14 > + || g[0][8] != 26 || g[0][9] != 27 || g[0][10] != 28 || g[0][11] != > 29 > + || g[0][12] != 35 || g[0][13] != 36 || g[0][14] != 37 || g[0][15] != > 38) > + __builtin_abort (); > + v16sf h[3] = { { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, > + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, > + { 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, > + 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f }, > + { 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, > + 34.0f, 
35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f } }; > + f8 (h); > + if (h[0][0] != 11.0f || h[0][1] != 12.0f || h[0][2] != 13.0f || h[0][3] > != 14.0f > + || h[0][4] != 14.0f || h[0][5] != 15.0f || h[0][6] != 16.0f || > h[0][7] != 17.0f > + || h[0][8] != 27.0f || h[0][9] != 28.0f || h[0][10] != 29.0f || > h[0][11] != 30.0f > + || h[0][12] != 28.0f || h[0][13] != 29.0f || h[0][14] != 30.0f || > h[0][15] != 31.0f) > + __builtin_abort (); > +} > + > +static void > +test_128 (void) > +{ > +} > > Jakub >