Re: [PATCH] Fix avx512{f,vl} shuffles (PR target/87214)

Uros Bizjak Sun, 27 Jan 2019 02:39:05 -0800

On 1/26/19, Jakub Jelinek <ja...@redhat.com> wrote:
> Hi!
>
> The following 4 define_insn shuffle patterns don't have sufficient
> conditions.  As can be seen even from the way they transform the
> RTL representation into the mask, e.g.:
>   mask = INTVAL (operands[3]) / 2;
>   mask |= INTVAL (operands[5]) / 2 << 2;
>   mask |= (INTVAL (operands[7]) - 8) / 2 << 4;
>   mask |= (INTVAL (operands[9]) - 8) / 2 << 6;
>   operands[3] = GEN_INT (mask);
> or how the corresponding expander constructs the RTL representation from
> the mask, e.g.:
>   emit_insn (gen_avx512f_shuf_<shuffletype>64x2_1_mask
>       (operands[0], operands[1], operands[2],
>        GEN_INT (((mask >> 0) & 3) * 2),
>        GEN_INT (((mask >> 0) & 3) * 2 + 1),
>        GEN_INT (((mask >> 2) & 3) * 2),
>        GEN_INT (((mask >> 2) & 3) * 2 + 1),
>        GEN_INT (((mask >> 4) & 3) * 2 + 8),
>        GEN_INT (((mask >> 4) & 3) * 2 + 9),
>        GEN_INT (((mask >> 6) & 3) * 2 + 8),
>        GEN_INT (((mask >> 6) & 3) * 2 + 9),
> they really require not just that there are 2 (or 4) consecutive numbers
> from certain range (in the predicate), but also that the first of these
> numbers is a multiple of 2 (or 4) - the least significant 1 (or 2) bits
> are ignored when creating the mask for the hw instruction.
> Rather than including a huge set of new predicates like
> const_0_or_2_operand, const_0_2_4_or_6_operand etc., this patch just
> verifies the least significant 1 (or 2) bits are zero where needed,
> plus some formatting fixes.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux (on skylake-avx512),
> verified both testcases FAIL without the patch, including, for the second
> one, every single subtest in there (all of those are cases where at least
> one set of pairs or quadruples starts with a number that is not a multiple
> of 2 or 4).
> Ok for trunk and release branches after a while?
>
> 2019-01-26  Jakub Jelinek  <ja...@redhat.com>
>
>       PR target/87214
>       * config/i386/sse.md
>       (<mask_codefor>avx512dq_shuf_<shuffletype>64x2_1<mask_name>,
>       avx512f_shuf_<shuffletype>64x2_1<mask_name>): Ensure the
>       first constants in pairs are multiples of 2.  Formatting fixes.
>       (avx512vl_shuf_<shuffletype>32x4_1<mask_name>,
>       avx512f_shuf_<shuffletype>32x4_1<mask_name>): Ensure the
>       first constants in each quadruple are multiples of 4.  Formatting fixes.
>
>       * gcc.target/i386/avx512vl-pr87214-1.c: New test.
>       * gcc.target/i386/avx512vl-pr87214-2.c: New test.


OK.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj 2019-01-25 23:46:02.156263173 +0100
> +++ gcc/config/i386/sse.md    2019-01-26 00:01:24.510168638 +0100
> @@ -13372,13 +13372,15 @@ (define_insn "<mask_codefor>avx512dq_shu
>         (vec_concat:<ssedoublemode>
>           (match_operand:VI8F_256 1 "register_operand" "v")
>           (match_operand:VI8F_256 2 "nonimmediate_operand" "vm"))
> -       (parallel [(match_operand 3  "const_0_to_3_operand")
> -                  (match_operand 4  "const_0_to_3_operand")
> -                  (match_operand 5  "const_4_to_7_operand")
> -                  (match_operand 6  "const_4_to_7_operand")])))]
> +       (parallel [(match_operand 3 "const_0_to_3_operand")
> +                  (match_operand 4 "const_0_to_3_operand")
> +                  (match_operand 5 "const_4_to_7_operand")
> +                  (match_operand 6 "const_4_to_7_operand")])))]
>    "TARGET_AVX512VL
> -   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
> -       && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1))"
> +   && (INTVAL (operands[3]) & 1) == 0
> +   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
> +   && (INTVAL (operands[5]) & 1) == 0
> +   && INTVAL (operands[5]) == INTVAL (operands[6]) - 1"
>  {
>    int mask;
>    mask = INTVAL (operands[3]) / 2;
> @@ -13421,19 +13423,23 @@ (define_insn "avx512f_shuf_<shuffletype>
>         (vec_concat:<ssedoublemode>
>           (match_operand:V8FI 1 "register_operand" "v")
>           (match_operand:V8FI 2 "nonimmediate_operand" "vm"))
> -       (parallel [(match_operand 3  "const_0_to_7_operand")
> -                  (match_operand 4  "const_0_to_7_operand")
> -                  (match_operand 5  "const_0_to_7_operand")
> -                  (match_operand 6  "const_0_to_7_operand")
> -                  (match_operand 7  "const_8_to_15_operand")
> -                  (match_operand 8  "const_8_to_15_operand")
> -                  (match_operand 9  "const_8_to_15_operand")
> -                  (match_operand 10  "const_8_to_15_operand")])))]
> +       (parallel [(match_operand 3 "const_0_to_7_operand")
> +                  (match_operand 4 "const_0_to_7_operand")
> +                  (match_operand 5 "const_0_to_7_operand")
> +                  (match_operand 6 "const_0_to_7_operand")
> +                  (match_operand 7 "const_8_to_15_operand")
> +                  (match_operand 8 "const_8_to_15_operand")
> +                  (match_operand 9 "const_8_to_15_operand")
> +                  (match_operand 10 "const_8_to_15_operand")])))]
>    "TARGET_AVX512F
> -   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
> -       && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1)
> -       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
> -       && INTVAL (operands[9]) == (INTVAL (operands[10]) - 1))"
> +   && (INTVAL (operands[3]) & 1) == 0
> +   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
> +   && (INTVAL (operands[5]) & 1) == 0
> +   && INTVAL (operands[5]) == INTVAL (operands[6]) - 1
> +   && (INTVAL (operands[7]) & 1) == 0
> +   && INTVAL (operands[7]) == INTVAL (operands[8]) - 1
> +   && (INTVAL (operands[9]) & 1) == 0
> +   && INTVAL (operands[9]) == INTVAL (operands[10]) - 1"
>  {
>    int mask;
>    mask = INTVAL (operands[3]) / 2;
> @@ -13479,21 +13485,23 @@ (define_insn "avx512vl_shuf_<shuffletype
>         (vec_concat:<ssedoublemode>
>           (match_operand:VI4F_256 1 "register_operand" "v")
>           (match_operand:VI4F_256 2 "nonimmediate_operand" "vm"))
> -       (parallel [(match_operand 3  "const_0_to_7_operand")
> -                  (match_operand 4  "const_0_to_7_operand")
> -                  (match_operand 5  "const_0_to_7_operand")
> -                  (match_operand 6  "const_0_to_7_operand")
> -                  (match_operand 7  "const_8_to_15_operand")
> -                  (match_operand 8  "const_8_to_15_operand")
> -                  (match_operand 9  "const_8_to_15_operand")
> +       (parallel [(match_operand 3 "const_0_to_7_operand")
> +                  (match_operand 4 "const_0_to_7_operand")
> +                  (match_operand 5 "const_0_to_7_operand")
> +                  (match_operand 6 "const_0_to_7_operand")
> +                  (match_operand 7 "const_8_to_15_operand")
> +                  (match_operand 8 "const_8_to_15_operand")
> +                  (match_operand 9 "const_8_to_15_operand")
>                    (match_operand 10 "const_8_to_15_operand")])))]
>    "TARGET_AVX512VL
> -   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
> -       && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2)
> -       && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3)
> -       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
> -       && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2)
> -       && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3))"
> +   && (INTVAL (operands[3]) & 3) == 0
> +   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
> +   && INTVAL (operands[3]) == INTVAL (operands[5]) - 2
> +   && INTVAL (operands[3]) == INTVAL (operands[6]) - 3
> +   && (INTVAL (operands[7]) & 3) == 0
> +   && INTVAL (operands[7]) == INTVAL (operands[8]) - 1
> +   && INTVAL (operands[7]) == INTVAL (operands[9]) - 2
> +   && INTVAL (operands[7]) == INTVAL (operands[10]) - 3"
>  {
>    int mask;
>    mask = INTVAL (operands[3]) / 4;
> @@ -13545,35 +13553,39 @@ (define_insn "avx512f_shuf_<shuffletype>
>         (vec_concat:<ssedoublemode>
>           (match_operand:V16FI 1 "register_operand" "v")
>           (match_operand:V16FI 2 "nonimmediate_operand" "vm"))
> -       (parallel [(match_operand 3  "const_0_to_15_operand")
> -                  (match_operand 4  "const_0_to_15_operand")
> -                  (match_operand 5  "const_0_to_15_operand")
> -                  (match_operand 6  "const_0_to_15_operand")
> -                  (match_operand 7  "const_0_to_15_operand")
> -                  (match_operand 8  "const_0_to_15_operand")
> -                  (match_operand 9  "const_0_to_15_operand")
> -                  (match_operand 10  "const_0_to_15_operand")
> -                  (match_operand 11  "const_16_to_31_operand")
> -                  (match_operand 12  "const_16_to_31_operand")
> -                  (match_operand 13  "const_16_to_31_operand")
> -                  (match_operand 14  "const_16_to_31_operand")
> -                  (match_operand 15  "const_16_to_31_operand")
> -                  (match_operand 16  "const_16_to_31_operand")
> -                  (match_operand 17  "const_16_to_31_operand")
> -                  (match_operand 18  "const_16_to_31_operand")])))]
> +       (parallel [(match_operand 3 "const_0_to_15_operand")
> +                  (match_operand 4 "const_0_to_15_operand")
> +                  (match_operand 5 "const_0_to_15_operand")
> +                  (match_operand 6 "const_0_to_15_operand")
> +                  (match_operand 7 "const_0_to_15_operand")
> +                  (match_operand 8 "const_0_to_15_operand")
> +                  (match_operand 9 "const_0_to_15_operand")
> +                  (match_operand 10 "const_0_to_15_operand")
> +                  (match_operand 11 "const_16_to_31_operand")
> +                  (match_operand 12 "const_16_to_31_operand")
> +                  (match_operand 13 "const_16_to_31_operand")
> +                  (match_operand 14 "const_16_to_31_operand")
> +                  (match_operand 15 "const_16_to_31_operand")
> +                  (match_operand 16 "const_16_to_31_operand")
> +                  (match_operand 17 "const_16_to_31_operand")
> +                  (match_operand 18 "const_16_to_31_operand")])))]
>    "TARGET_AVX512F
> -   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
> -       && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2)
> -       && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3)
> -       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
> -       && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2)
> -       && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3)
> -       && INTVAL (operands[11]) == (INTVAL (operands[12]) - 1)
> -       && INTVAL (operands[11]) == (INTVAL (operands[13]) - 2)
> -       && INTVAL (operands[11]) == (INTVAL (operands[14]) - 3)
> -       && INTVAL (operands[15]) == (INTVAL (operands[16]) - 1)
> -       && INTVAL (operands[15]) == (INTVAL (operands[17]) - 2)
> -       && INTVAL (operands[15]) == (INTVAL (operands[18]) - 3))"
> +   && (INTVAL (operands[3]) & 3) == 0
> +   && INTVAL (operands[3]) == INTVAL (operands[4]) - 1
> +   && INTVAL (operands[3]) == INTVAL (operands[5]) - 2
> +   && INTVAL (operands[3]) == INTVAL (operands[6]) - 3
> +   && (INTVAL (operands[7]) & 3) == 0
> +   && INTVAL (operands[7]) == INTVAL (operands[8]) - 1
> +   && INTVAL (operands[7]) == INTVAL (operands[9]) - 2
> +   && INTVAL (operands[7]) == INTVAL (operands[10]) - 3
> +   && (INTVAL (operands[11]) & 3) == 0
> +   && INTVAL (operands[11]) == INTVAL (operands[12]) - 1
> +   && INTVAL (operands[11]) == INTVAL (operands[13]) - 2
> +   && INTVAL (operands[11]) == INTVAL (operands[14]) - 3
> +   && (INTVAL (operands[15]) & 3) == 0
> +   && INTVAL (operands[15]) == INTVAL (operands[16]) - 1
> +   && INTVAL (operands[15]) == INTVAL (operands[17]) - 2
> +   && INTVAL (operands[15]) == INTVAL (operands[18]) - 3"
>  {
>    int mask;
>    mask = INTVAL (operands[3]) / 4;
> --- gcc/testsuite/gcc.target/i386/avx512vl-pr87214-1.c.jj     2019-01-26
> 00:01:24.511168621 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-pr87214-1.c        2019-01-26
> 00:13:39.730135406 +0100
> @@ -0,0 +1,44 @@
> +/* PR target/87214 */
> +/* { dg-do run { target { avx512vl } } } */
> +/* { dg-options "-O3 -mavx512vl -mtune=skylake-avx512" } */
> +
> +#define AVX512VL
> +#define AVX512F_LEN 512
> +#define AVX512F_LEN_HALF 256
> +#include "avx512f-check.h"
> +
> +struct s { unsigned long a, b, c; };
> +
> +void __attribute__ ((noipa))
> +foo (struct s *restrict s1, struct s *restrict s2, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    {
> +      s1[i].b = s2[i].b;
> +      s1[i].c = s2[i].c;
> +      s2[i].c = 0;
> +    }
> +}
> +
> +#define N 12
> +
> +static void
> +test_256 (void)
> +{
> +  struct s s1[N], s2[N];
> +  for (unsigned int j = 0; j < N; ++j)
> +    {
> +      s2[j].a = j * 5;
> +      s2[j].b = j * 5 + 2;
> +      s2[j].c = j * 5 + 4;
> +    }
> +  foo (s1, s2, N);
> +  for (unsigned int j = 0; j < N; ++j)
> +  if (s1[j].b != j * 5 + 2)
> +    __builtin_abort ();
> +}
> +
> +static void
> +test_128 (void)
> +{
> +}
> --- gcc/testsuite/gcc.target/i386/avx512vl-pr87214-2.c.jj     2019-01-26
> 00:01:24.511168621 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-pr87214-2.c        2019-01-26
> 00:17:19.941530293 +0100
> @@ -0,0 +1,128 @@
> +/* PR target/87214 */
> +/* { dg-do run { target { avx512vl } } } */
> +/* { dg-options "-O2 -mavx512vl" } */
> +
> +#define AVX512VL
> +#define AVX512F_LEN 512
> +#define AVX512F_LEN_HALF 256
> +#include "avx512f-check.h"
> +
> +typedef long long int v4di __attribute__((vector_size (4 * sizeof (long
> long int))));
> +typedef double v4df __attribute__((vector_size (4 * sizeof (double))));
> +typedef long long int v8di __attribute__((vector_size (8 * sizeof (long
> long int))));
> +typedef double v8df __attribute__((vector_size (8 * sizeof (double))));
> +typedef int v8si __attribute__((vector_size (8 * sizeof (int))));
> +typedef float v8sf __attribute__((vector_size (8 * sizeof (float))));
> +typedef int v16si __attribute__((vector_size (16 * sizeof (int))));
> +typedef float v16sf __attribute__((vector_size (16 * sizeof (float))));
> +
> +__attribute__((noipa)) void
> +f1 (v4di *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v4di) { 2, 3, 5, 6 });
> +}
> +
> +__attribute__((noipa)) void
> +f2 (v4df *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v4di) { 1, 2, 6, 7 });
> +}
> +
> +__attribute__((noipa)) void
> +f3 (v8di *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v8di) { 2, 3, 5, 6, 8, 9, 11, 12
> });
> +}
> +
> +__attribute__((noipa)) void
> +f4 (v8df *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v8di) { 1, 2, 6, 7, 9, 10, 12, 13
> });
> +}
> +
> +__attribute__((noipa)) void
> +f5 (v8si *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v8si) { 2, 3, 4, 5, 9, 10, 11, 12
> });
> +}
> +
> +__attribute__((noipa)) void
> +f6 (v8sf *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v8si) { 1, 2, 3, 4, 12, 13, 14, 15
> });
> +}
> +
> +__attribute__((noipa)) void
> +f7 (v16si *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v16si) { 0, 1, 2, 3, 1, 2, 3, 4,
> 16, 17, 18, 19, 25, 26, 27, 28 });
> +}
> +
> +__attribute__((noipa)) void
> +f8 (v16sf *p)
> +{
> +  p[0] = __builtin_shuffle (p[1], p[2], (v16si) { 1, 2, 3, 4, 4, 5, 6, 7,
> 17, 18, 19, 20, 18, 19, 20, 21 });
> +}
> +
> +static void
> +test_256 (void)
> +{
> +  v4di a[3] = { { 0, 0, 0, 0 }, { 10, 11, 12, 13 }, { 14, 15, 16, 17 } };
> +  f1 (a);
> +  if (a[0][0] != 12 || a[0][1] != 13 || a[0][2] != 15 || a[0][3] != 16)
> +    __builtin_abort ();
> +  v4df b[3] = { { 0.0, 0.0, 0.0, 0.0 }, { 10.0, 11.0, 12.0, 13.0 }, { 14.0,
> 15.0, 16.0, 17.0 } };
> +  f2 (b);
> +  if (b[0][0] != 11.0 || b[0][1] != 12.0 || b[0][2] != 16.0 || b[0][3] !=
> 17.0)
> +    __builtin_abort ();
> +  v8di c[3] = { { 0, 0, 0, 0, 0, 0, 0, 0 }, { 10, 11, 12, 13, 14, 15, 16,
> 17 }, { 18, 19, 20, 21, 22, 23, 24, 25 } };
> +  f3 (c);
> +  if (c[0][0] != 12 || c[0][1] != 13 || c[0][2] != 15 || c[0][3] != 16
> +      || c[0][4] != 18 || c[0][5] != 19 || c[0][6] != 21 || c[0][7] != 22)
> +    __builtin_abort ();
> +  v8df d[3] = { { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 },
> +             { 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0 },
> +             { 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0 } };
> +  f4 (d);
> +  if (d[0][0] != 11.0 || d[0][1] != 12.0 || d[0][2] != 16.0 || d[0][3] !=
> 17.0
> +      || d[0][4] != 19.0 || d[0][5] != 20.0 || d[0][6] != 22.0 || d[0][7]
> != 23.0)
> +    __builtin_abort ();
> +  v8si e[3] = { { 0, 0, 0, 0, 0, 0, 0, 0 }, { 10, 11, 12, 13, 14, 15, 16,
> 17 }, { 18, 19, 20, 21, 22, 23, 24, 25 } };
> +  f5 (e);
> +  if (e[0][0] != 12 || e[0][1] != 13 || e[0][2] != 14 || e[0][3] != 15
> +      || e[0][4] != 19 || e[0][5] != 20 || e[0][6] != 21 || e[0][7] != 22)
> +    __builtin_abort ();
> +  v8sf f[3] = { { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f },
> +             { 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f },
> +             { 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f } };
> +  f6 (f);
> +  if (f[0][0] != 11.0f || f[0][1] != 12.0f || f[0][2] != 13.0f || f[0][3]
> != 14.0f
> +      || f[0][4] != 22.0f || f[0][5] != 23.0f || f[0][6] != 24.0f ||
> f[0][7] != 25.0f)
> +    __builtin_abort ();
> +  v16si g[3] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
> +              { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 
> 25 },
> +              { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 
> 41 } };
> +  f7 (g);
> +  if (g[0][0] != 10 || g[0][1] != 11 || g[0][2] != 12 || g[0][3] != 13
> +      || g[0][4] != 11 || g[0][5] != 12 || g[0][6] != 13 || g[0][7] != 14
> +      || g[0][8] != 26 || g[0][9] != 27 || g[0][10] != 28 || g[0][11] !=
> 29
> +      || g[0][12] != 35 || g[0][13] != 36 || g[0][14] != 37 || g[0][15] !=
> 38)
> +    __builtin_abort ();
> +  v16sf h[3] = { { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
> +                0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f },
> +              { 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f,
> +                18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f },
> +              { 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f,
> +                34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f } };
> +  f8 (h);
> +  if (h[0][0] != 11.0f || h[0][1] != 12.0f || h[0][2] != 13.0f || h[0][3]
> != 14.0f
> +      || h[0][4] != 14.0f || h[0][5] != 15.0f || h[0][6] != 16.0f ||
> h[0][7] != 17.0f
> +      || h[0][8] != 27.0f || h[0][9] != 28.0f || h[0][10] != 29.0f ||
> h[0][11] != 30.0f
> +      || h[0][12] != 28.0f || h[0][13] != 29.0f || h[0][14] != 30.0f ||
> h[0][15] != 31.0f)
> +    __builtin_abort ();
> +}
> +
> +static void
> +test_128 (void)
> +{
> +}
>
>       Jakub
>

Re: [PATCH] Fix avx512{f,vl} shuffles (PR target/87214)

Reply via email to