On Tue, Apr 7, 2020 at 2:29 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The following testcases are miscompiled, because expand_vec_perm_pshufb
> incorrectly thinks it can use vpshufb instruction for the permutations
> when it can't.
> The
>           if (vmode == V32QImode)
>             {
>               /* vpshufb only works intra lanes, it is not
>                  possible to shuffle bytes in between the lanes.  */
>               for (i = 0; i < nelt; ++i)
>                 if ((d->perm[i] ^ i) & (nelt / 2))
>                   return false;
>             }
> intra-lane check which is correct has been copied and adjusted for 64-byte
> modes into:
>           if (vmode == V64QImode)
>             {
>               /* vpshufb only works intra lanes, it is not
>                  possible to shuffle bytes in between the lanes.  */
>               for (i = 0; i < nelt; ++i)
>                 if ((d->perm[i] ^ i) & (nelt / 4))
>                   return false;
>             }
> which is not correct, because 64-byte modes have 4 lanes rather than just
> two and the above is only testing that the permutation grabs even lane elts
> from even lanes and odd lane elts from odd lanes, but not that they are
> from the same 256-bit half.
>
> The following patch fixes it by using 3 * nelt / 4 instead of nelt / 4,
> so we actually check the most significant 2 bits rather than just one.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and
> release branches?
>
> 2020-04-07  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/94509
>         * config/i386/i386-expand.c (expand_vec_perm_pshufb): Fix the check
>         for inter-lane permutation for 64-byte modes.
>
>         * gcc.target/i386/avx512bw-pr94509-1.c: New test.
>         * gcc.target/i386/avx512bw-pr94509-2.c: New test.

The patch is OK, and after reading your detailed explanation, even as
obvious everywhere.

Thanks,
Uros.


> --- gcc/config/i386/i386-expand.c.jj    2020-04-07 08:27:13.770891167 +0200
> +++ gcc/config/i386/i386-expand.c       2020-04-07 10:40:09.754089204 +0200
> @@ -16781,7 +16781,7 @@ expand_vec_perm_pshufb (struct expand_ve
>               /* vpshufb only works intra lanes, it is not
>                  possible to shuffle bytes in between the lanes.  */
>               for (i = 0; i < nelt; ++i)
> -               if ((d->perm[i] ^ i) & (nelt / 4))
> +               if ((d->perm[i] ^ i) & (3 * nelt / 4))
>                   return false;
>             }
>         }
> --- gcc/testsuite/gcc.target/i386/avx512bw-pr94509-1.c.jj       2020-04-07 
> 10:42:58.560566492 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-pr94509-1.c  2020-04-07 
> 10:44:44.010990594 +0200
> @@ -0,0 +1,30 @@
> +/* PR target/94509 */
> +/* { dg-do run { target avx512bw } } */
> +/* { dg-options "-O2 -mavx512bw" } */
> +
> +#define AVX512BW
> +#include "avx512f-helper.h"
> +
> +typedef unsigned short __attribute__ ((__vector_size__ (64))) V;
> +
> +__attribute__((noipa)) V
> +foo (V x)
> +{
> +  return __builtin_shuffle (x, (V) { 0, 0, 0, 0, 0, 0, 0, 0,
> +                                    15, 15, 15, 15, 15, 15, 15, 15,
> +                                    0, 0, 0, 0, 0, 0, 0, 0,
> +                                    15, 15, 15, 15, 15, 15, 15, 15 });
> +}
> +
> +static void
> +TEST (void)
> +{
> +  V v = foo ((V) { 1, 2, 3, 4, 5, 6, 7, 8,
> +                  9, 10, 11, 12, 13, 14, 15, 16,
> +                  17, 18, 19, 20, 21, 22, 23, 24,
> +                  25, 26, 27, 28, 29, 30, 31, 32 });
> +  unsigned int i;
> +  for (i = 0; i < sizeof (v) / sizeof (v[0]); i++)
> +    if (v[i] != ((i & 8) ? 16 : 1))
> +      abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512bw-pr94509-2.c.jj       2020-04-07 
> 10:43:01.739518984 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-pr94509-2.c  2020-04-07 
> 10:44:49.811903904 +0200
> @@ -0,0 +1,38 @@
> +/* PR target/94509 */
> +/* { dg-do run { target avx512bw } } */
> +/* { dg-options "-O2 -mavx512bw" } */
> +
> +#define AVX512BW
> +#include "avx512f-helper.h"
> +
> +typedef unsigned char __attribute__ ((__vector_size__ (64))) V;
> +
> +__attribute__((noipa)) V
> +foo (V x)
> +{
> +  return __builtin_shuffle (x, (V) { 0, 1, 0, 1, 0, 1, 0, 1,
> +                                    0, 1, 0, 1, 0, 1, 0, 1,
> +                                    30, 31, 30, 31, 30, 31, 30, 31,
> +                                    30, 31, 30, 31, 30, 31, 30, 31,
> +                                    0, 1, 0, 1, 0, 1, 0, 1,
> +                                    0, 1, 0, 1, 0, 1, 0, 1,
> +                                    30, 31, 30, 31, 30, 31, 30, 31,
> +                                    30, 31, 30, 31, 30, 31, 30, 31 });
> +}
> +
> +static void
> +TEST (void)
> +{
> +  V v = foo ((V) { 1, 2, 3, 4, 5, 6, 7, 8,
> +                  9, 10, 11, 12, 13, 14, 15, 16,
> +                  17, 18, 19, 20, 21, 22, 23, 24,
> +                  25, 26, 27, 28, 29, 30, 31, 32,
> +                  33, 34, 35, 36, 37, 38, 39, 40,
> +                  41, 42, 43, 44, 45, 46, 47, 48,
> +                  49, 50, 51, 52, 53, 54, 55, 56,
> +                  57, 58, 59, 60, 61, 62, 63, 64 });
> +  unsigned int i;
> +  for (i = 0; i < sizeof (v) / sizeof (v[0]); i++)
> +    if (v[i] != ((i & 16) ? 31 : 1) + (i & 1))
> +      abort ();
> +}
>
>         Jakub
>

Reply via email to