On Mon, Aug 12, 2019 at 4:57 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The following patch adds 9 missing intrinsics, which are like _mm*_cast*,
> but don't leave the upper bits undefined - set them to zero instead.
> The implementation uses code that the combine pass manages to optimize well.
> The only problem is that the 512-bit intrinsics are supposed to require only
> avx512f, while some of the intrinsics they'd ideally use are avx512dq; as a
> result, for _mm512_zextpd128_pd512/_mm512_zextps256_ps512 we emit
> vmovaps/vmovapd instead of vmovapd/vmovaps.
>
> I've also discovered that for AVX, there is no test coverage of the various
> cast intrinsics, so I've added that too.
>
> The PR has some details on other possible expansions. It would be nice to
> also optimize those definitions into the same code, but that will require
> some extra define_insn_and_split patterns, though I think that can be done
> incrementally. Once that is done, perhaps we could change
> _mm512_zextpd128_pd512/_mm512_zextps256_ps512
> so that they actually generate the right ps vs. pd variant of the move.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-08-12  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/83250
>         PR target/91340
>         * config/i386/avxintrin.h (_mm256_zextpd128_pd256,
>         _mm256_zextps128_ps256, _mm256_zextsi128_si256): New intrinsics.
>         * config/i386/avx512fintrin.h (_mm512_zextpd128_pd512,
>         _mm512_zextps128_ps512, _mm512_zextsi128_si512,
>         _mm512_zextpd256_pd512, _mm512_zextps256_ps512,
>         _mm512_zextsi256_si512): Likewise.
>
>         * gcc.target/i386/avx-typecast-1.c: New test.
>         * gcc.target/i386/avx-typecast-2.c: New test.
>         * gcc.target/i386/avx512f-typecast-2.c: New test.

OK for AVX, LGTM for AVX512F.

Thanks,
Uros.

>
> --- gcc/config/i386/avxintrin.h.jj      2019-08-05 12:25:34.476667673 +0200
> +++ gcc/config/i386/avxintrin.h 2019-08-12 14:33:07.905601186 +0200
> @@ -1484,6 +1484,26 @@ _mm256_castsi128_si256 (__m128i __A)
>    return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
>  }
>
> +/* Similarly, but with zero extension instead of undefined values.  */
> +
> +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_zextpd128_pd256 (__m128d __A)
> +{
> +  return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
> +}
> +
> +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_zextps128_ps256 (__m128 __A)
> +{
> +  return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
> +}
> +
> +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_zextsi128_si256 (__m128i __A)
> +{
> +  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
> +}
> +
>  extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_set_m128 ( __m128 __H, __m128 __L)
>  {
> --- gcc/config/i386/avx512fintrin.h.jj  2019-07-12 09:34:49.524385009 +0200
> +++ gcc/config/i386/avx512fintrin.h     2019-08-12 14:36:52.281169281 +0200
> @@ -15437,6 +15437,48 @@ _mm512_castsi256_si512 (__m256i __A)
>    return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A);
>  }
>
> +extern __inline __m512d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_zextpd128_pd512 (__m128d __A)
> +{
> +  return (__m512d) _mm512_insertf32x4 (_mm512_setzero_ps (), (__m128) __A, 0);
> +}
> +
> +extern __inline __m512
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_zextps128_ps512 (__m128 __A)
> +{
> +  return _mm512_insertf32x4 (_mm512_setzero_ps (), __A, 0);
> +}
> +
> +extern __inline __m512i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_zextsi128_si512 (__m128i __A)
> +{
> +  return _mm512_inserti32x4 (_mm512_setzero_si512 (), __A, 0);
> +}
> +
> +extern __inline __m512d
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_zextpd256_pd512 (__m256d __A)
> +{
> +  return _mm512_insertf64x4 (_mm512_setzero_pd (), __A, 0);
> +}
> +
> +extern __inline __m512
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_zextps256_ps512 (__m256 __A)
> +{
> +  return (__m512) _mm512_insertf64x4 (_mm512_setzero_pd (), (__m256d) __A, 0);
> +}
> +
> +extern __inline __m512i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_zextsi256_si512 (__m256i __A)
> +{
> +  return _mm512_inserti64x4 (_mm512_setzero_si512 (), __A, 0);
> +}
> +
>  extern __inline __mmask16
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B)
> --- gcc/testsuite/gcc.target/i386/avx-typecast-1.c.jj   2019-08-12 15:12:51.597209881 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-typecast-1.c      2019-08-12 15:12:47.334274860 +0200
> @@ -0,0 +1,83 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +extern int memcmp (const void *, const void *, __SIZE_TYPE__);
> +
> +void
> +avx_test (void)
> +{
> +  union256i_d  a, ad;
> +  union256  b, bd;
> +  union256d  c, cd;
> +  union128i_d  d, dd;
> +  union128  e, ed;
> +  union128d  f, fd;
> +  int i;
> +
> +  for (i = 0; i < 8; i++)
> +    {
> +      a.a[i] = 7146908634 + i;
> +      b.a[i] = 45.12f + i;
> +    }
> +
> +  for (i = 0; i < 4; i++)
> +    {
> +      c.a[i] = 41234512513451345.0905 + i;
> +      d.a[i] = 109534 + i;
> +      e.a[i] = 85034.095f + i;
> +    }
> +
> +  for (i = 0; i < 2; i++)
> +    f.a[i] = 41234512451345.0905 + i;
> +
> +  bd.x = _mm256_castpd_ps (c.x);
> +  if (memcmp (bd.a, c.a, 32))
> +    abort ();
> +
> +  ad.x = _mm256_castpd_si256 (c.x);
> +  if (memcmp (ad.a, c.a, 32))
> +    abort ();
> +
> +  cd.x = _mm256_castps_pd (b.x);
> +  if (memcmp (cd.a, b.a, 32))
> +    abort ();
> +
> +  ad.x = _mm256_castps_si256 (b.x);
> +  if (memcmp (ad.a, b.a, 32))
> +    abort ();
> +
> +  bd.x = _mm256_castsi256_ps (a.x);
> +  if (memcmp (bd.a, a.a, 32))
> +    abort ();
> +
> +  cd.x = _mm256_castsi256_pd (a.x);
> +  if (memcmp (cd.a, a.a, 32))
> +    abort ();
> +
> +  fd.x = _mm256_castpd256_pd128 (c.x);
> +  if (memcmp (fd.a, c.a, 16))
> +    abort ();
> +
> +  ed.x = _mm256_castps256_ps128 (b.x);
> +  if (memcmp (ed.a, b.a, 16))
> +    abort ();
> +
> +  dd.x = _mm256_castsi256_si128 (a.x);
> +  if (memcmp (dd.a, a.a, 16))
> +    abort ();
> +
> +  cd.x = _mm256_castpd128_pd256 (f.x);
> +  if (memcmp (cd.a, f.a, 16))
> +    abort ();
> +
> +  bd.x = _mm256_castps128_ps256 (e.x);
> +  if (memcmp (bd.a, e.a, 16))
> +    abort ();
> +
> +  ad.x = _mm256_castsi128_si256 (d.x);
> +  if (memcmp (ad.a, d.a, 16))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-typecast-2.c.jj   2019-08-12 15:12:55.056157156 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-typecast-2.c      2019-08-12 15:14:57.108296731 +0200
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +extern int memcmp (const void *, const void *, __SIZE_TYPE__);
> +
> +void
> +avx_test (void)
> +{
> +  union256i_d ad, zero;
> +  union256 bd;
> +  union256d cd;
> +  union128i_d d;
> +  union128 e;
> +  union128d f;
> +  int i;
> +
> +  for (i = 0; i < 8; i++)
> +    zero.a[i] = 0;
> +
> +  for (i = 0; i < 4; i++)
> +    {
> +      d.a[i] = 109534 + i;
> +      e.a[i] = 85034.095f + i;
> +    }
> +
> +  for (i = 0; i < 2; i++)
> +    f.a[i] = 41234512451345.0905 + i;
> +
> +  cd.x = _mm256_zextpd128_pd256 (f.x);
> +  if (memcmp (cd.a, f.a, 16)
> +      || memcmp (&cd.a[2], &zero.a, 16))
> +    abort ();
> +
> +  bd.x = _mm256_zextps128_ps256 (e.x);
> +  if (memcmp (bd.a, e.a, 16)
> +      || memcmp (&bd.a[4], &zero.a, 16))
> +    abort ();
> +
> +  ad.x = _mm256_zextsi128_si256 (d.x);
> +  if (memcmp (ad.a, d.a, 16)
> +      || memcmp (&ad.a[4], &zero.a, 16))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-typecast-2.c.jj       2019-08-12 14:38:41.389500441 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-typecast-2.c  2019-08-12 14:47:10.291717937 +0200
> @@ -0,0 +1,71 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include "avx512f-check.h"
> +
> +extern int memcmp (const void *, const void *, __SIZE_TYPE__);
> +
> +void
> +avx512f_test (void)
> +{
> +  union512i_d ad, zero;
> +  union512 bd;
> +  union512d cd;
> +  union256i_d d;
> +  union256 e;
> +  union256d f;
> +  union128i_d g;
> +  union128 h;
> +  union128d k;
> +  int i;
> +
> +  for (i = 0; i < 16; i++)
> +    zero.a[i] = 0;
> +
> +  for (i = 0; i < 8; i++)
> +    {
> +      d.a[i] = 109534 + i;
> +      e.a[i] = 85034.095f + i;
> +    }
> +
> +  for (i = 0; i < 4; i++)
> +    {
> +      f.a[i] = 41234512451345.0905 + i;
> +      g.a[i] = 71469086341 + i;
> +      h.a[i] = 45.1264f + i;
> +    }
> +
> +  for (i = 0; i < 2; i++)
> +    k.a[i] = 7146908634.576 + i;
> +
> +  cd.x = _mm512_zextpd128_pd512 (k.x);
> +  if (memcmp (cd.a, k.a, 16)
> +      || memcmp (&cd.a[2], &zero.a, 48))
> +    abort ();
> +
> +  bd.x = _mm512_zextps128_ps512 (h.x);
> +  if (memcmp (bd.a, h.a, 16)
> +      || memcmp (&bd.a[4], &zero.a, 48))
> +    abort ();
> +
> +  ad.x = _mm512_zextsi128_si512 (g.x);
> +  if (memcmp (ad.a, g.a, 16)
> +      || memcmp (&ad.a[4], &zero.a, 48))
> +    abort ();
> +
> +  cd.x = _mm512_zextpd256_pd512 (f.x);
> +  if (memcmp (cd.a, f.a, 32)
> +      || memcmp (&cd.a[4], &zero.a, 32))
> +    abort ();
> +
> +  bd.x = _mm512_zextps256_ps512 (e.x);
> +  if (memcmp (bd.a, e.a, 32)
> +      || memcmp (&bd.a[8], &zero.a, 32))
> +    abort ();
> +
> +  ad.x = _mm512_zextsi256_si512 (d.x);
> +  if (memcmp (ad.a, d.a, 32)
> +      || memcmp (&ad.a[8], &zero.a, 32))
> +    abort ();
> +}
>
>         Jakub

Reply via email to