On Mon, Aug 12, 2019 at 4:57 PM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > The following patch adds 9 missing intrinsics, which are like _mm*_cast*, > but don't leave the upper bits undefined; they set them to zero instead. > The implementation uses code that combine manages to optimize well; > the only problem is that, as the 512-bit intrinsics are supposed to be > avx512f and some needed intrinsics they'd ideally use are avx512dq, it means > that for _mm512_zextpd128_pd512/_mm512_zextps256_ps512 we emit > vmovaps/vmovapd instead of vmovapd/vmovaps, respectively. > > I've also discovered that for AVX, there is no test coverage of the various > cast intrinsics, so I've added that too. > > The PR has some details on other possible expansions; it would be nice to > also optimize those definitions into the same code, but it will require some > extra define_insn_and_split, though I think that can be done incrementally; > and once done, perhaps we could change the > _mm512_zextpd128_pd512/_mm512_zextps256_ps512 > so that they actually generate the right ps vs. pd variant of move. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2019-08-12 Jakub Jelinek <ja...@redhat.com> > > PR target/83250 > PR target/91340 > * config/i386/avxintrin.h (_mm256_zextpd128_pd256, > _mm256_zextps128_ps256, _mm256_zextsi128_si256): New intrinsics. > * config/i386/avx512fintrin.h (_mm512_zextpd128_pd512, > _mm512_zextps128_ps512, _mm512_zextsi128_si512, > _mm512_zextpd256_pd512, > _mm512_zextps256_ps512, _mm512_zextsi256_si512): Likewise. > > * gcc.target/i386/avx-typecast-1.c: New test. > * gcc.target/i386/avx-typecast-2.c: New test. > * gcc.target/i386/avx512f-typecast-2.c: New test.
OK for AVX, LGTM for AVX512F. Thanks, Uros. > > --- gcc/config/i386/avxintrin.h.jj 2019-08-05 12:25:34.476667673 +0200 > +++ gcc/config/i386/avxintrin.h 2019-08-12 14:33:07.905601186 +0200 > @@ -1484,6 +1484,26 @@ _mm256_castsi128_si256 (__m128i __A) > return (__m256i) __builtin_ia32_si256_si ((__v4si)__A); > } > > +/* Similarly, but with zero extension instead of undefined values. */ > + > +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, > __artificial__)) > +_mm256_zextpd128_pd256 (__m128d __A) > +{ > + return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0); > +} > + > +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, > __artificial__)) > +_mm256_zextps128_ps256 (__m128 __A) > +{ > + return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0); > +} > + > +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, > __artificial__)) > +_mm256_zextsi128_si256 (__m128i __A) > +{ > + return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0); > +} > + > extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, > __artificial__)) > _mm256_set_m128 ( __m128 __H, __m128 __L) > { > --- gcc/config/i386/avx512fintrin.h.jj 2019-07-12 09:34:49.524385009 +0200 > +++ gcc/config/i386/avx512fintrin.h 2019-08-12 14:36:52.281169281 +0200 > @@ -15437,6 +15437,48 @@ _mm512_castsi256_si512 (__m256i __A) > return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A); > } > > +extern __inline __m512d > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_zextpd128_pd512 (__m128d __A) > +{ > + return (__m512d) _mm512_insertf32x4 (_mm512_setzero_ps (), (__m128) __A, > 0); > +} > + > +extern __inline __m512 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_zextps128_ps512 (__m128 __A) > +{ > + return _mm512_insertf32x4 (_mm512_setzero_ps (), __A, 0); > +} > + > +extern __inline __m512i > +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) > +_mm512_zextsi128_si512 (__m128i __A) > +{ > + return _mm512_inserti32x4 (_mm512_setzero_si512 (), __A, 0); > +} > + > +extern __inline __m512d > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_zextpd256_pd512 (__m256d __A) > +{ > + return _mm512_insertf64x4 (_mm512_setzero_pd (), __A, 0); > +} > + > +extern __inline __m512 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_zextps256_ps512 (__m256 __A) > +{ > + return (__m512) _mm512_insertf64x4 (_mm512_setzero_pd (), (__m256d) __A, > 0); > +} > + > +extern __inline __m512i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_zextsi256_si512 (__m256i __A) > +{ > + return _mm512_inserti64x4 (_mm512_setzero_si512 (), __A, 0); > +} > + > extern __inline __mmask16 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > _mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B) > --- gcc/testsuite/gcc.target/i386/avx-typecast-1.c.jj 2019-08-12 > 15:12:51.597209881 +0200 > +++ gcc/testsuite/gcc.target/i386/avx-typecast-1.c 2019-08-12 > 15:12:47.334274860 +0200 > @@ -0,0 +1,83 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-require-effective-target avx } */ > + > +#include "avx-check.h" > + > +extern int memcmp (const void *, const void *, __SIZE_TYPE__); > + > +void > +avx_test (void) > +{ > + union256i_d a, ad; > + union256 b, bd; > + union256d c, cd; > + union128i_d d, dd; > + union128 e, ed; > + union128d f, fd; > + int i; > + > + for (i = 0; i < 8; i++) > + { > + a.a[i] = 7146908634 + i; > + b.a[i] = 45.12f + i; > + } > + > + for (i = 0; i < 4; i++) > + { > + c.a[i] = 41234512513451345.0905 + i; > + d.a[i] = 109534 + i; > + e.a[i] = 85034.095f + i; > + } > + > + for (i = 0; i < 2; i++) > + f.a[i] = 41234512451345.0905 + i; > + > + bd.x = _mm256_castpd_ps (c.x); > + if (memcmp (bd.a, c.a, 32)) > + abort (); > + > + ad.x = _mm256_castpd_si256 (c.x); > + if (memcmp (ad.a, c.a, 32)) 
> + abort (); > + > + cd.x = _mm256_castps_pd (b.x); > + if (memcmp (cd.a, b.a, 32)) > + abort (); > + > + ad.x = _mm256_castps_si256 (b.x); > + if (memcmp (ad.a, b.a, 32)) > + abort (); > + > + bd.x = _mm256_castsi256_ps (a.x); > + if (memcmp (bd.a, a.a, 32)) > + abort (); > + > + cd.x = _mm256_castsi256_pd (a.x); > + if (memcmp (cd.a, a.a, 32)) > + abort (); > + > + fd.x = _mm256_castpd256_pd128 (c.x); > + if (memcmp (fd.a, c.a, 16)) > + abort (); > + > + ed.x = _mm256_castps256_ps128 (b.x); > + if (memcmp (ed.a, b.a, 16)) > + abort (); > + > + dd.x = _mm256_castsi256_si128 (a.x); > + if (memcmp (dd.a, a.a, 16)) > + abort (); > + > + cd.x = _mm256_castpd128_pd256 (f.x); > + if (memcmp (cd.a, f.a, 16)) > + abort (); > + > + bd.x = _mm256_castps128_ps256 (e.x); > + if (memcmp (bd.a, e.a, 16)) > + abort (); > + > + ad.x = _mm256_castsi128_si256 (d.x); > + if (memcmp (ad.a, d.a, 16)) > + abort (); > +} > --- gcc/testsuite/gcc.target/i386/avx-typecast-2.c.jj 2019-08-12 > 15:12:55.056157156 +0200 > +++ gcc/testsuite/gcc.target/i386/avx-typecast-2.c 2019-08-12 > 15:14:57.108296731 +0200 > @@ -0,0 +1,46 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-require-effective-target avx } */ > + > +#include "avx-check.h" > + > +extern int memcmp (const void *, const void *, __SIZE_TYPE__); > + > +void > +avx_test (void) > +{ > + union256i_d ad, zero; > + union256 bd; > + union256d cd; > + union128i_d d; > + union128 e; > + union128d f; > + int i; > + > + for (i = 0; i < 8; i++) > + zero.a[i] = 0; > + > + for (i = 0; i < 4; i++) > + { > + d.a[i] = 109534 + i; > + e.a[i] = 85034.095f + i; > + } > + > + for (i = 0; i < 2; i++) > + f.a[i] = 41234512451345.0905 + i; > + > + cd.x = _mm256_zextpd128_pd256 (f.x); > + if (memcmp (cd.a, f.a, 16) > + || memcmp (&cd.a[2], &zero.a, 16)) > + abort (); > + > + bd.x = _mm256_zextps128_ps256 (e.x); > + if (memcmp (bd.a, e.a, 16) > + || memcmp (&bd.a[4], &zero.a, 16)) > + abort (); > + > + ad.x = _mm256_zextsi128_si256 
(d.x); > + if (memcmp (ad.a, d.a, 16) > + || memcmp (&ad.a[4], &zero.a, 16)) > + abort (); > +} > --- gcc/testsuite/gcc.target/i386/avx512f-typecast-2.c.jj 2019-08-12 > 14:38:41.389500441 +0200 > +++ gcc/testsuite/gcc.target/i386/avx512f-typecast-2.c 2019-08-12 > 14:47:10.291717937 +0200 > @@ -0,0 +1,71 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx512f" } */ > +/* { dg-require-effective-target avx512f } */ > + > +#include "avx512f-check.h" > + > +extern int memcmp (const void *, const void *, __SIZE_TYPE__); > + > +void > +avx512f_test (void) > +{ > + union512i_d ad, zero; > + union512 bd; > + union512d cd; > + union256i_d d; > + union256 e; > + union256d f; > + union128i_d g; > + union128 h; > + union128d k; > + int i; > + > + for (i = 0; i < 16; i++) > + zero.a[i] = 0; > + > + for (i = 0; i < 8; i++) > + { > + d.a[i] = 109534 + i; > + e.a[i] = 85034.095f + i; > + } > + > + for (i = 0; i < 4; i++) > + { > + f.a[i] = 41234512451345.0905 + i; > + g.a[i] = 71469086341 + i; > + h.a[i] = 45.1264f + i; > + } > + > + for (i = 0; i < 2; i++) > + k.a[i] = 7146908634.576 + i; > + > + cd.x = _mm512_zextpd128_pd512 (k.x); > + if (memcmp (cd.a, k.a, 16) > + || memcmp (&cd.a[2], &zero.a, 48)) > + abort (); > + > + bd.x = _mm512_zextps128_ps512 (h.x); > + if (memcmp (bd.a, h.a, 16) > + || memcmp (&bd.a[4], &zero.a, 48)) > + abort (); > + > + ad.x = _mm512_zextsi128_si512 (g.x); > + if (memcmp (ad.a, g.a, 16) > + || memcmp (&ad.a[4], &zero.a, 48)) > + abort (); > + > + cd.x = _mm512_zextpd256_pd512 (f.x); > + if (memcmp (cd.a, f.a, 32) > + || memcmp (&cd.a[4], &zero.a, 32)) > + abort (); > + > + bd.x = _mm512_zextps256_ps512 (e.x); > + if (memcmp (bd.a, e.a, 32) > + || memcmp (&bd.a[8], &zero.a, 32)) > + abort (); > + > + ad.x = _mm512_zextsi256_si512 (d.x); > + if (memcmp (ad.a, d.a, 32) > + || memcmp (&ad.a[8], &zero.a, 32)) > + abort (); > +} > > Jakub