Re: [x86, 3/n] Replace builtins with vector extensions

2014-11-09 Thread Uros Bizjak
On Sat, Nov 8, 2014 at 1:47 PM, Marc Glisse wrote:
> Hello,
>
> This patch mechanically extends the +, - and * operators to integer
> vectors of 256 and 512 bits (the previous patch only handled 128-bit
> vectors).
>
> Regtested together with the next patch.
>
> 2014-11-10  Marc Glisse  
>
> * config/i386/avxintrin.h (__v4du, __v8su, __v16hu, __v32qu):
> New typedefs.
> * config/i386/avx512fintrin.h (__v8du, __v16su, __v32hu, __v64qu):
> Likewise.
> (_mm512_mullo_epi32, _mm512_add_epi64, _mm512_sub_epi64,
> _mm512_add_epi32, _mm512_sub_epi32): Use vector extensions
> instead of builtins.
> * config/i386/avx2intrin.h (_mm256_add_epi8, _mm256_add_epi16,
> _mm256_add_epi32, _mm256_add_epi64, _mm256_mullo_epi16,
> _mm256_mullo_epi32, _mm256_sub_epi8, _mm256_sub_epi16,
> _mm256_sub_epi32, _mm256_sub_epi64): Likewise.
> * config/i386/avx512bwintrin.h (_mm512_mullo_epi16, _mm512_add_epi8,
> _mm512_sub_epi8, _mm512_sub_epi16, _mm512_add_epi16): Likewise.
> * config/i386/avx512dqintrin.h (_mm512_mullo_epi64): Likewise.
> * config/i386/avx512vldqintrin.h (_mm256_mullo_epi64, _mm_mullo_epi64):
> Likewise.

OK.

Uros.


[x86, 3/n] Replace builtins with vector extensions

2014-11-08 Thread Marc Glisse

Hello,

This patch mechanically extends the +, - and * operators to integer
vectors of 256 and 512 bits (the previous patch only handled 128-bit
vectors).

Regtested together with the next patch.
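
The change is mechanical: each arithmetic builtin call becomes a cast to
an element-wise unsigned vector type followed by the generic operator.
A minimal standalone sketch of the idiom (the _demo names are
illustrative only, not part of the patch):

#include <immintrin.h>

/* 8 x 32-bit unsigned view of a 256-bit vector.  */
typedef unsigned int v8su_demo __attribute__ ((__vector_size__ (32)));

static inline __m256i
add_epi32_demo (__m256i __A, __m256i __B)
{
  /* Unsigned elements are used because unsigned addition wraps on
     overflow, matching the vpaddd instruction; signed overflow in GNU C
     vector arithmetic is undefined.  */
  return (__m256i) ((v8su_demo) __A + (v8su_demo) __B);
}

Expressing the operation as generic vector arithmetic lets the middle
end constant-fold and simplify these intrinsics instead of treating
them as opaque builtins.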

2014-11-10  Marc Glisse  

* config/i386/avxintrin.h (__v4du, __v8su, __v16hu, __v32qu):
New typedefs.
* config/i386/avx512fintrin.h (__v8du, __v16su, __v32hu, __v64qu):
Likewise.
(_mm512_mullo_epi32, _mm512_add_epi64, _mm512_sub_epi64,
_mm512_add_epi32, _mm512_sub_epi32): Use vector extensions
instead of builtins.
* config/i386/avx2intrin.h (_mm256_add_epi8, _mm256_add_epi16,
_mm256_add_epi32, _mm256_add_epi64, _mm256_mullo_epi16,
_mm256_mullo_epi32, _mm256_sub_epi8, _mm256_sub_epi16,
_mm256_sub_epi32, _mm256_sub_epi64): Likewise.
* config/i386/avx512bwintrin.h (_mm512_mullo_epi16, _mm512_add_epi8,
_mm512_sub_epi8, _mm512_sub_epi16, _mm512_add_epi16): Likewise.
* config/i386/avx512dqintrin.h (_mm512_mullo_epi64): Likewise.
* config/i386/avx512vldqintrin.h (_mm256_mullo_epi64, _mm_mullo_epi64):
Likewise.
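
For reference, the new unsigned typedefs follow the pattern of the
existing signed ones, along the lines of:

typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

with the 512-bit __v8du, __v16su, __v32hu and __v64qu variants declared
with __vector_size__ (64).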

--
Marc Glisse

Index: config/i386/avx2intrin.h
===================================================================
--- config/i386/avx2intrin.h	(revision 217249)
+++ config/i386/avx2intrin.h	(working copy)
@@ -97,42 +97,42 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_packus_epi16 (__m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_epi8 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
+  return (__m256i) ((__v32qu)__A + (__v32qu)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_epi16 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
+  return (__m256i) ((__v16hu)__A + (__v16hu)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_epi32 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
+  return (__m256i) ((__v8su)__A + (__v8su)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_epi64 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
+  return (__m256i) ((__v4du)__A + (__v4du)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_adds_epi8 (__m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
 }
 
 extern __inline __m256i
@@ -548,28 +548,28 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mullo_epi16 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
+  return (__m256i) ((__v16hu)__A * (__v16hu)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mullo_epi32 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
+  return (__m256i) ((__v8su)__A * (__v8su)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mul_epu32 (__m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
 }
 
 extern __inline __m256i
@@ -778,42 +778,42 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srl_epi64 (__m256i __A, __m128i __B)
 {
   return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_epi8 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
+  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_epi16 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
+  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_epi32 (__m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
+  return (__m256i) ((__v8su)__A - (__v8su)__B);
 }
 
 extern __inline __m256i
 __attr