jina.nahias created this revision.

https://reviews.llvm.org/D38672

Files:
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
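This changes the 256-bit and 512-bit shuffle_f32x4/f64x2/i32x4/i64x2 intrinsics to be implemented with the generic __builtin_shufflevector plus the select builtins instead of the x86-specific __builtin_ia32_shuf_*_mask builtins, and updates the CodeGen tests to check for the shufflevector/select IR.

For reference, the 256-bit forms pick one 128-bit lane from A (immediate bit 0) and one from B (immediate bit 1), which the new macros encode as constant shufflevector indices over a <4 x double>/<4 x i64> view. A minimal scalar sketch of that mapping (the struct and helper name below are invented for illustration and are not part of the patch):

  /* Reference-only sketch of the 128-bit lane selection done by the
     256-bit shuffle macros; sel_a/sel_b mirror the (imm >> n) & 0x1
     terms fed to __builtin_shufflevector. */
  typedef struct { double v[4]; } v4df_ref;

  static v4df_ref shuffle_f64x2_256_ref(v4df_ref a, v4df_ref b, int imm) {
    int sel_a = (imm >> 0) & 0x1;  /* which 128-bit lane of a goes low  */
    int sel_b = (imm >> 1) & 0x1;  /* which 128-bit lane of b goes high */
    v4df_ref r;
    r.v[0] = a.v[2 * sel_a + 0];
    r.v[1] = a.v[2 * sel_a + 1];
    r.v[2] = b.v[2 * sel_b + 0];
    r.v[3] = b.v[2 * sel_b + 1];
    return r;
  }

The 512-bit forms do the same with four two-bit selectors ((imm >> 0/2/4/6) & 0x3), and the masked/maskz forms wrap the unmasked shuffle in a __builtin_ia32_select*_{256,512} call against W or a zero vector.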
Index: test/CodeGen/avx512vl-builtins.c
===================================================================
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -5630,73 +5630,85 @@
 }

 __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_shuffle_f32x4(__A, __B, 3);
 }

 __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3);
 }

 __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3);
 }

 __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_shuffle_f64x2(__A, __B, 3);
 }

 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3);
 }

 __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i1> %{{.*}}1, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3);
 }

 __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_shuffle_i32x4(__A, __B, 3);
 }

 __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3);
 }

 __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3);
 }

 __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_shuffle_i64x2(__A, __B, 3);
 }

 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3);
 }

 __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_shuffle_i64x2(__U, __A, __B, 3);
 }
Index: test/CodeGen/avx512f-builtins.c
===================================================================
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -4488,73 +4488,81 @@
 }

 __m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
   return _mm512_shuffle_f32x4(__A, __B, 4);
 }

 __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4);
 }

 __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4);
 }

 __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <8 x double> %0, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
   return _mm512_shuffle_f64x2(__A, __B, 4);
 }

 __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4);
 }

 __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4);
 }

 __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <8 x i64> %0, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
   return _mm512_shuffle_i32x4(__A, __B, 4);
 }

 __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4);
 }

 __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4);
 }

 __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <8 x i64> %0, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
   return _mm512_shuffle_i64x2(__A, __B, 4);
 }

 __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4);
 }

 __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_shuffle_i64x2(__U, __A, __B, 4);
 }
Index: lib/Headers/avx512vlintrin.h
===================================================================
--- lib/Headers/avx512vlintrin.h
+++ lib/Headers/avx512vlintrin.h
@@ -6977,85 +6977,77 @@
 #define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
-          (__v8sf)(__m256)(B), (int)(imm), \
-          (__v8sf)_mm256_setzero_ps(), \
-          (__mmask8)-1); })
+  (__m256)__builtin_shufflevector((__v4df)(__m256)(A), \
+          (__v4df)(__m256)(B), \
+          0 + ((((imm) >> 0) & 0x1) * 2), \
+          1 + ((((imm) >> 0) & 0x1) * 2), \
+          4 + ((((imm) >> 1) & 0x1) * 2), \
+          5 + ((((imm) >> 1) & 0x1) * 2)); })

 #define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
-          (__v8sf)(__m256)(B), (int)(imm), \
-          (__v8sf)(__m256)(W), \
-          (__mmask8)(U)); })
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+          (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
+          (__v8sf)(__m256)(W)); })

 #define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
-  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
-          (__v8sf)(__m256)(B), (int)(imm), \
-          (__v8sf)_mm256_setzero_ps(), \
-          (__mmask8)(U)); })
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+          (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
+          (__v8sf)_mm256_setzero_ps()); })

 #define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
-          (__v4df)(__m256d)(B), \
-          (int)(imm), \
-          (__v4df)_mm256_setzero_pd(), \
-          (__mmask8)-1); })
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+          (__v4df)(__m256d)(B), \
+          0 + ((((imm) >> 0) & 0x1) * 2), \
+          1 + ((((imm) >> 0) & 0x1) * 2), \
+          4 + ((((imm) >> 1) & 0x1) * 2), \
+          5 + ((((imm) >> 1) & 0x1) * 2)); })

 #define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
-          (__v4df)(__m256d)(B), \
-          (int)(imm), \
-          (__v4df)(__m256d)(W), \
-          (__mmask8)(U)); })
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+          (__v4df)_mm256_shuffle_f32x4((A), (B), (imm)), \
+          (__v4df)(__m256)(W)); })

 #define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
-  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
-          (__v4df)(__m256d)(B), \
-          (int)(imm), \
-          (__v4df)_mm256_setzero_pd(), \
-          (__mmask8)(U)); })
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+          (__v4df)_mm256_shuffle_f32x4((A), (B), (imm)), \
+          (__v4df)_mm256_setzero_pd()); })

 #define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
-          (__v8si)(__m256i)(B), \
-          (int)(imm), \
-          (__v8si)_mm256_setzero_si256(), \
-          (__mmask8)-1); })
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \
+          (__v4di)(__m256i)(B), \
+          0 + ((((imm) >> 0) & 0x1) * 2), \
+          1 + ((((imm) >> 0) & 0x1) * 2), \
+          4 + ((((imm) >> 1) & 0x1) * 2), \
+          5 + ((((imm) >> 1) & 0x1) * 2)); })

 #define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
-          (__v8si)(__m256i)(B), \
-          (int)(imm), \
-          (__v8si)(__m256i)(W), \
-          (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+          (__v8si)_mm256_shuffle_f32x4((A), (B), (imm)), \
+          (__v8si)(__m256)(W)); })

 #define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
-          (__v8si)(__m256i)(B), \
-          (int)(imm), \
-          (__v8si)_mm256_setzero_si256(), \
-          (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+          (__v8si)_mm256_shuffle_f32x4((A), (B), (imm)), \
+          (__v8si)_mm256_setzero_si256()); })

 #define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
-          (__v4di)(__m256i)(B), \
-          (int)(imm), \
-          (__v4di)_mm256_setzero_si256(), \
-          (__mmask8)-1); })
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \
+          (__v4di)(__m256i)(B), \
+          0 + ((((imm) >> 0) & 0x1) * 2), \
+          1 + ((((imm) >> 0) & 0x1) * 2), \
+          4 + ((((imm) >> 1) & 0x1) * 2), \
+          5 + ((((imm) >> 1) & 0x1) * 2)); })

 #define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
-          (__v4di)(__m256i)(B), \
-          (int)(imm), \
-          (__v4di)(__m256i)(W), \
-          (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+          (__v4di)_mm256_shuffle_f64x2((A), (B), (imm)), \
+          (__v4di)(__m256)(W)); })
+
 #define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
-          (__v4di)(__m256i)(B), \
-          (int)(imm), \
-          (__v4di)_mm256_setzero_si256(), \
-          (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+          (__v4di)_mm256_shuffle_f64x2((A), (B), (imm)), \
+          (__v4di)_mm256_setzero_si256()); })

 #define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
   (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
Index: lib/Headers/avx512fintrin.h
===================================================================
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -7194,76 +7194,92 @@
 }

 #define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
-          (__v16sf)(__m512)(B), (int)(imm), \
-          (__v16sf)_mm512_undefined_ps(), \
-          (__mmask16)-1); })
+  (__m512)__builtin_shufflevector((__v8df)(__m512)(A), \
+          (__v8df)(__m512)(B), \
+          0 + ((((imm) >> 0) & 0x3) * 2), \
+          1 + ((((imm) >> 0) & 0x3) * 2), \
+          0 + ((((imm) >> 2) & 0x3) * 2), \
+          1 + ((((imm) >> 2) & 0x3) * 2), \
+          8 + ((((imm) >> 4) & 0x3) * 2), \
+          9 + ((((imm) >> 4) & 0x3) * 2), \
+          8 + ((((imm) >> 6) & 0x3) * 2), \
+          9 + ((((imm) >> 6) & 0x3) * 2)); })

 #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
-          (__v16sf)(__m512)(B), (int)(imm), \
-          (__v16sf)(__m512)(W), \
-          (__mmask16)(U)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+          (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
+          (__v16sf)(__m512)(W)); })

 #define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
-  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
-          (__v16sf)(__m512)(B), (int)(imm), \
-          (__v16sf)_mm512_setzero_ps(), \
-          (__mmask16)(U)); })
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+          (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
+          (__v16sf)_mm512_setzero_ps()); })

 #define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
-          (__v8df)(__m512d)(B), (int)(imm), \
-          (__v8df)_mm512_undefined_pd(), \
-          (__mmask8)-1); })
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+          (__v8df)(__m512d)(B), \
+          0 + ((((imm) >> 0) & 0x3) * 2), \
+          1 + ((((imm) >> 0) & 0x3) * 2), \
+          0 + ((((imm) >> 2) & 0x3) * 2), \
+          1 + ((((imm) >> 2) & 0x3) * 2), \
+          8 + ((((imm) >> 4) & 0x3) * 2), \
+          9 + ((((imm) >> 4) & 0x3) * 2), \
+          8 + ((((imm) >> 6) & 0x3) * 2), \
+          9 + ((((imm) >> 6) & 0x3) * 2)); })

 #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
-          (__v8df)(__m512d)(B), (int)(imm), \
-          (__v8df)(__m512d)(W), \
-          (__mmask8)(U)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+          (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
+          (__v8df)(__m512d)(W)); })

 #define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
-  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
-          (__v8df)(__m512d)(B), (int)(imm), \
-          (__v8df)_mm512_setzero_pd(), \
-          (__mmask8)(U)); })
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+          (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
+          (__v8df)_mm512_setzero_pd()); })

 #define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
-          (__v16si)(__m512i)(B), (int)(imm), \
-          (__v16si)_mm512_setzero_si512(), \
-          (__mmask16)-1); })
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+          (__v8di)(__m512i)(B), \
+          0 + ((((imm) >> 0) & 0x3) * 2), \
+          1 + ((((imm) >> 0) & 0x3) * 2), \
+          0 + ((((imm) >> 2) & 0x3) * 2), \
+          1 + ((((imm) >> 2) & 0x3) * 2), \
+          8 + ((((imm) >> 4) & 0x3) * 2), \
+          9 + ((((imm) >> 4) & 0x3) * 2), \
+          8 + ((((imm) >> 6) & 0x3) * 2), \
+          9 + ((((imm) >> 6) & 0x3) * 2)); })

 #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
-          (__v16si)(__m512i)(B), (int)(imm), \
-          (__v16si)(__m512i)(W), \
-          (__mmask16)(U)); })
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+          (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
+          (__v16si)(__m512i)(W)); })

 #define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
-          (__v16si)(__m512i)(B), (int)(imm), \
-          (__v16si)_mm512_setzero_si512(), \
-          (__mmask16)(U)); })
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+          (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
+          (__v16si)_mm512_setzero_si512()); })

 #define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
-          (__v8di)(__m512i)(B), (int)(imm), \
-          (__v8di)_mm512_setzero_si512(), \
-          (__mmask8)-1); })
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+          (__v8di)(__m512i)(B), \
+          0 + ((((imm) >> 0) & 0x3) * 2), \
+          1 + ((((imm) >> 0) & 0x3) * 2), \
+          0 + ((((imm) >> 2) & 0x3) * 2), \
+          1 + ((((imm) >> 2) & 0x3) * 2), \
+          8 + ((((imm) >> 4) & 0x3) * 2), \
+          9 + ((((imm) >> 4) & 0x3) * 2), \
+          8 + ((((imm) >> 6) & 0x3) * 2), \
+          9 + ((((imm) >> 6) & 0x3) * 2)); })

 #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
-          (__v8di)(__m512i)(B), (int)(imm), \
-          (__v8di)(__m512i)(W), \
-          (__mmask8)(U)); })
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+          (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
+          (__v8di)(__m512i)(W)); })

 #define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
-  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
-          (__v8di)(__m512i)(B), (int)(imm), \
-          (__v8di)_mm512_setzero_si512(), \
-          (__mmask8)(U)); })
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+          (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
+          (__v8di)_mm512_setzero_si512()); })

 #define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
   (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
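Usage sketch (illustrative only; the function name below is made up): with this change, an unmasked call such as the following should lower to a single generic shufflevector in IR rather than a call to @llvm.x86.avx512.mask.shuf.f64x2, and the masked variants should additionally produce a select on the writemask.

  #include <immintrin.h>

  /* Compile with -mavx512f. imm 0x44 selects lanes {0,1} of a and
     lanes {0,1} of b, i.e. the low 256 bits of each source. */
  __m512d pick_low_halves(__m512d a, __m512d b) {
    return _mm512_shuffle_f64x2(a, b, 0x44);
  }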