qiucf updated this revision to Diff 407444.
Herald added a subscriber: mgorny.
Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D119407/new/

https://reviews.llvm.org/D119407

Files:
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/ppc_wrappers/bmi2intrin.h
  clang/lib/Headers/ppc_wrappers/bmiintrin.h
  clang/lib/Headers/ppc_wrappers/emmintrin.h
  clang/lib/Headers/ppc_wrappers/immintrin.h
  clang/lib/Headers/ppc_wrappers/nmmintrin.h
  clang/lib/Headers/ppc_wrappers/pmmintrin.h
  clang/lib/Headers/ppc_wrappers/smmintrin.h
  clang/lib/Headers/ppc_wrappers/tmmintrin.h
  clang/lib/Headers/ppc_wrappers/x86gprintrin.h
  clang/lib/Headers/ppc_wrappers/x86intrin.h
  clang/lib/Headers/ppc_wrappers/xmmintrin.h
  clang/test/CodeGen/PowerPC/ppc-smmintrin.c
  clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
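Usage note (not part of the patch): the new x86gprintrin.h wrapper is meant to let BMI/BMI2 code written against the x86 intrinsics compile unchanged for PowerPC, which is what the ppc-x86gprintrin.c test below exercises with -DNO_WARN_X86_INTRINSICS. A minimal sketch of how such code would be built and used; the file name, constants, and build line are illustrative only and mirror the test's RUN lines (which stop at emitting LLVM IR):

/* Hypothetical demo, not from the patch. Compile with something like:
 *   clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 \
 *         -DNO_WARN_X86_INTRINSICS -O2 pext_demo.c -o pext_demo
 */
#include <x86gprintrin.h>
#include <stdio.h>

int main(void) {
  unsigned long long src  = 0x123456789abcdef0ULL;
  unsigned long long mask = 0x0f0f0f0f0f0f0f0fULL;

  /* _pext_u64 gathers the bits of src selected by mask into the low bits;
     on POWER the wrapper emulates it with bpermd-based sequences or a
     bit-gather loop, per the FileCheck patterns in the test. */
  unsigned long long packed = _pext_u64(src, mask);

  /* __tzcnt_u64 counts trailing zero bits and lowers to llvm.cttz. */
  printf("pext=%#llx tzcnt=%d\n", packed, (int)__tzcnt_u64(src));
  return 0;
}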
Index: clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c =================================================================== --- /dev/null +++ clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c @@ -0,0 +1,239 @@ +// REQUIRES: powerpc-registered-target +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s + +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s + +#include <x86gprintrin.h> + +unsigned short us; +unsigned ui; +unsigned long long ul; + +void __attribute__((noinline)) +test_bmiintrin() { + __tzcnt_u16(us); + __andn_u32(ui, ui); + _bextr_u32(ui, ui, ui); + __bextr_u32(ui, ui); + __blsi_u32(ui); + _blsi_u32(ui); + __blsmsk_u32(ui); + _blsmsk_u32(ui); + __blsr_u32(ui); + _blsr_u32(ui); + __tzcnt_u32(ui); + _tzcnt_u32(ui); + __andn_u64(ul, ul); + _bextr_u64(ul, ui, ui); + __bextr_u64(ul, ul); + __blsi_u64(ul); + _blsi_u64(ul); + __blsmsk_u64(ul); + _blsmsk_u64(ul); + __blsr_u64(ul); + _blsr_u64(ul); + __tzcnt_u64(ul); + _tzcnt_u64(ul); +} + +// CHECK-LABEL: @test_bmiintrin + +// CHECK-LABEL: define available_externally zeroext i16 @__tzcnt_u16(i16 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i16 %{{[0-9a-zA-Z._]+}} to i32 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i32 @llvm.cttz.i32(i32 %[[CONV]], i1 false) +// CHECK: trunc i32 %[[CALL]] to i16 + +// CHECK-LABEL: define available_externally zeroext i32 @__andn_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i32 %{{[0-9a-zA-Z._]+}}, -1 +// CHECK: and i32 %[[NEG]], %1 + +// CHECK-LABEL: define available_externally zeroext i32 @_bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %[[ADD]] +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB]]1 = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i32 %[[SHL]], %[[SUB]]1 + +// CHECK-LABEL: define available_externally zeroext i32 @__bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i32 %{{[0-9a-zA-Z._]+}}, 255 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i32 %{{[0-9a-zA-Z._]+}}, 8 +// CHECK: and i32 %[[SHR]], 255 +// CHECK: call zeroext i32 @_bextr_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 0, %1 +// CHECK: and i32 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 
@__blsi_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: xor i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 @__blsmsk_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 @__blsr_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false) + +// CHECK-LABEL: define available_externally zeroext i32 @_tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false) + +// CHECK-LABEL: define available_externally i64 @__andn_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i64 %{{[0-9a-zA-Z._]+}}, -1 +// CHECK: and i64 %[[NEG]], %{{[0-9a-zA-Z._]+}} + +// CHECK-LABEL: define available_externally i64 @_bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 64, %[[ADD]] +// CHECK: %[[EXT:[0-9a-zA-Z._]+]] = zext i32 %[[SUB]] to i64 +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[EXT]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 64, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[EXT2:[0-9a-zA-Z._]+]] = zext i32 %[[SUB1]] to i64 +// CHECK: lshr i64 %[[SHL]], %[[EXT2]] + +// CHECK-LABEL: define available_externally i64 @__bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 255 +// CHECK: trunc i64 %[[AND]] to i32 +// CHECK: %[[AND1:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 65280 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %[[AND1]], 8 +// CHECK: trunc i64 %[[SHR]] to i32 +// CHECK: call i64 @_bextr_u64 + +// CHECK-LABEL: define available_externally i64 @__blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 0, %{{[0-9a-zA-Z._]+}} +// CHECK: and i64 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsi_u64 + +// CHECK-LABEL: define available_externally i64 @__blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: xor i64 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsmsk_u64 + +// CHECK-LABEL: define available_externally i64 @__blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsr_u64 + +// CHECK-LABEL: define available_externally i64 @__tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// 
CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: sext i32 %[[CAST]] to i64 + +// CHECK-LABEL: define available_externally i64 @_tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: sext i32 %[[CAST]] to i64 + +void __attribute__((noinline)) +test_bmi2intrin() { + _bzhi_u32(ui, ui); + _mulx_u32(ui, ui, &ui); + _bzhi_u64(ul, ul); + _mulx_u64(ul, ul, &ul); + _pdep_u64(ul, ul); + _pext_u64(ul, ul); + _pdep_u32(ui, ui); + _pext_u32(ui, ui); +} + +// CHECK-LABEL: @test_bmi2intrin + +// CHECK-LABEL: define available_externally zeroext i32 @_bzhi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i32 %[[SHL]], %[[SUB1]] + +// CHECK-LABEL: define available_externally zeroext i32 @_mulx_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32* noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %{{[0-9a-zA-Z._]+}}, 32 +// CHECK: trunc i64 %[[SHR]] to i32 +// CHECK: trunc i64 %{{[0-9a-zA-Z._]+}} to i32 + +// CHECK-LABEL: define available_externally i64 @_bzhi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i64 %[[SHL]], %[[SUB1]] + +// CHECK-LABEL: define available_externally i64 @_mulx_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}, i64* noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128 +// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i128 %{{[0-9a-zA-Z._]+}}, 64 +// CHECK: trunc i128 %[[SHR]] to i64 +// CHECK: trunc i128 %{{[0-9a-zA-Z._]+}} to i64 + +// CHECK-LABEL: define available_externally i64 @_pdep_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub nsw i32 64, %[[CAST]] +// CHECK: sext i32 %[[SUB]] to i64 +// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST2:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL2]] to i32 +// CHECK: sext i32 %[[CAST2]] to i64 +// CHECK: %[[SUB2:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB2]] +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]] +// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]] +// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %[[AND]] +// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1 + +// CHECK-LABEL: define 
available_externally i64 @_pext_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i1 @llvm.is.constant.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: br i1 %[[CALL]], label %[[TRUECOND:[0-9a-zA-Z._]+]], label %[[FALSECOND:[0-9a-zA-Z._]+]] +// CHECK: [[TRUECOND]]: +// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, 8 +// CHECK: or i64 %[[SHL]], %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]] +// CHECK: add nsw i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: call i64 @llvm.ppc.bpermd +// CHECK: [[FALSECOND]]: +// CHECK: call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]] +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i64 %[[AND]], %[[SUB]] +// CHECK: %[[SHR3:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR3]] +// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1 + +// CHECK-LABEL: define available_externally zeroext i32 @_pdep_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pdep_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]]) +// CHECK: trunc i64 %[[CALL]] to i32 + +// CHECK-LABEL: define available_externally zeroext i32 @_pext_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pext_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]]) +// CHECK: trunc i64 %[[CALL]] to i32 Index: clang/test/CodeGen/PowerPC/ppc-smmintrin.c =================================================================== --- clang/test/CodeGen/PowerPC/ppc-smmintrin.c +++ clang/test/CodeGen/PowerPC/ppc-smmintrin.c @@ -1,4 +1,3 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: powerpc-registered-target // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ @@ -13,6 +12,8 @@ #include <smmintrin.h> +__m128 mn1, mn2; +__m128d md1, md2; __m128i mi, m1, m2; void __attribute__((noinline)) @@ -25,102 +26,50 @@ // CHECK-LABEL: @test_extract -// CHECK: define available_externally signext i32 @_mm_extract_epi8(<2 x i64> noundef [[REG1:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG2:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG1]], <2 x i64>* [[REG3:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG2]], i32* [[REG4:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG5:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG3]], align 16 -// CHECK-NEXT: [[REG6:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG5]] to <16 x i8> -// CHECK-NEXT: 
[[REG7:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG4]], align 4 -// CHECK-NEXT: [[REG8:[0-9a-zA-Z_%.]+]] = and i32 [[REG7]], 15 -// CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = extractelement <16 x i8> [[REG6]], i32 [[REG8]] -// CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = zext i8 [[REG9]] to i32 -// CHECK-NEXT: ret i32 [[REG10]] - -// CHECK: define available_externally signext i32 @_mm_extract_epi32(<2 x i64> noundef [[REG11:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG12:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG11]], <2 x i64>* [[REG13:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG12]], i32* [[REG14:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG13]], align 16 -// CHECK-NEXT: [[REG16:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG15]] to <4 x i32> -// CHECK-NEXT: [[REG17:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG14]], align 4 -// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = and i32 [[REG17]], 3 -// CHECK-NEXT: [[REG19:[0-9a-zA-Z_%.]+]] = extractelement <4 x i32> [[REG16]], i32 [[REG18]] -// CHECK-NEXT: ret i32 [[REG19]] - -// CHECK: define available_externally signext i32 @_mm_extract_epi64(<2 x i64> noundef [[REG20:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG21:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG20]], <2 x i64>* [[REG22:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG21]], i32* [[REG23:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG24:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG22]], align 16 -// CHECK-NEXT: [[REG25:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG23]], align 4 -// CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = and i32 [[REG25]], 1 -// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG24]], i32 [[REG26]] -// CHECK-NEXT: [[REG28:[0-9a-zA-Z_%.]+]] = trunc i64 [[REG27]] to i32 -// CHECK-NEXT: ret i32 [[REG28]] - -// CHECK: define available_externally signext i32 @_mm_extract_ps(<4 x float> noundef [[REG29:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG30:[0-9a-zA-Z_%.]+]]) -// CHECK: store <4 x float> [[REG29]], <4 x float>* [[REG31:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG30]], i32* [[REG32:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG33:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG31]], align 16 -// CHECK-NEXT: [[REG34:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG33]] to <4 x i32> -// CHECK-NEXT: [[REG35:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG32]], align 4 -// CHECK-NEXT: [[REG36:[0-9a-zA-Z_%.]+]] = and i32 [[REG35]], 3 -// CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = extractelement <4 x i32> [[REG34]], i32 [[REG36]] -// CHECK-NEXT: ret i32 [[REG37]] +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 15 +// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <16 x i8> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] +// CHECK: zext i8 %[[EXT]] to i32 + +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 +// CHECK: extractelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] + +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 +// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = 
extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] +// CHECK: trunc i64 %[[EXT]] to i32 + +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 +// CHECK: extractelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] void __attribute__((noinline)) test_blend() { _mm_blend_epi16(m1, m2, 0); _mm_blendv_epi8(m1, m2, mi); + _mm_blend_ps(mn1, mn2, 0); + _mm_blendv_ps(mn1, mn2, mn1); + _mm_blend_pd(md1, md2, 0); + _mm_blendv_pd(md1, md2, md1); } // CHECK-LABEL: @test_blend -// CHECK: define available_externally <2 x i64> @_mm_blend_epi16(<2 x i64> noundef [[REG38:[0-9a-zA-Z_%.]+]], <2 x i64> noundef [[REG39:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG40:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG38]], <2 x i64>* [[REG41:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store <2 x i64> [[REG39]], <2 x i64>* [[REG42:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG40]], i32* [[REG43:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG44:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG43]], align 4 -// CHECK-NEXT: [[REG45:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG44]] to i8 -// CHECK-NEXT: [[REG46:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(signed char)(i8 noundef signext [[REG45]]) -// CHECK-NEXT: store <16 x i8> [[REG46]], <16 x i8>* [[REG47:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: [[REG48:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG47]], align 16 -// CHECK-NEXT: [[REG49:[0-9a-zA-Z_%.]+]] = call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> [[REG48]]) -// CHECK-NEXT: store <16 x i8> [[REG49]], <16 x i8>* [[REG47]], align 16 -// CHECK-NEXT: [[REG50:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG47]], align 16 -// CHECK-NEXT: [[REG51:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16])(<16 x i8> noundef [[REG50]]) -// CHECK-NEXT: store <8 x i16> [[REG51]], <8 x i16>* [[REG52:[0-9a-zA-Z_%.]+]], align 16 - -// BE: [[REG53:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG52]], align 16 -// BE-NEXT: [[REG54:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_reve(unsigned short vector[8])(<8 x i16> [[REG53]]) -// BE-NEXT: store <8 x i16> [[REG54]], <8 x i16>* [[REG52]], align 16 - -// CHECK: [[REG55:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG41]], align 16 -// CHECK-NEXT: [[REG56:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG55]] to <8 x i16> -// CHECK-NEXT: [[REG57:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG42]], align 16 -// CHECK-NEXT: [[REG58:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG57]] to <8 x i16> -// CHECK-NEXT: [[REG59:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG52]], align 16 -// CHECK-NEXT: [[REG60:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef [[REG56]], <8 x i16> noundef [[REG58]], <8 x i16> noundef [[REG59]]) -// CHECK-NEXT: [[REG61:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG60]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[REG61]] - -// CHECK: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef [[REG62:[0-9a-zA-Z_%.]+]], <2 x i64> noundef [[REG63:[0-9a-zA-Z_%.]+]], <2 x i64> noundef [[REG64:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG62]], <2 x i64>* [[REG65:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store <2 x i64> [[REG63]], <2 x i64>* [[REG66:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store <2 x i64> [[REG64]], <2 x i64>* [[REG67:[0-9a-zA-Z_%.]+]], 
align 16 -// CHECK-NEXT: [[REG68:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7) -// CHECK-NEXT: store <16 x i8> [[REG68]], <16 x i8>* [[REG69:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: [[REG70:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG67]], align 16 -// CHECK-NEXT: [[REG71:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG70]] to <16 x i8> -// CHECK-NEXT: [[REG72:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG69]], align 16 -// CHECK-NEXT: [[REG73:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef [[REG71]], <16 x i8> noundef [[REG72]]) -// CHECK-NEXT: store <16 x i8> [[REG73]], <16 x i8>* [[REG74:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: [[REG75:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG65]], align 16 -// CHECK-NEXT: [[REG76:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG75]] to <16 x i8> -// CHECK-NEXT: [[REG77:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG66]], align 16 -// CHECK-NEXT: [[REG78:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG77]] to <16 x i8> -// CHECK-NEXT: [[REG79:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG74]], align 16 -// CHECK-NEXT: [[REG80:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef [[REG76]], <16 x i8> noundef [[REG78]], <16 x i8> noundef [[REG79]]) -// CHECK-NEXT: [[REG81:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG80]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[REG81]] +// CHECK-LABEL: define available_externally <2 x i64> @_mm_blend_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[TRUNC:[0-9a-zA-Z_.]+]] = trunc i32 %{{[0-9a-zA-Z_.]+}} to i8 +// CHECK: call <16 x i8> @vec_splats(signed char)(i8 noundef signext %[[TRUNC]]) +// CHECK: call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[PACK:[0-9a-zA-Z_.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: store <8 x i16> %[[PACK]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16 +// BE: %[[REVE:[0-9a-zA-Z_.]+]] = call <8 x i16> @vec_reve(unsigned short vector[8]) +// BE: store <8 x i16> %[[REVE]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16 +// CHECK: call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7) +// CHECK: call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], unsigned char vector[16]) void __attribute__((noinline)) test_insert() { @@ -131,25 +80,251 @@ // CHECK-LABEL: @test_insert -// CHECK: define available_externally <2 x i64> @_mm_insert_epi8(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}) -// CHECK: %{{[0-9a-zA-Z_.]+}} = bitcast <2 x i64> %{{[0-9a-zA-Z_.]+}} to <16 x i8> -// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = trunc i32 %{{[0-9a-zA-Z_.]+}} to i8 -// CHECK: %[[R1:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 15 -// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <16 x i8> %{{[0-9a-zA-Z_.]+}}, i8 %[[R0]], i32 %[[R1]] -// CHECK: %[[R2:[0-9a-zA-Z_.]+]] = bitcast <16 x i8> 
%{{[0-9a-zA-Z_.]+}} to <2 x i64> -// CHECK: ret <2 x i64> %[[R2]] - -// CHECK: define available_externally <2 x i64> @_mm_insert_epi32(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}) -// CHECK: %{{[0-9a-zA-Z_.]+}} = bitcast <2 x i64> %{{[0-9a-zA-Z_.]+}} to <4 x i32> -// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 -// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %[[R0]] -// CHECK: %[[R1:[0-9a-zA-Z_.]+]] = bitcast <4 x i32> %{{[0-9a-zA-Z_.]+}} to <2 x i64> -// CHECK: ret <2 x i64> %[[R1]] - -// CHECK: define available_externally <2 x i64> @_mm_insert_epi64(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i64 noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}) -// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 -// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[R0:[0-9a-zA-Z_.]+]] -// CHECK: ret <2 x i64> %{{[0-9a-zA-Z_.]+}} +// CHECK-LABEL: define available_externally <2 x i64> @_mm_insert_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[TRUNC:[0-9a-zA-Z_.]+]] = trunc i32 %{{[0-9a-zA-Z_.]+}} to i8 +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 15 +// CHECK: insertelement <16 x i8> %{{[0-9a-zA-Z_.]+}}, i8 %[[TRUNC]], i32 %[[AND]] + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_insert_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 +// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_insert_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i64 noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 +// CHECK: insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND:[0-9a-zA-Z_.]+]] + +void __attribute__((noinline)) +test_convert() { + _mm_cvtepi16_epi32(m1); + _mm_cvtepi16_epi64(m1); + _mm_cvtepi32_epi64(m1); + _mm_cvtepi8_epi16(m1); + _mm_cvtepi8_epi32(m1); + _mm_cvtepi8_epi64(m1); + _mm_cvtepu16_epi32(m1); + _mm_cvtepu16_epi64(m1); + _mm_cvtepu32_epi64(m1); + _mm_cvtepu8_epi16(m1); + _mm_cvtepu8_epi32(m1); + _mm_cvtepu8_epi64(m1); +} + +// CHECK-LABEL: @test_convert + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: call <4 x i32> 
@vec_unpackh(short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) +// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) +// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer) +// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8]) +// CHECK: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4]) + +void __attribute__((noinline)) 
+test_minmax() { + _mm_max_epi32(m1, m2); + _mm_max_epi8(m1, m2); + _mm_max_epu16(m1, m2); + _mm_max_epu32(m1, m2); + _mm_min_epi32(m1, m2); + _mm_min_epi8(m1, m2); + _mm_min_epu16(m1, m2); + _mm_min_epu32(m1, m2); + _mm_minpos_epu16(m1); +} + +// CHECK-LABEL: @test_minmax + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_max(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_max(signed char vector[16], signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_max(unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_max(unsigned int vector[4], unsigned int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_min(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_min(signed char vector[16], signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_min(unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_min(unsigned int vector[4], unsigned int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_minpos_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 %{{[0-9a-zA-Z_.]+}}, i8 0, i64 16, i1 false) +// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}} +// CHECK: %[[VEXT:[0-9a-zA-Z_.]+]] = extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: zext i16 %[[VEXT]] to i32 +// CHECK: zext i16 %{{[0-9a-zA-Z_.]+}} to i32 +// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: add i64 %{{[0-9a-zA-Z_.]+}}, 1 + +void __attribute__((noinline)) +test_round() { + _mm_round_ps(mn1, 0); + _mm_round_ss(mn1, mn2, 0); + _mm_round_pd(mn1, 0); + _mm_round_sd(mn1, mn2, 0); +} + +// CHECK-LABEL: @test_round + +// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0) +// 
CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" +// CHECK: call <4 x float> @vec_rint(float vector[4]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x float> @vec_floor(float vector[4]) +// CHECK: call <4 x float> @vec_ceil(float vector[4]) +// CHECK: call <4 x float> @vec_trunc(float vector[4]) +// CHECK: call <4 x float> @vec_rint(float vector[4]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0 + +// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" +// CHECK: call <2 x double> @vec_rint(double vector[2]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x double> @vec_floor(double vector[2]) +// CHECK: call <2 x double> @vec_ceil(double vector[2]) +// CHECK: call <2 x double> @vec_trunc(double vector[2]) +// CHECK: call <2 x double> @vec_rint(double vector[2]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0 +// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 1 + +void __attribute__((noinline)) +test_testing() { + _mm_testc_si128(m1, m2); + _mm_testnzc_si128(m1, m2); + _mm_testz_si128(m1, m2); +} + +// CHECK-LABEL: @test_testing + +// CHECK-LABEL: define available_externally signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_nor(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16]) +// 
CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call1, <16 x i8> noundef zeroinitializer) + +// CHECK-LABEL: define available_externally signext i32 @_mm_testnzc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: zext i1 %{{[0-9a-zA-Z_.]+}} to i32 + +// CHECK-LABEL: define available_externally signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call, <16 x i8> noundef zeroinitializer) + +void __attribute__((noinline)) +test_compare() { + _mm_cmpeq_epi64(m1, m2); + _mm_cmpgt_epi64(m1, m2); +} + +// CHECK-LABEL: @test_compare + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpeq_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_cmpeq(long long vector[2], long long vector[2]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpgt_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_cmpgt(long long vector[2], long long vector[2]) + +void __attribute__((noinline)) +test_mul() { + _mm_mul_epi32(m1, m2); + _mm_mullo_epi32(m1, m2); +} + +// CHECK-LABEL: @test_mul + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_mul_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %call = call <2 x i64> @vec_mule(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_mullo_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %call = call <4 x i32> @vec_mul(unsigned int vector[4], unsigned int vector[4]) + +void __attribute__((noinline)) +test_packus() { + _mm_packus_epi32(m1, m2); +} + +// CHECK-LABEL: @test_packus + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_packus_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_packsu(int vector[4], int vector[4])(<4 x i32> noundef %1, <4 x i32> noundef %3) // To test smmintrin.h includes tmmintrin.h @@ -160,4 +335,4 @@ // CHECK-LABEL: @test_abs_ssse3 -// CHECK: define available_externally <2 x i64> @_mm_abs_epi16(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}) +// CHECK-LABEL: define available_externally <2 x i64> @_mm_abs_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) Index: clang/lib/Headers/ppc_wrappers/xmmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/xmmintrin.h +++ clang/lib/Headers/ppc_wrappers/xmmintrin.h @@ -31,8 +31,8 @@ #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." #endif -#ifndef _XMMINTRIN_H_INCLUDED -#define _XMMINTRIN_H_INCLUDED +#ifndef XMMINTRIN_H_ +#define XMMINTRIN_H_ #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) @@ -62,13 +62,14 @@ /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. 
*/ -typedef vector float __m128 __attribute__((__may_alias__)); +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); /* Unaligned version of the same type. */ -typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); +typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, + __aligned__ (1))); /* Internal data types for implementing the intrinsics. */ -typedef vector float __v4sf; +typedef float __v4sf __attribute__ ((__vector_size__ (16))); /* Create an undefined vector. */ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -881,7 +882,7 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si32 (__m128 __A) { - __m64 res = 0; + int res; #ifdef _ARCH_PWR8 double dtmp; __asm__( @@ -914,8 +915,8 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si64 (__m128 __A) { - __m64 res = 0; -#ifdef _ARCH_PWR8 + long long res; +#if defined (_ARCH_PWR8) && defined (__powerpc64__) double dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ @@ -1328,6 +1329,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_ps (__m128 __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__vector unsigned int) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1347,6 +1351,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1553,6 +1558,7 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pi8 (__m64 __A) { +#ifdef __powerpc64__ unsigned long long p = #ifdef __LITTLE_ENDIAN__ 0x0008101820283038UL; // permute control for sign bits @@ -1560,6 +1566,18 @@ 0x3830282018100800UL; // permute control for sign bits #endif return __builtin_bpermd (p, __A); +#else +#ifdef __LITTLE_ENDIAN__ + unsigned int mask = 0x20283038UL; + unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; +#else + unsigned int mask = 0x38302820UL; + unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf; +#endif + return (r2 << 4) | r1; +#endif } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1841,4 +1859,4 @@ #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _XMMINTRIN_H_INCLUDED */ +#endif /* XMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/x86intrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/x86intrin.h @@ -0,0 +1,28 @@ +/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. 
+ Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#endif + +#ifndef X86INTRIN_H_ +#define X86INTRIN_H_ + +#ifdef __ALTIVEC__ +#include <immintrin.h> +#endif /* __ALTIVEC__ */ + +#endif /* X86INTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/x86gprintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/x86gprintrin.h @@ -0,0 +1,17 @@ +/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef X86GPRINTRIN_H_ +#define X86GPRINTRIN_H_ + +#include <bmiintrin.h> + +#include <bmi2intrin.h> + +#endif /* X86GPRINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/tmmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/tmmintrin.h +++ clang/lib/Headers/ppc_wrappers/tmmintrin.h @@ -339,6 +339,7 @@ return (__m64) ((__v2du) (__C))[0]; } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi8 (__m128i __A, __m128i __B) @@ -350,7 +351,9 @@ __v16qi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi16 (__m128i __A, __m128i __B) @@ -362,7 +365,9 @@ __v8hi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi32 (__m128i __A, __m128i __B) @@ -374,7 +379,9 @@ __v4si __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi8 (__m64 __A, __m64 __B) @@ -385,7 +392,9 @@ __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi16 (__m64 __A, __m64 __B) @@ -396,7 +405,9 @@ __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi32 (__m64 __A, __m64 __B) @@ -407,6 +418,7 @@ __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) Index: clang/lib/Headers/ppc_wrappers/smmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/smmintrin.h +++ clang/lib/Headers/ppc_wrappers/smmintrin.h @@ -34,77 +34,683 @@ #include <altivec.h> #include <tmmintrin.h> -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_extract_epi8(__m128i __X, const int __N) { - return (unsigned char)((__v16qi)__X)[__N & 15]; +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_ZERO 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_NEG_INF 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_NINT \ + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR \ + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL \ + (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC \ + (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __A, int __rounding) +{ + __v2df __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v2df) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v2df) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v2df) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v2df) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v2df) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. 
*/ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128d) __r; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi32(__m128i __X, const int __N) { +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd (__m128d __A, __m128d __B, int __rounding) +{ + __B = _mm_round_pd (__B, __rounding); + __v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] }; + return (__m128d) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __A, int __rounding) +{ + __v4sf __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v4sf) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v4sf) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v4sf) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v4sf) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v4sf) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. 
*/ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __A, __m128 __B, int __rounding) +{ + __B = _mm_round_ps (__B, __rounding); + __v4sf __r = (__v4sf) __A; + __r[0] = ((__v4sf) __B)[0]; + return (__m128) __r; +} + +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N) +{ + __v16qi result = (__v16qi)__A; + + result [__N & 0xf] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i const __A, int const __D, int const __N) +{ + __v4si result = (__v4si)__A; + + result [__N & 3] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) +{ + __v2di result = (__v2di)__A; + + result [__N & 1] = __D; + + return (__m128i) result; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi8 (__m128i __X, const int __N) +{ + return (unsigned char) ((__v16qi)__X)[__N & 15]; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi32 (__m128i __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi64(__m128i __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi64 (__m128i __X, const int __N) +{ return ((__v2di)__X)[__N & 1]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_ps(__m128 __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8) +{ + __v16qi __charmask = vec_splats ((signed char) __imm8); + __charmask = vec_gb (__charmask); + __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask); + #ifdef __BIG_ENDIAN__ + __shortmask = vec_reve (__shortmask); + #endif + return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask); +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask); +#else + const __v16qu __seven = vec_splats ((unsigned char) 0x07); + 
__v16qu __lmask = vec_sra ((__v16qu) __mask, __seven); + return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask); +#endif +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask); +#else + const __v4si __zero = {0}; + const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero); + return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask); +#endif +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128d) __r; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask); +#else + const __v2di __zero = {0}; + const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero); + return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask); +#endif +} +#endif + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. 
*/ + const __v16qu __zero = {0}; + const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A); + return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0; +} + +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +#define _mm_test_all_ones(V) \ + _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) + +#ifdef _ARCH_PWR8 extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) { - __v16qi __charmask = vec_splats((signed char)__imm8); - __charmask = vec_gb(__charmask); - __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask); -#ifdef __BIG_ENDIAN__ - __shortmask = vec_reve(__shortmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y); +} #endif - return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask); + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) { - const __v16qu __seven = vec_splats((unsigned char)0x07); - __v16qu __lmask = vec_sra((__v16qu)__mask, __seven); - return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { - __v16qi result = (__v16qi)__A; - result[__N & 0xf] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { - __v4si result = (__v4si)__A; - result[__N & 3] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { - __v2di result = (__v2di)__A; - result[__N & 1] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_max_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi16 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v16qi) __A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi32 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi32 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi64 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v4si) __A); } +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi16 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi32 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi64 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. 
*/ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi32 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi64 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_epi64 (__m128i __A) +{ + const __v4su __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v4su) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __A) +{ + union __u + { + __m128i __m; + __v8hu __uh; + }; + union __u __u = { .__m = __A }, __r = { .__m = {0} }; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + for (unsigned long __i = 1; __i < 8; __i++) + { + if (__u.__uh[__i] < __rmin) + { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y); +} +#endif #else #include_next <smmintrin.h> #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _SMMINTRIN_H_ */ +#endif /* SMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/pmmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/pmmintrin.h +++ clang/lib/Headers/ppc_wrappers/pmmintrin.h @@ -111,17 +111,21 @@ vec_mergel ((__v2df) __X, (__v2df)__Y)); } +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movehdup_ps (__m128 __X) { return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_moveldup_ps (__m128 __X) { return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); } +#endif extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loaddup_pd (double const *__P) Index: clang/lib/Headers/ppc_wrappers/nmmintrin.h 
=================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/nmmintrin.h @@ -0,0 +1,26 @@ +/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#endif + +#ifndef NMMINTRIN_H_ +#define NMMINTRIN_H_ + +/* We just include SSE4.1 header file. */ +#include <smmintrin.h> + +#endif /* NMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/immintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/immintrin.h @@ -0,0 +1,27 @@ +/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef IMMINTRIN_H_ +#define IMMINTRIN_H_ + +#include <x86gprintrin.h> + +#include <mmintrin.h> + +#include <xmmintrin.h> + +#include <emmintrin.h> + +#include <pmmintrin.h> + +#include <tmmintrin.h> + +#include <smmintrin.h> + +#endif /* IMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/emmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/emmintrin.h +++ clang/lib/Headers/ppc_wrappers/emmintrin.h @@ -405,20 +405,10 @@ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_pd (__m128d __A, __m128d __B) { -#if _ARCH_PWR8 __v2du c, d; /* Compare against self will return false (0's) if NAN. */ c = (__v2du)vec_cmpeq (__A, __A); d = (__v2du)vec_cmpeq (__B, __B); -#else - __v2du a, b; - __v2du c, d; - const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; - a = (__v2du)vec_abs ((__v2df)__A); - b = (__v2du)vec_abs ((__v2df)__B); - c = (__v2du)vec_cmpgt (double_exp_mask, a); - d = (__v2du)vec_cmpgt (double_exp_mask, b); -#endif /* A != NAN and B != NAN. 
*/ return ((__m128d)vec_and(c, d)); } @@ -861,7 +851,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -896,7 +890,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4sf) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -925,7 +923,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -1205,6 +1207,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pd (__m128d __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v2du) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1224,6 +1229,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1434,6 +1440,7 @@ return ((__m64)a * (__m64)b); } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_epu32 (__m128i __A, __m128i __B) { @@ -1460,6 +1467,7 @@ return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); #endif } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16 (__m128i __A, int __B) @@ -1749,7 +1757,7 @@ lshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (lshift, shmax); result = vec_sl ((__v2du) __A, lshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1843,7 +1851,7 @@ rshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (rshift, shmax); result = vec_sr ((__v2du) __A, rshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1995,10 +2003,14 @@ #ifdef _ARCH_PWR8 /* Intrinsic functions that require PowerISA 2.07 minimum. */ -/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +/* Return a mask created from the most significant bit of each 8-bit + element in A. */ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_epi8 (__m128i __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v16qu) __A); +#else __vector unsigned long long result; static const __vector unsigned char perm_mask = { @@ -2015,6 +2027,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -2158,27 +2171,37 @@ _mm_sad_epu8 (__m128i __A, __m128i __B) { __v16qu a, b; - __v16qu vmin, vmax, vabsdiff; + __v16qu vabsdiff; __v4si vsum; const __v4su zero = { 0, 0, 0, 0 }; __v4si result; a = (__v16qu) __A; b = (__v16qu) __B; - vmin = vec_min (a, b); - vmax = vec_max (a, b); +#ifndef _ARCH_PWR9 + __v16qu vmin = vec_min (a, b); + __v16qu vmax = vec_max (a, b); vabsdiff = vec_sub (vmax, vmin); +#else + vabsdiff = vec_absd (a, b); +#endif /* Sum four groups of bytes into integers. */ vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); +#ifdef __LITTLE_ENDIAN__ + /* Sum across four integers with two integer results. 
*/ + asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero)); + /* Note: vec_sum2s could be used here, but on little-endian, vector + shifts are added that are not needed for this use-case. + A vector shift to correctly position the 32-bit integer results + (currently at [0] and [2]) to [1] and [3] would then need to be + swapped back again since the desired results are two 64-bit + integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */ +#else /* Sum across four integers with two integer results. */ result = vec_sum2s (vsum, (__vector signed int) zero); /* Rotate the sums into the correct position. */ -#ifdef __LITTLE_ENDIAN__ - result = vec_sld (result, result, 4); -#else result = vec_sld (result, result, 6); #endif - /* Rotate the sums into the correct position. */ return (__m128i) result; } Index: clang/lib/Headers/ppc_wrappers/bmiintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/bmiintrin.h @@ -0,0 +1,165 @@ +/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead." +#endif + +#ifndef BMIINTRIN_H_ +#define BMIINTRIN_H_ + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u16(unsigned short __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u32(unsigned int __X, unsigned int __Y) { + return (~__X & __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) { + return ((__X << (32 - (__L + __P))) >> (32 - __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u32(unsigned int __X, unsigned int __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y >> 8) & 0xFF; + return (_bextr_u32(__X, __P, __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u32(unsigned int __X) { + return (__X & -__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u32(unsigned int __X) { + return __blsi_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u32(unsigned int __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u32(unsigned int __X) { + return __blsmsk_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u32(unsigned int __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u32(unsigned int __X) { + return __blsr_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); 
+} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +/* use the 64-bit shift, rotate, and count leading zeros instructions + for long long. */ +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u64(unsigned long long __X, unsigned long long __Y) { + return (~__X & __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) { + return ((__X << (64 - (__L + __P))) >> (64 - __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u64(unsigned long long __X, unsigned long long __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y & 0xFF00) >> 8; + return (_bextr_u64(__X, __P, __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u64(unsigned long long __X) { + return __X & -__X; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u64(unsigned long long __X) { + return __blsi_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u64(unsigned long long __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u64(unsigned long long __X) { + return __blsmsk_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u64(unsigned long long __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u64(unsigned long long __X) { + return __blsr_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} +#endif /* __PPC64__ */ + +#endif /* BMIINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/bmi2intrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/bmi2intrin.h @@ -0,0 +1,133 @@ +/*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." 
+#endif + +#ifndef BMI2INTRIN_H_ +#define BMI2INTRIN_H_ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u32(unsigned int __X, unsigned int __Y) { + return ((__X << (32 - __Y)) >> (32 - __Y)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { + unsigned long long __res = (unsigned long long)__X * __Y; + *__P = (unsigned int)(__res >> 32); + return (unsigned int)__res; +} + +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u64(unsigned long long __X, unsigned long long __Y) { + return ((__X << (64 - __Y)) >> (64 - __Y)); +} + +/* __int128 requires base 64-bit. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u64(unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) { + unsigned __int128 __res = (unsigned __int128)__X * __Y; + *__P = (unsigned long long)(__res >> 64); + return (unsigned long long)__res; +} + +#ifdef _ARCH_PWR7 +/* popcount and bpermd require power7 minimum. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u64(unsigned long long __X, unsigned long long __M) { + unsigned long result = 0x0UL; + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c, t; + unsigned long p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl(__M); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) { + c = __builtin_clzl(m); + t = __X << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u64(unsigned long long __X, unsigned long long __M) { + unsigned long p = 0x4040404040404040UL; // initial bit permute control + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c; + unsigned long result; + + /* if the mask is constant and selects 8 bits or less we can use + the Power8 Bit permute instruction. */ + if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) { + /* Also if the pext mask is constant, then the popcount is + constant, we can evaluate the following loop at compile + time and use a constant bit permute vector. */ + for (long i = 0; i < __builtin_popcountl(__M); i++) { + c = __builtin_clzl(m); + p = (p << 8) | c; + m ^= (mask >> c); + } + result = __builtin_bpermd(p, __X); + } else { + p = 64 - __builtin_popcountl(__M); + result = 0; + /* We could a use a for loop here, but that combined with + -funroll-loops can expand to a lot of code. The while + loop avoids unrolling and the compiler commons the xor + from clearing the mask bit with the (m != 0) test. The + result is a more compact loop setup and body. */ + while (m != 0) { + unsigned long t; + c = __builtin_clzl(m); + t = (__X & (mask >> c)) >> (p - c); + m ^= (mask >> c); + result |= (t); + p++; + } + } + return (result); +} + +/* these 32-bit implementations depend on 64-bit pdep/pext + which depend on _ARCH_PWR7. 
*/ +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u32(unsigned int __X, unsigned int __Y) { + return _pdep_u64(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u32(unsigned int __X, unsigned int __Y) { + return _pext_u64(__X, __Y); +} +#endif /* _ARCH_PWR7 */ +#endif /* __PPC64__ */ + +#endif /* BMI2INTRIN_H_ */ Index: clang/lib/Headers/CMakeLists.txt =================================================================== --- clang/lib/Headers/CMakeLists.txt +++ clang/lib/Headers/CMakeLists.txt @@ -161,6 +161,12 @@ ppc_wrappers/pmmintrin.h ppc_wrappers/tmmintrin.h ppc_wrappers/smmintrin.h + ppc_wrappers/bmiintrin.h + ppc_wrappers/bmi2intrin.h + ppc_wrappers/immintrin.h + ppc_wrappers/tmmintrin.h + ppc_wrappers/x86intrin.h + ppc_wrappers/x86gprintrin.h ) set(openmp_wrapper_files
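For readers porting code with these wrappers, here is a minimal usage sketch of the new x86gprintrin.h path; it is illustrative only and not part of the patch. The sample values, the hosted build (stdio), and the file name are assumptions; the NO_WARN_X86_INTRINSICS define and the -mcpu=pwr7 baseline follow the RUN lines of the test above.

#define NO_WARN_X86_INTRINSICS
#include <x86gprintrin.h>
#include <stdio.h>

int main(void) {
  unsigned int x = 0x00f0u;

  /* _bextr_u32(x, start, len): extract the 4-bit field at bit 4 -> 0xf. */
  printf("bextr: %u\n", _bextr_u32(x, 4, 4));

  /* __blsi_u32 isolates the lowest set bit: 0x00f0 -> 0x10. */
  printf("blsi:  %#x\n", __blsi_u32(x));

  /* _pext_u64 gathers the bits selected by the mask into the low-order
     bits; it is only declared for 64-bit targets built for at least
     power7, per the guards in bmi2intrin.h. */
  printf("pext:  %#llx\n", _pext_u64(0xffffull, 0x00f0ull));
  return 0;
}

Built along the lines of the RUN commands above, e.g. clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 demo.c (demo.c is a hypothetical file name), this should print 15, 0x10 and 0xf.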
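Likewise, a small sketch of the smmintrin.h rounding, floor and test helpers added above; again illustrative only, with assumed sample values and an assumed -mcpu=power8 baseline (the individual guards in the header govern the actual minimum CPU for each intrinsic).

#define NO_WARN_X86_INTRINSICS
#include <emmintrin.h>
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_pd(2.75, -1.25); /* element 0 is -1.25, element 1 is 2.75 */
  double out[2];

  /* _MM_FROUND_TO_NEAREST_INT temporarily forces round-to-nearest-even
     via the FPSCR, so -1.25 -> -1.0 and 2.75 -> 3.0. */
  _mm_storeu_pd(out, _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT));
  printf("round: %g %g\n", out[0], out[1]);

  /* _mm_floor_pd is a macro over _mm_round_pd(..., _MM_FROUND_FLOOR):
     -1.25 -> -2.0 and 2.75 -> 2.0. */
  _mm_storeu_pd(out, _mm_floor_pd(v));
  printf("floor: %g %g\n", out[0], out[1]);

  /* _mm_testz_si128 returns 1 when (A & B) is all zeros; the two
     constants below have disjoint bits. */
  __m128i a = _mm_set1_epi32(0x0f);
  __m128i b = _mm_set1_epi32(0xf0);
  printf("testz: %d\n", _mm_testz_si128(a, b));
  return 0;
}

As the comments in the patch note, the testz/testc/testnzc emulations only return the comparison result and do not set condition flags, so code that relied on the x86 flag side effects needs review when ported.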