qiucf updated this revision to Diff 407444.
Herald added a subscriber: mgorny.
Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D119407/new/

https://reviews.llvm.org/D119407

Files:
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/ppc_wrappers/bmi2intrin.h
  clang/lib/Headers/ppc_wrappers/bmiintrin.h
  clang/lib/Headers/ppc_wrappers/emmintrin.h
  clang/lib/Headers/ppc_wrappers/immintrin.h
  clang/lib/Headers/ppc_wrappers/nmmintrin.h
  clang/lib/Headers/ppc_wrappers/pmmintrin.h
  clang/lib/Headers/ppc_wrappers/smmintrin.h
  clang/lib/Headers/ppc_wrappers/tmmintrin.h
  clang/lib/Headers/ppc_wrappers/x86gprintrin.h
  clang/lib/Headers/ppc_wrappers/x86intrin.h
  clang/lib/Headers/ppc_wrappers/xmmintrin.h
  clang/test/CodeGen/PowerPC/ppc-smmintrin.c
  clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
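Usage note (not part of the patch): the new x86gprintrin.h wrapper is meant to let BMI/BMI2 code written against the x86 intrinsics compile unchanged for PowerPC, which is what the ppc-x86gprintrin.c test below exercises with -DNO_WARN_X86_INTRINSICS. A minimal sketch of how such code would be built and used; the file name, constants, and build line are illustrative only and mirror the test's RUN lines (which stop at emitting LLVM IR):

/* Hypothetical demo, not from the patch. Compile with something like:
 *   clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 \
 *         -DNO_WARN_X86_INTRINSICS -O2 pext_demo.c -o pext_demo
 */
#include <x86gprintrin.h>
#include <stdio.h>

int main(void) {
  unsigned long long src  = 0x123456789abcdef0ULL;
  unsigned long long mask = 0x0f0f0f0f0f0f0f0fULL;

  /* _pext_u64 gathers the bits of src selected by mask into the low bits;
     on POWER the wrapper emulates it with bpermd-based sequences or a
     bit-gather loop, per the FileCheck patterns in the test. */
  unsigned long long packed = _pext_u64(src, mask);

  /* __tzcnt_u64 counts trailing zero bits and lowers to llvm.cttz. */
  printf("pext=%#llx tzcnt=%d\n", packed, (int)__tzcnt_u64(src));
  return 0;
}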
Index: clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c =================================================================== --- /dev/null +++ clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c @@ -0,0 +1,239 @@ +// REQUIRES: powerpc-registered-target +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s + +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s + +#include <x86gprintrin.h> + +unsigned short us; +unsigned ui; +unsigned long long ul; + +void __attribute__((noinline)) +test_bmiintrin() { + __tzcnt_u16(us); + __andn_u32(ui, ui); + _bextr_u32(ui, ui, ui); + __bextr_u32(ui, ui); + __blsi_u32(ui); + _blsi_u32(ui); + __blsmsk_u32(ui); + _blsmsk_u32(ui); + __blsr_u32(ui); + _blsr_u32(ui); + __tzcnt_u32(ui); + _tzcnt_u32(ui); + __andn_u64(ul, ul); + _bextr_u64(ul, ui, ui); + __bextr_u64(ul, ul); + __blsi_u64(ul); + _blsi_u64(ul); + __blsmsk_u64(ul); + _blsmsk_u64(ul); + __blsr_u64(ul); + _blsr_u64(ul); + __tzcnt_u64(ul); + _tzcnt_u64(ul); +} + +// CHECK-LABEL: @test_bmiintrin + +// CHECK-LABEL: define available_externally zeroext i16 @__tzcnt_u16(i16 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i16 %{{[0-9a-zA-Z._]+}} to i32 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i32 @llvm.cttz.i32(i32 %[[CONV]], i1 false) +// CHECK: trunc i32 %[[CALL]] to i16 + +// CHECK-LABEL: define available_externally zeroext i32 @__andn_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i32 %{{[0-9a-zA-Z._]+}}, -1 +// CHECK: and i32 %[[NEG]], %1 + +// CHECK-LABEL: define available_externally zeroext i32 @_bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %[[ADD]] +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB]]1 = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i32 %[[SHL]], %[[SUB]]1 + +// CHECK-LABEL: define available_externally zeroext i32 @__bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i32 %{{[0-9a-zA-Z._]+}}, 255 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i32 %{{[0-9a-zA-Z._]+}}, 8 +// CHECK: and i32 %[[SHR]], 255 +// CHECK: call zeroext i32 @_bextr_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 0, %1 +// CHECK: and i32 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 
@__blsi_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: xor i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 @__blsmsk_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 @__blsr_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false) + +// CHECK-LABEL: define available_externally zeroext i32 @_tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false) + +// CHECK-LABEL: define available_externally i64 @__andn_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i64 %{{[0-9a-zA-Z._]+}}, -1 +// CHECK: and i64 %[[NEG]], %{{[0-9a-zA-Z._]+}} + +// CHECK-LABEL: define available_externally i64 @_bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 64, %[[ADD]] +// CHECK: %[[EXT:[0-9a-zA-Z._]+]] = zext i32 %[[SUB]] to i64 +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[EXT]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 64, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[EXT2:[0-9a-zA-Z._]+]] = zext i32 %[[SUB1]] to i64 +// CHECK: lshr i64 %[[SHL]], %[[EXT2]] + +// CHECK-LABEL: define available_externally i64 @__bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 255 +// CHECK: trunc i64 %[[AND]] to i32 +// CHECK: %[[AND1:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 65280 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %[[AND1]], 8 +// CHECK: trunc i64 %[[SHR]] to i32 +// CHECK: call i64 @_bextr_u64 + +// CHECK-LABEL: define available_externally i64 @__blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 0, %{{[0-9a-zA-Z._]+}} +// CHECK: and i64 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsi_u64 + +// CHECK-LABEL: define available_externally i64 @__blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: xor i64 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsmsk_u64 + +// CHECK-LABEL: define available_externally i64 @__blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsr_u64 + +// CHECK-LABEL: define available_externally i64 @__tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// 
CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: sext i32 %[[CAST]] to i64 + +// CHECK-LABEL: define available_externally i64 @_tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: sext i32 %[[CAST]] to i64 + +void __attribute__((noinline)) +test_bmi2intrin() { + _bzhi_u32(ui, ui); + _mulx_u32(ui, ui, &ui); + _bzhi_u64(ul, ul); + _mulx_u64(ul, ul, &ul); + _pdep_u64(ul, ul); + _pext_u64(ul, ul); + _pdep_u32(ui, ui); + _pext_u32(ui, ui); +} + +// CHECK-LABEL: @test_bmi2intrin + +// CHECK-LABEL: define available_externally zeroext i32 @_bzhi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i32 %[[SHL]], %[[SUB1]] + +// CHECK-LABEL: define available_externally zeroext i32 @_mulx_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32* noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %{{[0-9a-zA-Z._]+}}, 32 +// CHECK: trunc i64 %[[SHR]] to i32 +// CHECK: trunc i64 %{{[0-9a-zA-Z._]+}} to i32 + +// CHECK-LABEL: define available_externally i64 @_bzhi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i64 %[[SHL]], %[[SUB1]] + +// CHECK-LABEL: define available_externally i64 @_mulx_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}, i64* noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128 +// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i128 %{{[0-9a-zA-Z._]+}}, 64 +// CHECK: trunc i128 %[[SHR]] to i64 +// CHECK: trunc i128 %{{[0-9a-zA-Z._]+}} to i64 + +// CHECK-LABEL: define available_externally i64 @_pdep_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub nsw i32 64, %[[CAST]] +// CHECK: sext i32 %[[SUB]] to i64 +// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST2:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL2]] to i32 +// CHECK: sext i32 %[[CAST2]] to i64 +// CHECK: %[[SUB2:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB2]] +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]] +// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]] +// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %[[AND]] +// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1 + +// CHECK-LABEL: define 
available_externally i64 @_pext_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i1 @llvm.is.constant.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: br i1 %[[CALL]], label %[[TRUECOND:[0-9a-zA-Z._]+]], label %[[FALSECOND:[0-9a-zA-Z._]+]] +// CHECK: [[TRUECOND]]: +// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, 8 +// CHECK: or i64 %[[SHL]], %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]] +// CHECK: add nsw i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: call i64 @llvm.ppc.bpermd +// CHECK: [[FALSECOND]]: +// CHECK: call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]] +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i64 %[[AND]], %[[SUB]] +// CHECK: %[[SHR3:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR3]] +// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1 + +// CHECK-LABEL: define available_externally zeroext i32 @_pdep_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pdep_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]]) +// CHECK: trunc i64 %[[CALL]] to i32 + +// CHECK-LABEL: define available_externally zeroext i32 @_pext_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pext_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]]) +// CHECK: trunc i64 %[[CALL]] to i32 Index: clang/test/CodeGen/PowerPC/ppc-smmintrin.c =================================================================== --- clang/test/CodeGen/PowerPC/ppc-smmintrin.c +++ clang/test/CodeGen/PowerPC/ppc-smmintrin.c @@ -1,4 +1,3 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: powerpc-registered-target // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ @@ -13,6 +12,8 @@ #include <smmintrin.h> +__m128 mn1, mn2; +__m128d md1, md2; __m128i mi, m1, m2; void __attribute__((noinline)) @@ -25,102 +26,50 @@ // CHECK-LABEL: @test_extract -// CHECK: define available_externally signext i32 @_mm_extract_epi8(<2 x i64> noundef [[REG1:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG2:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG1]], <2 x i64>* [[REG3:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG2]], i32* [[REG4:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG5:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG3]], align 16 -// CHECK-NEXT: [[REG6:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG5]] to <16 x i8> -// CHECK-NEXT: 
[[REG7:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG4]], align 4 -// CHECK-NEXT: [[REG8:[0-9a-zA-Z_%.]+]] = and i32 [[REG7]], 15 -// CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = extractelement <16 x i8> [[REG6]], i32 [[REG8]] -// CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = zext i8 [[REG9]] to i32 -// CHECK-NEXT: ret i32 [[REG10]] - -// CHECK: define available_externally signext i32 @_mm_extract_epi32(<2 x i64> noundef [[REG11:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG12:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG11]], <2 x i64>* [[REG13:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG12]], i32* [[REG14:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG13]], align 16 -// CHECK-NEXT: [[REG16:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG15]] to <4 x i32> -// CHECK-NEXT: [[REG17:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG14]], align 4 -// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = and i32 [[REG17]], 3 -// CHECK-NEXT: [[REG19:[0-9a-zA-Z_%.]+]] = extractelement <4 x i32> [[REG16]], i32 [[REG18]] -// CHECK-NEXT: ret i32 [[REG19]] - -// CHECK: define available_externally signext i32 @_mm_extract_epi64(<2 x i64> noundef [[REG20:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG21:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG20]], <2 x i64>* [[REG22:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG21]], i32* [[REG23:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG24:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG22]], align 16 -// CHECK-NEXT: [[REG25:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG23]], align 4 -// CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = and i32 [[REG25]], 1 -// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG24]], i32 [[REG26]] -// CHECK-NEXT: [[REG28:[0-9a-zA-Z_%.]+]] = trunc i64 [[REG27]] to i32 -// CHECK-NEXT: ret i32 [[REG28]] - -// CHECK: define available_externally signext i32 @_mm_extract_ps(<4 x float> noundef [[REG29:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG30:[0-9a-zA-Z_%.]+]]) -// CHECK: store <4 x float> [[REG29]], <4 x float>* [[REG31:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG30]], i32* [[REG32:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG33:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG31]], align 16 -// CHECK-NEXT: [[REG34:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG33]] to <4 x i32> -// CHECK-NEXT: [[REG35:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG32]], align 4 -// CHECK-NEXT: [[REG36:[0-9a-zA-Z_%.]+]] = and i32 [[REG35]], 3 -// CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = extractelement <4 x i32> [[REG34]], i32 [[REG36]] -// CHECK-NEXT: ret i32 [[REG37]] +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 15 +// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <16 x i8> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] +// CHECK: zext i8 %[[EXT]] to i32 + +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 +// CHECK: extractelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] + +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 +// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = 
extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] +// CHECK: trunc i64 %[[EXT]] to i32 + +// CHECK-LABEL: define available_externally signext i32 @_mm_extract_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 +// CHECK: extractelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] void __attribute__((noinline)) test_blend() { _mm_blend_epi16(m1, m2, 0); _mm_blendv_epi8(m1, m2, mi); + _mm_blend_ps(mn1, mn2, 0); + _mm_blendv_ps(mn1, mn2, mn1); + _mm_blend_pd(md1, md2, 0); + _mm_blendv_pd(md1, md2, md1); } // CHECK-LABEL: @test_blend -// CHECK: define available_externally <2 x i64> @_mm_blend_epi16(<2 x i64> noundef [[REG38:[0-9a-zA-Z_%.]+]], <2 x i64> noundef [[REG39:[0-9a-zA-Z_%.]+]], i32 noundef signext [[REG40:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG38]], <2 x i64>* [[REG41:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store <2 x i64> [[REG39]], <2 x i64>* [[REG42:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store i32 [[REG40]], i32* [[REG43:[0-9a-zA-Z_%.]+]], align 4 -// CHECK-NEXT: [[REG44:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG43]], align 4 -// CHECK-NEXT: [[REG45:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG44]] to i8 -// CHECK-NEXT: [[REG46:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(signed char)(i8 noundef signext [[REG45]]) -// CHECK-NEXT: store <16 x i8> [[REG46]], <16 x i8>* [[REG47:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: [[REG48:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG47]], align 16 -// CHECK-NEXT: [[REG49:[0-9a-zA-Z_%.]+]] = call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> [[REG48]]) -// CHECK-NEXT: store <16 x i8> [[REG49]], <16 x i8>* [[REG47]], align 16 -// CHECK-NEXT: [[REG50:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG47]], align 16 -// CHECK-NEXT: [[REG51:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16])(<16 x i8> noundef [[REG50]]) -// CHECK-NEXT: store <8 x i16> [[REG51]], <8 x i16>* [[REG52:[0-9a-zA-Z_%.]+]], align 16 - -// BE: [[REG53:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG52]], align 16 -// BE-NEXT: [[REG54:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_reve(unsigned short vector[8])(<8 x i16> [[REG53]]) -// BE-NEXT: store <8 x i16> [[REG54]], <8 x i16>* [[REG52]], align 16 - -// CHECK: [[REG55:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG41]], align 16 -// CHECK-NEXT: [[REG56:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG55]] to <8 x i16> -// CHECK-NEXT: [[REG57:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG42]], align 16 -// CHECK-NEXT: [[REG58:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG57]] to <8 x i16> -// CHECK-NEXT: [[REG59:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG52]], align 16 -// CHECK-NEXT: [[REG60:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef [[REG56]], <8 x i16> noundef [[REG58]], <8 x i16> noundef [[REG59]]) -// CHECK-NEXT: [[REG61:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG60]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[REG61]] - -// CHECK: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef [[REG62:[0-9a-zA-Z_%.]+]], <2 x i64> noundef [[REG63:[0-9a-zA-Z_%.]+]], <2 x i64> noundef [[REG64:[0-9a-zA-Z_%.]+]]) -// CHECK: store <2 x i64> [[REG62]], <2 x i64>* [[REG65:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store <2 x i64> [[REG63]], <2 x i64>* [[REG66:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: store <2 x i64> [[REG64]], <2 x i64>* [[REG67:[0-9a-zA-Z_%.]+]], 
align 16 -// CHECK-NEXT: [[REG68:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7) -// CHECK-NEXT: store <16 x i8> [[REG68]], <16 x i8>* [[REG69:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: [[REG70:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG67]], align 16 -// CHECK-NEXT: [[REG71:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG70]] to <16 x i8> -// CHECK-NEXT: [[REG72:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG69]], align 16 -// CHECK-NEXT: [[REG73:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef [[REG71]], <16 x i8> noundef [[REG72]]) -// CHECK-NEXT: store <16 x i8> [[REG73]], <16 x i8>* [[REG74:[0-9a-zA-Z_%.]+]], align 16 -// CHECK-NEXT: [[REG75:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG65]], align 16 -// CHECK-NEXT: [[REG76:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG75]] to <16 x i8> -// CHECK-NEXT: [[REG77:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG66]], align 16 -// CHECK-NEXT: [[REG78:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG77]] to <16 x i8> -// CHECK-NEXT: [[REG79:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG74]], align 16 -// CHECK-NEXT: [[REG80:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef [[REG76]], <16 x i8> noundef [[REG78]], <16 x i8> noundef [[REG79]]) -// CHECK-NEXT: [[REG81:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG80]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[REG81]] +// CHECK-LABEL: define available_externally <2 x i64> @_mm_blend_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[TRUNC:[0-9a-zA-Z_.]+]] = trunc i32 %{{[0-9a-zA-Z_.]+}} to i8 +// CHECK: call <16 x i8> @vec_splats(signed char)(i8 noundef signext %[[TRUNC]]) +// CHECK: call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[PACK:[0-9a-zA-Z_.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: store <8 x i16> %[[PACK]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16 +// BE: %[[REVE:[0-9a-zA-Z_.]+]] = call <8 x i16> @vec_reve(unsigned short vector[8]) +// BE: store <8 x i16> %[[REVE]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16 +// CHECK: call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7) +// CHECK: call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], unsigned char vector[16]) void __attribute__((noinline)) test_insert() { @@ -131,25 +80,251 @@ // CHECK-LABEL: @test_insert -// CHECK: define available_externally <2 x i64> @_mm_insert_epi8(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}) -// CHECK: %{{[0-9a-zA-Z_.]+}} = bitcast <2 x i64> %{{[0-9a-zA-Z_.]+}} to <16 x i8> -// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = trunc i32 %{{[0-9a-zA-Z_.]+}} to i8 -// CHECK: %[[R1:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 15 -// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <16 x i8> %{{[0-9a-zA-Z_.]+}}, i8 %[[R0]], i32 %[[R1]] -// CHECK: %[[R2:[0-9a-zA-Z_.]+]] = bitcast <16 x i8> 
%{{[0-9a-zA-Z_.]+}} to <2 x i64> -// CHECK: ret <2 x i64> %[[R2]] - -// CHECK: define available_externally <2 x i64> @_mm_insert_epi32(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}) -// CHECK: %{{[0-9a-zA-Z_.]+}} = bitcast <2 x i64> %{{[0-9a-zA-Z_.]+}} to <4 x i32> -// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 -// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %[[R0]] -// CHECK: %[[R1:[0-9a-zA-Z_.]+]] = bitcast <4 x i32> %{{[0-9a-zA-Z_.]+}} to <2 x i64> -// CHECK: ret <2 x i64> %[[R1]] - -// CHECK: define available_externally <2 x i64> @_mm_insert_epi64(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i64 noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef signext {{[0-9a-zA-Z_%.]+}}) -// CHECK: %[[R0:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 -// CHECK: %{{[0-9a-zA-Z_.]+}} = insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[R0:[0-9a-zA-Z_.]+]] -// CHECK: ret <2 x i64> %{{[0-9a-zA-Z_.]+}} +// CHECK-LABEL: define available_externally <2 x i64> @_mm_insert_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[TRUNC:[0-9a-zA-Z_.]+]] = trunc i32 %{{[0-9a-zA-Z_.]+}} to i8 +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 15 +// CHECK: insertelement <16 x i8> %{{[0-9a-zA-Z_.]+}}, i8 %[[TRUNC]], i32 %[[AND]] + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_insert_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 3 +// CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND]] + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_insert_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i64 noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 +// CHECK: insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND:[0-9a-zA-Z_.]+]] + +void __attribute__((noinline)) +test_convert() { + _mm_cvtepi16_epi32(m1); + _mm_cvtepi16_epi64(m1); + _mm_cvtepi32_epi64(m1); + _mm_cvtepi8_epi16(m1); + _mm_cvtepi8_epi32(m1); + _mm_cvtepi8_epi64(m1); + _mm_cvtepu16_epi32(m1); + _mm_cvtepu16_epi64(m1); + _mm_cvtepu32_epi64(m1); + _mm_cvtepu8_epi16(m1); + _mm_cvtepu8_epi32(m1); + _mm_cvtepu8_epi64(m1); +} + +// CHECK-LABEL: @test_convert + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: call <4 x i32> 
@vec_unpackh(short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) +// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) +// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer) +// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8]) +// CHECK: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4]) + +void __attribute__((noinline)) 
+test_minmax() { + _mm_max_epi32(m1, m2); + _mm_max_epi8(m1, m2); + _mm_max_epu16(m1, m2); + _mm_max_epu32(m1, m2); + _mm_min_epi32(m1, m2); + _mm_min_epi8(m1, m2); + _mm_min_epu16(m1, m2); + _mm_min_epu32(m1, m2); + _mm_minpos_epu16(m1); +} + +// CHECK-LABEL: @test_minmax + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_max(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_max(signed char vector[16], signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_max(unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_max(unsigned int vector[4], unsigned int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_min(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_min(signed char vector[16], signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_min(unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_min(unsigned int vector[4], unsigned int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_minpos_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 %{{[0-9a-zA-Z_.]+}}, i8 0, i64 16, i1 false) +// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}} +// CHECK: %[[VEXT:[0-9a-zA-Z_.]+]] = extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: zext i16 %[[VEXT]] to i32 +// CHECK: zext i16 %{{[0-9a-zA-Z_.]+}} to i32 +// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: add i64 %{{[0-9a-zA-Z_.]+}}, 1 + +void __attribute__((noinline)) +test_round() { + _mm_round_ps(mn1, 0); + _mm_round_ss(mn1, mn2, 0); + _mm_round_pd(mn1, 0); + _mm_round_sd(mn1, mn2, 0); +} + +// CHECK-LABEL: @test_round + +// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0) +// 
CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" +// CHECK: call <4 x float> @vec_rint(float vector[4]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x float> @vec_floor(float vector[4]) +// CHECK: call <4 x float> @vec_ceil(float vector[4]) +// CHECK: call <4 x float> @vec_trunc(float vector[4]) +// CHECK: call <4 x float> @vec_rint(float vector[4]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0 + +// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" +// CHECK: call <2 x double> @vec_rint(double vector[2]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x double> @vec_floor(double vector[2]) +// CHECK: call <2 x double> @vec_ceil(double vector[2]) +// CHECK: call <2 x double> @vec_trunc(double vector[2]) +// CHECK: call <2 x double> @vec_rint(double vector[2]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0 +// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 1 + +void __attribute__((noinline)) +test_testing() { + _mm_testc_si128(m1, m2); + _mm_testnzc_si128(m1, m2); + _mm_testz_si128(m1, m2); +} + +// CHECK-LABEL: @test_testing + +// CHECK-LABEL: define available_externally signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_nor(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16]) +// 
CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call1, <16 x i8> noundef zeroinitializer) + +// CHECK-LABEL: define available_externally signext i32 @_mm_testnzc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: zext i1 %{{[0-9a-zA-Z_.]+}} to i32 + +// CHECK-LABEL: define available_externally signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call, <16 x i8> noundef zeroinitializer) + +void __attribute__((noinline)) +test_compare() { + _mm_cmpeq_epi64(m1, m2); + _mm_cmpgt_epi64(m1, m2); +} + +// CHECK-LABEL: @test_compare + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpeq_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_cmpeq(long long vector[2], long long vector[2]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpgt_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_cmpgt(long long vector[2], long long vector[2]) + +void __attribute__((noinline)) +test_mul() { + _mm_mul_epi32(m1, m2); + _mm_mullo_epi32(m1, m2); +} + +// CHECK-LABEL: @test_mul + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_mul_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %call = call <2 x i64> @vec_mule(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_mullo_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %call = call <4 x i32> @vec_mul(unsigned int vector[4], unsigned int vector[4]) + +void __attribute__((noinline)) +test_packus() { + _mm_packus_epi32(m1, m2); +} + +// CHECK-LABEL: @test_packus + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_packus_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_packsu(int vector[4], int vector[4])(<4 x i32> noundef %1, <4 x i32> noundef %3) // To test smmintrin.h includes tmmintrin.h @@ -160,4 +335,4 @@ // CHECK-LABEL: @test_abs_ssse3 -// CHECK: define available_externally <2 x i64> @_mm_abs_epi16(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}) +// CHECK-LABEL: define available_externally <2 x i64> @_mm_abs_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) Index: clang/lib/Headers/ppc_wrappers/xmmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/xmmintrin.h +++ clang/lib/Headers/ppc_wrappers/xmmintrin.h @@ -31,8 +31,8 @@ #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." #endif -#ifndef _XMMINTRIN_H_INCLUDED -#define _XMMINTRIN_H_INCLUDED +#ifndef XMMINTRIN_H_ +#define XMMINTRIN_H_ #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) @@ -62,13 +62,14 @@ /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. 
*/ -typedef vector float __m128 __attribute__((__may_alias__)); +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); /* Unaligned version of the same type. */ -typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); +typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, + __aligned__ (1))); /* Internal data types for implementing the intrinsics. */ -typedef vector float __v4sf; +typedef float __v4sf __attribute__ ((__vector_size__ (16))); /* Create an undefined vector. */ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -881,7 +882,7 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si32 (__m128 __A) { - __m64 res = 0; + int res; #ifdef _ARCH_PWR8 double dtmp; __asm__( @@ -914,8 +915,8 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si64 (__m128 __A) { - __m64 res = 0; -#ifdef _ARCH_PWR8 + long long res; +#if defined (_ARCH_PWR8) && defined (__powerpc64__) double dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ @@ -1328,6 +1329,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_ps (__m128 __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__vector unsigned int) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1347,6 +1351,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1553,6 +1558,7 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pi8 (__m64 __A) { +#ifdef __powerpc64__ unsigned long long p = #ifdef __LITTLE_ENDIAN__ 0x0008101820283038UL; // permute control for sign bits @@ -1560,6 +1566,18 @@ 0x3830282018100800UL; // permute control for sign bits #endif return __builtin_bpermd (p, __A); +#else +#ifdef __LITTLE_ENDIAN__ + unsigned int mask = 0x20283038UL; + unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; +#else + unsigned int mask = 0x38302820UL; + unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf; +#endif + return (r2 << 4) | r1; +#endif } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1841,4 +1859,4 @@ #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _XMMINTRIN_H_INCLUDED */ +#endif /* XMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/x86intrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/x86intrin.h @@ -0,0 +1,28 @@ +/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. 
+ Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#endif + +#ifndef X86INTRIN_H_ +#define X86INTRIN_H_ + +#ifdef __ALTIVEC__ +#include <immintrin.h> +#endif /* __ALTIVEC__ */ + +#endif /* X86INTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/x86gprintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/x86gprintrin.h @@ -0,0 +1,17 @@ +/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef X86GPRINTRIN_H_ +#define X86GPRINTRIN_H_ + +#include <bmiintrin.h> + +#include <bmi2intrin.h> + +#endif /* X86GPRINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/tmmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/tmmintrin.h +++ clang/lib/Headers/ppc_wrappers/tmmintrin.h @@ -339,6 +339,7 @@ return (__m64) ((__v2du) (__C))[0]; } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi8 (__m128i __A, __m128i __B) @@ -350,7 +351,9 @@ __v16qi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi16 (__m128i __A, __m128i __B) @@ -362,7 +365,9 @@ __v8hi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi32 (__m128i __A, __m128i __B) @@ -374,7 +379,9 @@ __v4si __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi8 (__m64 __A, __m64 __B) @@ -385,7 +392,9 @@ __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi16 (__m64 __A, __m64 __B) @@ -396,7 +405,9 @@ __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi32 (__m64 __A, __m64 __B) @@ -407,6 +418,7 @@ __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) Index: clang/lib/Headers/ppc_wrappers/smmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/smmintrin.h +++ clang/lib/Headers/ppc_wrappers/smmintrin.h @@ -34,77 +34,683 @@ #include <altivec.h> #include <tmmintrin.h> -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_extract_epi8(__m128i __X, const int __N) { - return (unsigned char)((__v16qi)__X)[__N & 15]; +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_ZERO 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_NEG_INF 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_NINT \ + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR \ + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL \ + (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC \ + (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __A, int __rounding) +{ + __v2df __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v2df) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v2df) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v2df) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v2df) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v2df) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. 
*/ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128d) __r; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi32(__m128i __X, const int __N) { +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd (__m128d __A, __m128d __B, int __rounding) +{ + __B = _mm_round_pd (__B, __rounding); + __v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] }; + return (__m128d) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __A, int __rounding) +{ + __v4sf __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v4sf) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v4sf) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v4sf) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v4sf) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v4sf) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. 
*/ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __A, __m128 __B, int __rounding) +{ + __B = _mm_round_ps (__B, __rounding); + __v4sf __r = (__v4sf) __A; + __r[0] = ((__v4sf) __B)[0]; + return (__m128) __r; +} + +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N) +{ + __v16qi result = (__v16qi)__A; + + result [__N & 0xf] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i const __A, int const __D, int const __N) +{ + __v4si result = (__v4si)__A; + + result [__N & 3] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) +{ + __v2di result = (__v2di)__A; + + result [__N & 1] = __D; + + return (__m128i) result; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi8 (__m128i __X, const int __N) +{ + return (unsigned char) ((__v16qi)__X)[__N & 15]; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi32 (__m128i __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi64(__m128i __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi64 (__m128i __X, const int __N) +{ return ((__v2di)__X)[__N & 1]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_ps(__m128 __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8) +{ + __v16qi __charmask = vec_splats ((signed char) __imm8); + __charmask = vec_gb (__charmask); + __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask); + #ifdef __BIG_ENDIAN__ + __shortmask = vec_reve (__shortmask); + #endif + return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask); +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask); +#else + const __v16qu __seven = vec_splats ((unsigned char) 0x07); + 
__v16qu __lmask = vec_sra ((__v16qu) __mask, __seven); + return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask); +#endif +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask); +#else + const __v4si __zero = {0}; + const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero); + return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask); +#endif +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128d) __r; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask); +#else + const __v2di __zero = {0}; + const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero); + return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask); +#endif +} +#endif + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. 
*/ + const __v16qu __zero = {0}; + const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A); + return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0; +} + +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +#define _mm_test_all_ones(V) \ + _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) + +#ifdef _ARCH_PWR8 extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) { - __v16qi __charmask = vec_splats((signed char)__imm8); - __charmask = vec_gb(__charmask); - __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask); -#ifdef __BIG_ENDIAN__ - __shortmask = vec_reve(__shortmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y); +} #endif - return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask); + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) { - const __v16qu __seven = vec_splats((unsigned char)0x07); - __v16qu __lmask = vec_sra((__v16qu)__mask, __seven); - return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { - __v16qi result = (__v16qi)__A; - result[__N & 0xf] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { - __v4si result = (__v4si)__A; - result[__N & 3] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { - __v2di result = (__v2di)__A; - result[__N & 1] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_max_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi16 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v16qi) __A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi32 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi32 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi64 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v4si) __A); } +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi16 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi32 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi64 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. 
*/ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi32 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi64 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_epi64 (__m128i __A) +{ + const __v4su __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v4su) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __A) +{ + union __u + { + __m128i __m; + __v8hu __uh; + }; + union __u __u = { .__m = __A }, __r = { .__m = {0} }; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + for (unsigned long __i = 1; __i < 8; __i++) + { + if (__u.__uh[__i] < __rmin) + { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y); +} +#endif #else #include_next <smmintrin.h> #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _SMMINTRIN_H_ */ +#endif /* SMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/pmmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/pmmintrin.h +++ clang/lib/Headers/ppc_wrappers/pmmintrin.h @@ -111,17 +111,21 @@ vec_mergel ((__v2df) __X, (__v2df)__Y)); } +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movehdup_ps (__m128 __X) { return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_moveldup_ps (__m128 __X) { return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); } +#endif extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loaddup_pd (double const *__P) Index: clang/lib/Headers/ppc_wrappers/nmmintrin.h 
=================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/nmmintrin.h @@ -0,0 +1,26 @@ +/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#endif + +#ifndef NMMINTRIN_H_ +#define NMMINTRIN_H_ + +/* We just include SSE4.1 header file. */ +#include <smmintrin.h> + +#endif /* NMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/immintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/immintrin.h @@ -0,0 +1,27 @@ +/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef IMMINTRIN_H_ +#define IMMINTRIN_H_ + +#include <x86gprintrin.h> + +#include <mmintrin.h> + +#include <xmmintrin.h> + +#include <emmintrin.h> + +#include <pmmintrin.h> + +#include <tmmintrin.h> + +#include <smmintrin.h> + +#endif /* IMMINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/emmintrin.h =================================================================== --- clang/lib/Headers/ppc_wrappers/emmintrin.h +++ clang/lib/Headers/ppc_wrappers/emmintrin.h @@ -405,20 +405,10 @@ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_pd (__m128d __A, __m128d __B) { -#if _ARCH_PWR8 __v2du c, d; /* Compare against self will return false (0's) if NAN. */ c = (__v2du)vec_cmpeq (__A, __A); d = (__v2du)vec_cmpeq (__B, __B); -#else - __v2du a, b; - __v2du c, d; - const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; - a = (__v2du)vec_abs ((__v2df)__A); - b = (__v2du)vec_abs ((__v2df)__B); - c = (__v2du)vec_cmpgt (double_exp_mask, a); - d = (__v2du)vec_cmpgt (double_exp_mask, b); -#endif /* A != NAN and B != NAN. 
*/ return ((__m128d)vec_and(c, d)); } @@ -861,7 +851,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -896,7 +890,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4sf) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -925,7 +923,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -1205,6 +1207,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pd (__m128d __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v2du) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1224,6 +1229,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1434,6 +1440,7 @@ return ((__m64)a * (__m64)b); } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_epu32 (__m128i __A, __m128i __B) { @@ -1460,6 +1467,7 @@ return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); #endif } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16 (__m128i __A, int __B) @@ -1749,7 +1757,7 @@ lshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (lshift, shmax); result = vec_sl ((__v2du) __A, lshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1843,7 +1851,7 @@ rshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (rshift, shmax); result = vec_sr ((__v2du) __A, rshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1995,10 +2003,14 @@ #ifdef _ARCH_PWR8 /* Intrinsic functions that require PowerISA 2.07 minimum. */ -/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +/* Return a mask created from the most significant bit of each 8-bit + element in A. */ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_epi8 (__m128i __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v16qu) __A); +#else __vector unsigned long long result; static const __vector unsigned char perm_mask = { @@ -2015,6 +2027,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -2158,27 +2171,37 @@ _mm_sad_epu8 (__m128i __A, __m128i __B) { __v16qu a, b; - __v16qu vmin, vmax, vabsdiff; + __v16qu vabsdiff; __v4si vsum; const __v4su zero = { 0, 0, 0, 0 }; __v4si result; a = (__v16qu) __A; b = (__v16qu) __B; - vmin = vec_min (a, b); - vmax = vec_max (a, b); +#ifndef _ARCH_PWR9 + __v16qu vmin = vec_min (a, b); + __v16qu vmax = vec_max (a, b); vabsdiff = vec_sub (vmax, vmin); +#else + vabsdiff = vec_absd (a, b); +#endif /* Sum four groups of bytes into integers. */ vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); +#ifdef __LITTLE_ENDIAN__ + /* Sum across four integers with two integer results. 
*/ + asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero)); + /* Note: vec_sum2s could be used here, but on little-endian, vector + shifts are added that are not needed for this use-case. + A vector shift to correctly position the 32-bit integer results + (currently at [0] and [2]) to [1] and [3] would then need to be + swapped back again since the desired results are two 64-bit + integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */ +#else /* Sum across four integers with two integer results. */ result = vec_sum2s (vsum, (__vector signed int) zero); /* Rotate the sums into the correct position. */ -#ifdef __LITTLE_ENDIAN__ - result = vec_sld (result, result, 4); -#else result = vec_sld (result, result, 6); #endif - /* Rotate the sums into the correct position. */ return (__m128i) result; } Index: clang/lib/Headers/ppc_wrappers/bmiintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/bmiintrin.h @@ -0,0 +1,165 @@ +/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead." +#endif + +#ifndef BMIINTRIN_H_ +#define BMIINTRIN_H_ + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u16(unsigned short __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u32(unsigned int __X, unsigned int __Y) { + return (~__X & __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) { + return ((__X << (32 - (__L + __P))) >> (32 - __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u32(unsigned int __X, unsigned int __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y >> 8) & 0xFF; + return (_bextr_u32(__X, __P, __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u32(unsigned int __X) { + return (__X & -__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u32(unsigned int __X) { + return __blsi_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u32(unsigned int __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u32(unsigned int __X) { + return __blsmsk_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u32(unsigned int __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u32(unsigned int __X) { + return __blsr_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); 
+} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +/* use the 64-bit shift, rotate, and count leading zeros instructions + for long long. */ +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u64(unsigned long long __X, unsigned long long __Y) { + return (~__X & __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) { + return ((__X << (64 - (__L + __P))) >> (64 - __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u64(unsigned long long __X, unsigned long long __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y & 0xFF00) >> 8; + return (_bextr_u64(__X, __P, __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u64(unsigned long long __X) { + return __X & -__X; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u64(unsigned long long __X) { + return __blsi_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u64(unsigned long long __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u64(unsigned long long __X) { + return __blsmsk_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u64(unsigned long long __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u64(unsigned long long __X) { + return __blsr_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} +#endif /* __PPC64__ */ + +#endif /* BMIINTRIN_H_ */ Index: clang/lib/Headers/ppc_wrappers/bmi2intrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/ppc_wrappers/bmi2intrin.h @@ -0,0 +1,133 @@ +/*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." 
+#endif + +#ifndef BMI2INTRIN_H_ +#define BMI2INTRIN_H_ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u32(unsigned int __X, unsigned int __Y) { + return ((__X << (32 - __Y)) >> (32 - __Y)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { + unsigned long long __res = (unsigned long long)__X * __Y; + *__P = (unsigned int)(__res >> 32); + return (unsigned int)__res; +} + +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u64(unsigned long long __X, unsigned long long __Y) { + return ((__X << (64 - __Y)) >> (64 - __Y)); +} + +/* __int128 requires base 64-bit. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u64(unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) { + unsigned __int128 __res = (unsigned __int128)__X * __Y; + *__P = (unsigned long long)(__res >> 64); + return (unsigned long long)__res; +} + +#ifdef _ARCH_PWR7 +/* popcount and bpermd require power7 minimum. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u64(unsigned long long __X, unsigned long long __M) { + unsigned long result = 0x0UL; + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c, t; + unsigned long p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl(__M); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) { + c = __builtin_clzl(m); + t = __X << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u64(unsigned long long __X, unsigned long long __M) { + unsigned long p = 0x4040404040404040UL; // initial bit permute control + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c; + unsigned long result; + + /* if the mask is constant and selects 8 bits or less we can use + the Power8 Bit permute instruction. */ + if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) { + /* Also if the pext mask is constant, then the popcount is + constant, we can evaluate the following loop at compile + time and use a constant bit permute vector. */ + for (long i = 0; i < __builtin_popcountl(__M); i++) { + c = __builtin_clzl(m); + p = (p << 8) | c; + m ^= (mask >> c); + } + result = __builtin_bpermd(p, __X); + } else { + p = 64 - __builtin_popcountl(__M); + result = 0; + /* We could a use a for loop here, but that combined with + -funroll-loops can expand to a lot of code. The while + loop avoids unrolling and the compiler commons the xor + from clearing the mask bit with the (m != 0) test. The + result is a more compact loop setup and body. */ + while (m != 0) { + unsigned long t; + c = __builtin_clzl(m); + t = (__X & (mask >> c)) >> (p - c); + m ^= (mask >> c); + result |= (t); + p++; + } + } + return (result); +} + +/* these 32-bit implementations depend on 64-bit pdep/pext + which depend on _ARCH_PWR7. 
*/ +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u32(unsigned int __X, unsigned int __Y) { + return _pdep_u64(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u32(unsigned int __X, unsigned int __Y) { + return _pext_u64(__X, __Y); +} +#endif /* _ARCH_PWR7 */ +#endif /* __PPC64__ */ + +#endif /* BMI2INTRIN_H_ */ Index: clang/lib/Headers/CMakeLists.txt =================================================================== --- clang/lib/Headers/CMakeLists.txt +++ clang/lib/Headers/CMakeLists.txt @@ -161,6 +161,12 @@ ppc_wrappers/pmmintrin.h ppc_wrappers/tmmintrin.h ppc_wrappers/smmintrin.h + ppc_wrappers/bmiintrin.h + ppc_wrappers/bmi2intrin.h + ppc_wrappers/immintrin.h + ppc_wrappers/tmmintrin.h + ppc_wrappers/x86intrin.h + ppc_wrappers/x86gprintrin.h ) set(openmp_wrapper_files
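For readers porting code with these wrappers, here is a minimal usage sketch of the new x86gprintrin.h path; it is illustrative only and not part of the patch. The sample values, the hosted build (stdio), and the file name are assumptions; the NO_WARN_X86_INTRINSICS define and the -mcpu=pwr7 baseline follow the RUN lines of the test above.

#define NO_WARN_X86_INTRINSICS
#include <x86gprintrin.h>
#include <stdio.h>

int main(void) {
  unsigned int x = 0x00f0u;

  /* _bextr_u32(x, start, len): extract the 4-bit field at bit 4 -> 0xf. */
  printf("bextr: %u\n", _bextr_u32(x, 4, 4));

  /* __blsi_u32 isolates the lowest set bit: 0x00f0 -> 0x10. */
  printf("blsi:  %#x\n", __blsi_u32(x));

  /* _pext_u64 gathers the bits selected by the mask into the low-order
     bits; it is only declared for 64-bit targets built for at least
     power7, per the guards in bmi2intrin.h. */
  printf("pext:  %#llx\n", _pext_u64(0xffffull, 0x00f0ull));
  return 0;
}

Built along the lines of the RUN commands above, e.g. clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 demo.c (demo.c is a hypothetical file name), this should print 15, 0x10 and 0xf.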
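Likewise, a small sketch of the smmintrin.h rounding, floor and test helpers added above; again illustrative only, with assumed sample values and an assumed -mcpu=power8 baseline (the individual guards in the header govern the actual minimum CPU for each intrinsic).

#define NO_WARN_X86_INTRINSICS
#include <emmintrin.h>
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_pd(2.75, -1.25); /* element 0 is -1.25, element 1 is 2.75 */
  double out[2];

  /* _MM_FROUND_TO_NEAREST_INT temporarily forces round-to-nearest-even
     via the FPSCR, so -1.25 -> -1.0 and 2.75 -> 3.0. */
  _mm_storeu_pd(out, _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT));
  printf("round: %g %g\n", out[0], out[1]);

  /* _mm_floor_pd is a macro over _mm_round_pd(..., _MM_FROUND_FLOOR):
     -1.25 -> -2.0 and 2.75 -> 2.0. */
  _mm_storeu_pd(out, _mm_floor_pd(v));
  printf("floor: %g %g\n", out[0], out[1]);

  /* _mm_testz_si128 returns 1 when (A & B) is all zeros; the two
     constants below have disjoint bits. */
  __m128i a = _mm_set1_epi32(0x0f);
  __m128i b = _mm_set1_epi32(0xf0);
  printf("testz: %d\n", _mm_testz_si128(a, b));
  return 0;
}

As the comments in the patch note, the testz/testc/testnzc emulations only return the comparison result and do not set condition flags, so code that relied on the x86 flag side effects needs review when ported.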