[clang] [CIR][AArch64] Upstream narrowing-addition NEON builtins (PR #204989)

Vicky Nguyen via cfe-commits Sun, 21 Jun 2026 09:10:32 -0700

https://github.com/iamvickynguyen created 
https://github.com/llvm/llvm-project/pull/204989


Related to https://github.com/llvm/llvm-project/issues/185382

CIR lowering for
- narrowing-addition intrinsics 
(https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#narrowing-addition)

Port tests:
- `clang/test/CodeGen/AArch64/neon-intrinsics.c` to 
`clang/test/CodeGen/AArch64/neon/add.c`

>From 2810377ce88d23d61491c97694b9d7223cf90aac Mon Sep 17 00:00:00 2001
From: Vicky Nguyen <[email protected]>
Date: Fri, 19 Jun 2026 09:50:15 -0700
Subject: [PATCH] [CIR][AArch64] Upstream narrowing-addition NEON builtins

---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  46 +-
 clang/test/CodeGen/AArch64/neon-intrinsics.c  | 724 ------------------
 clang/test/CodeGen/AArch64/neon/add.c         | 560 ++++++++++++++
 3 files changed, 605 insertions(+), 725 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 8b077620d2bab..365e114e3762e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -685,7 +685,31 @@ static mlir::Value emitCommonNeonBuiltinExpr(
     mlir::Value result = cgf.getBuilder().createXor(loc, ops[0], ops[1]);
     return cgf.getBuilder().createBitcast(result, ty);
   }
-  case NEON::BI__builtin_neon_vaddhn_v:
+  case NEON::BI__builtin_neon_vaddhn_v: {
+    // Build the extended (double-width) source vector type. Use an unsigned
+    // element type so the high-half extraction lowers to a logical shift.
+    cir::VectorType srcTy =
+        cgf.getBuilder().getExtendedOrTruncatedElementVectorType(
+            vTy, /*isExtended=*/true, /*isSigned=*/false);
+    unsigned narrowWidth =
+        mlir::cast<cir::IntType>(vTy.getElementType()).getWidth();
+
+    // %sum = add <n x iW> %lhs, %rhs
+    ops[0] = cgf.getBuilder().createBitcast(ops[0], srcTy);
+    ops[1] = cgf.getBuilder().createBitcast(ops[1], srcTy);
+    mlir::Value result = cgf.getBuilder().createAdd(loc, ops[0], ops[1]);
+
+    // %high = lshr <n x iW> %sum, splat(iW W/2)
+    mlir::Value shiftAmt = cgf.getBuilder().getConstantInt(
+        loc, srcTy.getElementType(), narrowWidth);
+    mlir::Value shiftVec =
+        emitNeonShiftVector(cgf.getBuilder(), shiftAmt, srcTy, loc,
+                            /*neg=*/false);
+    result = cgf.getBuilder().createShiftRight(loc, result, shiftVec);
+
+    // %res = trunc <n x iW> %high to <n x iW/2>
+    return cgf.getBuilder().createIntCast(result, vTy);
+  }
   case NEON::BI__builtin_neon_vcale_v:
   case NEON::BI__builtin_neon_vcaleq_v:
   case NEON::BI__builtin_neon_vcalt_v:
@@ -1060,6 +1084,10 @@ static mlir::Value emitCommonNeonBuiltinExpr(
                      std::string("unimplemented AArch64 builtin call: ") +
                          cgf.getContext().BuiltinInfo.getName(builtinID));
     break;
+  case NEON::BI__builtin_neon_vhadd_v:
+  case NEON::BI__builtin_neon_vhaddq_v:
+  case NEON::BI__builtin_neon_vrhadd_v:
+  case NEON::BI__builtin_neon_vrhaddq_v:
   case NEON::BI__builtin_neon_vshl_v:
   case NEON::BI__builtin_neon_vshlq_v: {
     llvm::StringRef llvmIntrName =
@@ -1073,6 +1101,22 @@ static mlir::Value emitCommonNeonBuiltinExpr(
     mlir::Type resultType = cgf.convertType(expr->getType());
     return cgf.getBuilder().createBitcast(result, resultType);
   }
+  case NEON::BI__builtin_neon_vraddhn_v: {
+    // The raddhn intrinsic operates on the double-width source operands and
+    // returns the narrowed result vector.
+    cir::VectorType srcTy =
+        cgf.getBuilder().getExtendedOrTruncatedElementVectorType(
+            vTy, /*isExtended=*/true, /*isSigned=*/true);
+
+    llvm::StringRef llvmIntrName = getLLVMIntrNameNoPrefix(
+        static_cast<llvm::Intrinsic::ID>(llvmIntrinsic));
+    mlir::Value result =
+        emitNeonCall(cgf.getCIRGenModule(), cgf.getBuilder(),
+                     /*argTypes=*/{srcTy, srcTy}, ops, llvmIntrName,
+                     /*funcResTy=*/vTy, loc);
+    mlir::Type resultType = cgf.convertType(expr->getType());
+    return cgf.getBuilder().createBitcast(result, resultType);
+  }
   }
 
   // NYI
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index b37ed5aa29f10..78b4495e9f8cc 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -2404,174 +2404,6 @@ uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t 
v2) {
   return vcltq_f64(v1, v2);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_s8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VHADD_V_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.shadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
-// CHECK-NEXT:    ret <8 x i8> [[VHADD_V_I]]
-//
-int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
-  return vhadd_s8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_s16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VHADD_V2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> 
[[VHADD_V1_I]])
-// CHECK-NEXT:    [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 
x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
-//
-int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
-  return vhadd_s16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_s32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VHADD_V2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> 
[[VHADD_V1_I]])
-// CHECK-NEXT:    [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 
x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
-//
-int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
-  return vhadd_s32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_u8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VHADD_V_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
-// CHECK-NEXT:    ret <8 x i8> [[VHADD_V_I]]
-//
-uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-  return vhadd_u8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_u16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VHADD_V2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> 
[[VHADD_V1_I]])
-// CHECK-NEXT:    [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 
x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
-//
-uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-  return vhadd_u16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_u32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VHADD_V2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> 
[[VHADD_V1_I]])
-// CHECK-NEXT:    [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 
x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
-//
-uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-  return vhadd_u32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_s8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VHADDQ_V_I:%.*]] = call <16 x i8> 
@llvm.aarch64.neon.shadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
-// CHECK-NEXT:    ret <16 x i8> [[VHADDQ_V_I]]
-//
-int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
-  return vhaddq_s8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_s16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VHADDQ_V2_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> 
[[VHADDQ_V1_I]])
-// CHECK-NEXT:    [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
-//
-int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
-  return vhaddq_s16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_s32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT:    [[VHADDQ_V2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> 
[[VHADDQ_V1_I]])
-// CHECK-NEXT:    [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
-//
-int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
-  return vhaddq_s32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_u8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VHADDQ_V_I:%.*]] = call <16 x i8> 
@llvm.aarch64.neon.uhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
-// CHECK-NEXT:    ret <16 x i8> [[VHADDQ_V_I]]
-//
-uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-  return vhaddq_u8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_u16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VHADDQ_V2_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> 
[[VHADDQ_V1_I]])
-// CHECK-NEXT:    [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
-//
-uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-  return vhaddq_u16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_u32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT:    [[VHADDQ_V2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> 
[[VHADDQ_V1_I]])
-// CHECK-NEXT:    [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
-//
-uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-  return vhaddq_u32(v1, v2);
-}
-
 // CHECK-LABEL: define dso_local <8 x i8> @test_vhsub_s8(
 // CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -2740,174 +2572,6 @@ uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t 
v2) {
   return vhsubq_u32(v1, v2);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_s8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VRHADD_V_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.srhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
-// CHECK-NEXT:    ret <8 x i8> [[VRHADD_V_I]]
-//
-int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
-  return vrhadd_s8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vrhadd_s16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VRHADD_V2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> 
[[VRHADD_V1_I]])
-// CHECK-NEXT:    [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
-//
-int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
-  return vrhadd_s16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_s32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VRHADD_V2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> 
[[VRHADD_V1_I]])
-// CHECK-NEXT:    [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
-//
-int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
-  return vrhadd_s32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_u8(
-// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VRHADD_V_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.urhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
-// CHECK-NEXT:    ret <8 x i8> [[VRHADD_V_I]]
-//
-uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-  return vrhadd_u8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vrhadd_u16(
-// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VRHADD_V2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> 
[[VRHADD_V1_I]])
-// CHECK-NEXT:    [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
-//
-uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-  return vrhadd_u16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_u32(
-// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
-// CHECK-NEXT:    [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VRHADD_V2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> 
[[VRHADD_V1_I]])
-// CHECK-NEXT:    [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
-//
-uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-  return vrhadd_u32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_s8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VRHADDQ_V_I:%.*]] = call <16 x i8> 
@llvm.aarch64.neon.srhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
-// CHECK-NEXT:    ret <16 x i8> [[VRHADDQ_V_I]]
-//
-int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
-  return vrhaddq_s8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_s16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRHADDQ_V2_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> 
[[VRHADDQ_V1_I]])
-// CHECK-NEXT:    [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x 
i16>
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
-//
-int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
-  return vrhaddq_s16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_s32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRHADDQ_V2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> 
[[VRHADDQ_V1_I]])
-// CHECK-NEXT:    [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x 
i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
-//
-int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
-  return vrhaddq_s32(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_u8(
-// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VRHADDQ_V_I:%.*]] = call <16 x i8> 
@llvm.aarch64.neon.urhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
-// CHECK-NEXT:    ret <16 x i8> [[VRHADDQ_V_I]]
-//
-uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-  return vrhaddq_u8(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_u16(
-// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRHADDQ_V2_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> 
[[VRHADDQ_V1_I]])
-// CHECK-NEXT:    [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x 
i16>
-// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
-//
-uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-  return vrhaddq_u16(v1, v2);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_u32(
-// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
-// CHECK-NEXT:    [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRHADDQ_V2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> 
[[VRHADDQ_V1_I]])
-// CHECK-NEXT:    [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to 
<16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x 
i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
-//
-uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-  return vrhaddq_u32(v1, v2);
-}
-
 // CHECK-LABEL: define dso_local <8 x i8> @test_vqadd_s8(
 // CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) 
#[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -5399,394 +5063,6 @@ uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t 
b) {
   return vsubw_high_u32(a, b);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_s16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 
8)
-// CHECK-NEXT:    [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK-NEXT:    ret <8 x i8> [[VADDHN2_I]]
-//
-int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
-  return vaddhn_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_s32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT:    [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 
16)
-// CHECK-NEXT:    [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x 
i16>
-// CHECK-NEXT:    ret <4 x i16> [[VADDHN2_I]]
-//
-int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
-  return vaddhn_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_s64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT:    [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 
32)
-// CHECK-NEXT:    [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x 
i32>
-// CHECK-NEXT:    ret <2 x i32> [[VADDHN2_I]]
-//
-int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
-  return vaddhn_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_u16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 
8)
-// CHECK-NEXT:    [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK-NEXT:    ret <8 x i8> [[VADDHN2_I]]
-//
-uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
-  return vaddhn_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_u32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT:    [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 
16)
-// CHECK-NEXT:    [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x 
i16>
-// CHECK-NEXT:    ret <4 x i16> [[VADDHN2_I]]
-//
-uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
-  return vaddhn_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_u64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT:    [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 
32)
-// CHECK-NEXT:    [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x 
i32>
-// CHECK-NEXT:    ret <2 x i32> [[VADDHN2_I]]
-//
-uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
-  return vaddhn_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_s16(
-// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x 
i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat 
(i16 8)
-// CHECK-NEXT:    [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 
x i8>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> 
[[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE_I_I]]
-//
-int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  return vaddhn_high_s16(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_s32(
-// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x 
i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT:    [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat 
(i32 16)
-// CHECK-NEXT:    [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 
x i16>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x 
i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7>
-// CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I_I]]
-//
-int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  return vaddhn_high_s32(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vaddhn_high_s64(
-// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x 
i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT:    [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat 
(i64 32)
-// CHECK-NEXT:    [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 
x i32>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x 
i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    ret <4 x i32> [[SHUFFLE_I_I]]
-//
-int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  return vaddhn_high_s64(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_u16(
-// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x 
i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT:    [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat 
(i16 8)
-// CHECK-NEXT:    [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 
x i8>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> 
[[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE_I_I]]
-//
-uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  return vaddhn_high_u16(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_u32(
-// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x 
i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT:    [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat 
(i32 16)
-// CHECK-NEXT:    [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 
x i16>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x 
i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7>
-// CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I_I]]
-//
-uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  return vaddhn_high_u32(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vaddhn_high_u64(
-// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x 
i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT:    [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
-// CHECK-NEXT:    [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat 
(i64 32)
-// CHECK-NEXT:    [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 
x i32>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x 
i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    ret <4 x i32> [[SHUFFLE_I_I]]
-//
-uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  return vaddhn_high_u64(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_s16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRADDHN_V2_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> 
[[VRADDHN_V1_I]])
-// CHECK-NEXT:    ret <8 x i8> [[VRADDHN_V2_I]]
-//
-int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
-  return vraddhn_s16(a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_s32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRADDHN_V2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> 
[[VRADDHN_V1_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
-//
-int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
-  return vraddhn_s32(a, b);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_s64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x 
i64>
-// CHECK-NEXT:    [[VRADDHN_V2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> 
[[VRADDHN_V1_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
-//
-int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
-  return vraddhn_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_u16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRADDHN_V2_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> 
[[VRADDHN_V1_I]])
-// CHECK-NEXT:    ret <8 x i8> [[VRADDHN_V2_I]]
-//
-uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
-  return vraddhn_u16(a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_u32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRADDHN_V2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> 
[[VRADDHN_V1_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
-// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
-//
-uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
-  return vraddhn_u32(a, b);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_u64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x 
i64>
-// CHECK-NEXT:    [[VRADDHN_V2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> 
[[VRADDHN_V1_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to 
<8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
-// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
-//
-uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
-  return vraddhn_u64(a, b);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_s16(
-// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x 
i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> 
[[VRADDHN_V1_I_I]])
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> 
[[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE_I_I]]
-//
-int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  return vraddhn_high_s16(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_s32(
-// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x 
i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> 
[[VRADDHN_V1_I_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> 
[[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x 
i16>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x 
i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7>
-// CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I_I]]
-//
-int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  return vraddhn_high_s32(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_s64(
-// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x 
i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x 
i64>
-// CHECK-NEXT:    [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x 
i64>
-// CHECK-NEXT:    [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> 
[[VRADDHN_V1_I_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> 
[[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x 
i32>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x 
i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    ret <4 x i32> [[SHUFFLE_I_I]]
-//
-int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  return vraddhn_high_s64(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_u16(
-// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x 
i16> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x 
i16>
-// CHECK-NEXT:    [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> 
[[VRADDHN_V1_I_I]])
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> 
[[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE_I_I]]
-//
-uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  return vraddhn_high_u16(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_u32(
-// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x 
i32> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x 
i32>
-// CHECK-NEXT:    [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> 
[[VRADDHN_V1_I_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> 
[[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x 
i16>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x 
i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7>
-// CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I_I]]
-//
-uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  return vraddhn_high_u32(r, a, b);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_u64(
-// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x 
i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x 
i64>
-// CHECK-NEXT:    [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x 
i64>
-// CHECK-NEXT:    [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> 
[[VRADDHN_V1_I_I]])
-// CHECK-NEXT:    [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> 
[[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x 
i32>
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x 
i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    ret <4 x i32> [[SHUFFLE_I_I]]
-//
-uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  return vraddhn_high_u64(r, a, b);
-}
-
 // CHECK-LABEL: define dso_local <8 x i8> @test_vsubhn_s16(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/add.c b/clang/test/CodeGen/AArch64/neon/add.c
index b1dc5c253e7d6..ce35451dd0fa7 100644
--- a/clang/test/CodeGen/AArch64/neon/add.c
+++ b/clang/test/CodeGen/AArch64/neon/add.c
@@ -720,3 +720,563 @@ uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
   // LLVM-NEXT:    ret <2 x i64> [[ADD_I]]
   return vaddw_high_u32(a, b);
 }
+
+//===----------------------------------------------------------------------===//
+// 2.x Halving add
+// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#halving-add
+//===----------------------------------------------------------------------===//
+
+// LLVM-LABEL: @test_vhadd_s8(
+// CIR-LABEL: @vhadd_s8(
+int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.shadd"
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> {{.*}}, <8 x i8> {{.*}})
+  // LLVM: ret <8 x i8>
+  return vhadd_s8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhadd_s16(
+// CIR-LABEL: @vhadd_s16(
+int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.shadd"
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> {{.*}}, <4 x i16> {{.*}})
+  // LLVM: ret <4 x i16>
+  return vhadd_s16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhadd_s32(
+// CIR-LABEL: @vhadd_s32(
+int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.shadd"
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> {{.*}}, <2 x i32> {{.*}})
+  // LLVM: ret <2 x i32>
+  return vhadd_s32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhadd_u8(
+// CIR-LABEL: @vhadd_u8(
+uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uhadd"
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> {{.*}}, <8 x i8> {{.*}})
+  // LLVM: ret <8 x i8>
+  return vhadd_u8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhadd_u16(
+// CIR-LABEL: @vhadd_u16(
+uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uhadd"
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> {{.*}}, <4 x i16> {{.*}})
+  // LLVM: ret <4 x i16>
+  return vhadd_u16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhadd_u32(
+// CIR-LABEL: @vhadd_u32(
+uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uhadd"
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> {{.*}}, <2 x i32> {{.*}})
+  // LLVM: ret <2 x i32>
+  return vhadd_u32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhaddq_s8(
+// CIR-LABEL: @vhaddq_s8(
+int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.shadd"
+
+  // LLVM: call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> {{.*}}, <16 x i8> {{.*}})
+  // LLVM: ret <16 x i8>
+  return vhaddq_s8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhaddq_s16(
+// CIR-LABEL: @vhaddq_s16(
+int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.shadd"
+
+  // LLVM: call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: ret <8 x i16>
+  return vhaddq_s16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhaddq_s32(
+// CIR-LABEL: @vhaddq_s32(
+int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.shadd"
+
+  // LLVM: call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: ret <4 x i32>
+  return vhaddq_s32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhaddq_u8(
+// CIR-LABEL: @vhaddq_u8(
+uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uhadd"
+
+  // LLVM: call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> {{.*}}, <16 x i8> {{.*}})
+  // LLVM: ret <16 x i8>
+  return vhaddq_u8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhaddq_u16(
+// CIR-LABEL: @vhaddq_u16(
+uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uhadd"
+
+  // LLVM: call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: ret <8 x i16>
+  return vhaddq_u16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vhaddq_u32(
+// CIR-LABEL: @vhaddq_u32(
+uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uhadd"
+
+  // LLVM: call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: ret <4 x i32>
+  return vhaddq_u32(v1, v2);
+}
+
+//===----------------------------------------------------------------------===//
+// 2.x Rounding halving add
+// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#rounding-halving-add
+//===----------------------------------------------------------------------===//
+
+// LLVM-LABEL: @test_vrhadd_s8(
+// CIR-LABEL: @vrhadd_s8(
+int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.srhadd"
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> {{.*}}, <8 x i8> {{.*}})
+  // LLVM: ret <8 x i8>
+  return vrhadd_s8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhadd_s16(
+// CIR-LABEL: @vrhadd_s16(
+int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.srhadd"
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> {{.*}}, <4 x i16> {{.*}})
+  // LLVM: ret <4 x i16>
+  return vrhadd_s16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhadd_s32(
+// CIR-LABEL: @vrhadd_s32(
+int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.srhadd"
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> {{.*}}, <2 x i32> {{.*}})
+  // LLVM: ret <2 x i32>
+  return vrhadd_s32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhadd_u8(
+// CIR-LABEL: @vrhadd_u8(
+uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.urhadd"
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> {{.*}}, <8 x i8> {{.*}})
+  // LLVM: ret <8 x i8>
+  return vrhadd_u8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhadd_u16(
+// CIR-LABEL: @vrhadd_u16(
+uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.urhadd"
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> {{.*}}, <4 x i16> {{.*}})
+  // LLVM: ret <4 x i16>
+  return vrhadd_u16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhadd_u32(
+// CIR-LABEL: @vrhadd_u32(
+uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.urhadd"
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> {{.*}}, <2 x i32> {{.*}})
+  // LLVM: ret <2 x i32>
+  return vrhadd_u32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhaddq_s8(
+// CIR-LABEL: @vrhaddq_s8(
+int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.srhadd"
+
+  // LLVM: call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> {{.*}}, <16 x i8> {{.*}})
+  // LLVM: ret <16 x i8>
+  return vrhaddq_s8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhaddq_s16(
+// CIR-LABEL: @vrhaddq_s16(
+int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.srhadd"
+
+  // LLVM: call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: ret <8 x i16>
+  return vrhaddq_s16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhaddq_s32(
+// CIR-LABEL: @vrhaddq_s32(
+int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.srhadd"
+
+  // LLVM: call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: ret <4 x i32>
+  return vrhaddq_s32(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhaddq_u8(
+// CIR-LABEL: @vrhaddq_u8(
+uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.urhadd"
+
+  // LLVM: call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> {{.*}}, <16 x i8> {{.*}})
+  // LLVM: ret <16 x i8>
+  return vrhaddq_u8(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhaddq_u16(
+// CIR-LABEL: @vrhaddq_u16(
+uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.urhadd"
+
+  // LLVM: call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: ret <8 x i16>
+  return vrhaddq_u16(v1, v2);
+}
+
+// LLVM-LABEL: @test_vrhaddq_u32(
+// CIR-LABEL: @vrhaddq_u32(
+uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.urhadd"
+
+  // LLVM: call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: ret <4 x i32>
+  return vrhaddq_u32(v1, v2);
+}
+
+//===----------------------------------------------------------------------===//
+// 2.x Add narrowing high half
+// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#add-narrowing-high-half
+//===----------------------------------------------------------------------===//
+
+// LLVM-LABEL: @test_vaddhn_s16(
+// CIR-LABEL: @vaddhn_s16(
+int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
+  // CIR: cir.add
+  // CIR: cir.shift(right
+  // CIR: cir.cast integral
+
+  // LLVM: [[ADD:%.*]] = add <8 x i16> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <8 x i16> [[ADD]], splat (i16 8)
+  // LLVM: [[TR:%.*]] = trunc <8 x i16> [[SH]] to <8 x i8>
+  // LLVM: ret <8 x i8> [[TR]]
+  return vaddhn_s16(a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_s32(
+// CIR-LABEL: @vaddhn_s32(
+int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
+  // CIR: cir.add
+  // CIR: cir.shift(right
+  // CIR: cir.cast integral
+
+  // LLVM: [[ADD:%.*]] = add <4 x i32> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <4 x i32> [[ADD]], splat (i32 16)
+  // LLVM: [[TR:%.*]] = trunc <4 x i32> [[SH]] to <4 x i16>
+  // LLVM: ret <4 x i16> [[TR]]
+  return vaddhn_s32(a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_s64(
+// CIR-LABEL: @vaddhn_s64(
+int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
+  // CIR: cir.add
+  // CIR: cir.shift(right
+  // CIR: cir.cast integral
+
+  // LLVM: [[ADD:%.*]] = add <2 x i64> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <2 x i64> [[ADD]], splat (i64 32)
+  // LLVM: [[TR:%.*]] = trunc <2 x i64> [[SH]] to <2 x i32>
+  // LLVM: ret <2 x i32> [[TR]]
+  return vaddhn_s64(a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_u16(
+// CIR-LABEL: @vaddhn_u16(
+uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
+  // CIR: cir.add
+  // CIR: cir.shift(right
+  // CIR: cir.cast integral
+
+  // LLVM: [[ADD:%.*]] = add <8 x i16> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <8 x i16> [[ADD]], splat (i16 8)
+  // LLVM: [[TR:%.*]] = trunc <8 x i16> [[SH]] to <8 x i8>
+  // LLVM: ret <8 x i8> [[TR]]
+  return vaddhn_u16(a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_u32(
+// CIR-LABEL: @vaddhn_u32(
+uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
+  // CIR: cir.add
+  // CIR: cir.shift(right
+  // CIR: cir.cast integral
+
+  // LLVM: [[ADD:%.*]] = add <4 x i32> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <4 x i32> [[ADD]], splat (i32 16)
+  // LLVM: [[TR:%.*]] = trunc <4 x i32> [[SH]] to <4 x i16>
+  // LLVM: ret <4 x i16> [[TR]]
+  return vaddhn_u32(a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_u64(
+// CIR-LABEL: @vaddhn_u64(
+uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
+  // CIR: cir.add
+  // CIR: cir.shift(right
+  // CIR: cir.cast integral
+
+  // LLVM: [[ADD:%.*]] = add <2 x i64> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <2 x i64> [[ADD]], splat (i64 32)
+  // LLVM: [[TR:%.*]] = trunc <2 x i64> [[SH]] to <2 x i32>
+  // LLVM: ret <2 x i32> [[TR]]
+  return vaddhn_u64(a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_high_s16(
+// CIR-LABEL: @vaddhn_high_s16(
+int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
+  // CIR: cir.call @vaddhn_s16(
+  // CIR: cir.call @vcombine_s8(
+
+  // LLVM: [[ADD:%.*]] = add <8 x i16> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <8 x i16> [[ADD]], splat (i16 8)
+  // LLVM: [[TR:%.*]] = trunc <8 x i16> [[SH]] to <8 x i8>
+  // LLVM: [[RES:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i8> [[TR]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM: ret <16 x i8> [[RES]]
+  return vaddhn_high_s16(r, a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_high_s32(
+// CIR-LABEL: @vaddhn_high_s32(
+int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
+  // CIR: cir.call @vaddhn_s32(
+  // CIR: cir.call @vcombine_s16(
+
+  // LLVM: [[ADD:%.*]] = add <4 x i32> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <4 x i32> [[ADD]], splat (i32 16)
+  // LLVM: [[TR:%.*]] = trunc <4 x i32> [[SH]] to <4 x i16>
+  // LLVM: [[RES:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i16> [[TR]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: ret <8 x i16> [[RES]]
+  return vaddhn_high_s32(r, a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_high_s64(
+// CIR-LABEL: @vaddhn_high_s64(
+int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
+  // CIR: cir.call @vaddhn_s64(
+  // CIR: cir.call @vcombine_s32(
+
+  // LLVM: [[ADD:%.*]] = add <2 x i64> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <2 x i64> [[ADD]], splat (i64 32)
+  // LLVM: [[TR:%.*]] = trunc <2 x i64> [[SH]] to <2 x i32>
+  // LLVM: [[RES:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> [[TR]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: ret <4 x i32> [[RES]]
+  return vaddhn_high_s64(r, a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_high_u16(
+// CIR-LABEL: @vaddhn_high_u16(
+uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
+  // CIR: cir.call @vaddhn_u16(
+  // CIR: cir.call @vcombine_u8(
+
+  // LLVM: [[ADD:%.*]] = add <8 x i16> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <8 x i16> [[ADD]], splat (i16 8)
+  // LLVM: [[TR:%.*]] = trunc <8 x i16> [[SH]] to <8 x i8>
+  // LLVM: [[RES:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i8> [[TR]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM: ret <16 x i8> [[RES]]
+  return vaddhn_high_u16(r, a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_high_u32(
+// CIR-LABEL: @vaddhn_high_u32(
+uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
+  // CIR: cir.call @vaddhn_u32(
+  // CIR: cir.call @vcombine_u16(
+
+  // LLVM: [[ADD:%.*]] = add <4 x i32> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <4 x i32> [[ADD]], splat (i32 16)
+  // LLVM: [[TR:%.*]] = trunc <4 x i32> [[SH]] to <4 x i16>
+  // LLVM: [[RES:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i16> [[TR]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: ret <8 x i16> [[RES]]
+  return vaddhn_high_u32(r, a, b);
+}
+
+// LLVM-LABEL: @test_vaddhn_high_u64(
+// CIR-LABEL: @vaddhn_high_u64(
+uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
+  // CIR: cir.call @vaddhn_u64(
+  // CIR: cir.call @vcombine_u32(
+
+  // LLVM: [[ADD:%.*]] = add <2 x i64> {{.*}}, {{.*}}
+  // LLVM: [[SH:%.*]] = lshr <2 x i64> [[ADD]], splat (i64 32)
+  // LLVM: [[TR:%.*]] = trunc <2 x i64> [[SH]] to <2 x i32>
+  // LLVM: [[RES:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> [[TR]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: ret <4 x i32> [[RES]]
+  return vaddhn_high_u64(r, a, b);
+}
+
+//===----------------------------------------------------------------------===//
+// 2.x Rounding add narrowing high half
+// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#rounding-add-narrowing-high-half
+//===----------------------------------------------------------------------===//
+
+// LLVM-LABEL: @test_vraddhn_s16(
+// CIR-LABEL: @vraddhn_s16(
+int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.raddhn"
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: ret <8 x i8>
+  return vraddhn_s16(a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_s32(
+// CIR-LABEL: @vraddhn_s32(
+int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.raddhn"
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: ret <4 x i16>
+  return vraddhn_s32(a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_s64(
+// CIR-LABEL: @vraddhn_s64(
+int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.raddhn"
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> {{.*}}, <2 x i64> {{.*}})
+  // LLVM: ret <2 x i32>
+  return vraddhn_s64(a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_u16(
+// CIR-LABEL: @vraddhn_u16(
+uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.raddhn"
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: ret <8 x i8>
+  return vraddhn_u16(a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_u32(
+// CIR-LABEL: @vraddhn_u32(
+uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.raddhn"
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: ret <4 x i16>
+  return vraddhn_u32(a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_u64(
+// CIR-LABEL: @vraddhn_u64(
+uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.raddhn"
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> {{.*}}, <2 x i64> {{.*}})
+  // LLVM: ret <2 x i32>
+  return vraddhn_u64(a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_high_s16(
+// CIR-LABEL: @vraddhn_high_s16(
+int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
+  // CIR: cir.call @vraddhn_s16(
+  // CIR: cir.call @vcombine_s8(
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: [[RES:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i8> {{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM: ret <16 x i8> [[RES]]
+  return vraddhn_high_s16(r, a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_high_s32(
+// CIR-LABEL: @vraddhn_high_s32(
+int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
+  // CIR: cir.call @vraddhn_s32(
+  // CIR: cir.call @vcombine_s16(
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: [[RES:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i16> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: ret <8 x i16> [[RES]]
+  return vraddhn_high_s32(r, a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_high_s64(
+// CIR-LABEL: @vraddhn_high_s64(
+int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
+  // CIR: cir.call @vraddhn_s64(
+  // CIR: cir.call @vcombine_s32(
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> {{.*}}, <2 x i64> {{.*}})
+  // LLVM: [[RES:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: ret <4 x i32> [[RES]]
+  return vraddhn_high_s64(r, a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_high_u16(
+// CIR-LABEL: @vraddhn_high_u16(
+uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
+  // CIR: cir.call @vraddhn_u16(
+  // CIR: cir.call @vcombine_u8(
+
+  // LLVM: call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> {{.*}}, <8 x i16> {{.*}})
+  // LLVM: [[RES:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i8> {{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM: ret <16 x i8> [[RES]]
+  return vraddhn_high_u16(r, a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_high_u32(
+// CIR-LABEL: @vraddhn_high_u32(
+uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
+  // CIR: cir.call @vraddhn_u32(
+  // CIR: cir.call @vcombine_u16(
+
+  // LLVM: call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> {{.*}}, <4 x i32> {{.*}})
+  // LLVM: [[RES:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i16> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: ret <8 x i16> [[RES]]
+  return vraddhn_high_u32(r, a, b);
+}
+
+// LLVM-LABEL: @test_vraddhn_high_u64(
+// CIR-LABEL: @vraddhn_high_u64(
+uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
+  // CIR: cir.call @vraddhn_u64(
+  // CIR: cir.call @vcombine_u32(
+
+  // LLVM: call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> {{.*}}, <2 x i64> {{.*}})
+  // LLVM: [[RES:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM: ret <4 x i32> [[RES]]
+  return vraddhn_high_u64(r, a, b);
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR][AArch64] Upstream narrowing-addition NEON builtins (PR #204989)

Reply via email to