https://github.com/paulwalker-arm created 
https://github.com/llvm/llvm-project/pull/170894

Adds llvm.aarch64.sve.shsub.u and llvm.aarch64.sve.uhsub.u, "undef" variants of the SVE2 halving-subtract intrinsics whose inactive lanes are don't-care, mirroring the u-forms that already exist for the saturating subtracts (aarch64_sve_sqsub_u and friends).

Clang's svhsub_x and svhsubr_x builtins now lower to the new intrinsics, with the reversed builtins reusing the same u-form with swapped operands (via ReverseMergeAnyBinOp), so no separate shsubr.u/uhsubr.u intrinsic is needed. On the backend, SHSUB_ZPmZ/UHSUB_ZPmZ gain ZPZZ pseudos marked DestructiveBinaryCommWithRev and paired with the SHSUBR/UHSUBR encodings, and AArch64TargetTransformInfo simplifies the predicated intrinsics to the u-forms where the inactive lanes are known not to matter.
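For example, for svhsub_s8_x and svhsubr_s8_x (a minimal sketch mirroring the updated CHECK lines in the tests below; %pg, %op1 and %op2 are placeholder values):

  ; svhsub_s8_x: inactive lanes are don't-care, so the u-form is emitted directly.
  %r0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %op1, <vscale x 16 x i8> %op2)

  ; svhsubr_s8_x: no shsubr.u exists; the same u-form is used with the operands swapped.
  %r1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %op2, <vscale x 16 x i8> %op1)

Because both builtins funnel into a single u-form with a known reversed encoding, instruction selection can pick whichever of SHSUB or SHSUBR keeps the tied operand in the destination register and avoid a MOVPRFX.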

From 08a70da9f9ddc1f83eeb062bdb6299c011cb7ffb Mon Sep 17 00:00:00 2001
From: Paul Walker <[email protected]>
Date: Tue, 2 Dec 2025 12:33:17 +0000
Subject: [PATCH] [LLVM][AArch64] Add "u" variants of sve.[s,u]hsub intrinsics

---
 clang/include/clang/Basic/arm_sve.td          |   8 +-
 .../AArch64/sve2-intrinsics/acle_sve2_hsub.c  |  64 ++--
 .../AArch64/sve2-intrinsics/acle_sve2_hsubr.c |  64 ++--
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |   2 +
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  11 +-
 llvm/lib/Target/AArch64/AArch64SchedA510.td   |   2 +-
 .../Target/AArch64/AArch64SchedNeoverseN2.td  |   2 +-
 .../Target/AArch64/AArch64SchedNeoverseN3.td  |   2 +-
 .../Target/AArch64/AArch64SchedNeoverseV2.td  |   2 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   8 +
 .../sve2-intrinsics-uniform-dsp-undef.ll      | 296 ++++++++++++++++++
 .../sve-intrinsic-comb-no-active-lanes.ll     |  12 +-
 .../sve-intrinsic-simplify-to-u-form.ll       |  44 +++
 13 files changed, 433 insertions(+), 84 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 41179207dd060..1b1350f1b2ec1 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1280,10 +1280,10 @@ defm SVQSUB_S  : SInstZPZZ<"svqsub",  "csli",     "aarch64_sve_sqsub",  "aarch64
 defm SVQSUB_U  : SInstZPZZ<"svqsub",  "UcUsUiUl", "aarch64_sve_uqsub",  "aarch64_sve_uqsub_u">;
 defm SVQSUBR_S : SInstZPZZ<"svqsubr", "csli",     "aarch64_sve_sqsubr", "aarch64_sve_sqsub_u", [ReverseMergeAnyBinOp]>;
 defm SVQSUBR_U : SInstZPZZ<"svqsubr", "UcUsUiUl", "aarch64_sve_uqsubr", "aarch64_sve_uqsub_u", [ReverseMergeAnyBinOp]>;
-defm SVHSUB_S  : SInstZPZZ<"svhsub",  "csli",     "aarch64_sve_shsub",  "aarch64_sve_shsub">;
-defm SVHSUB_U  : SInstZPZZ<"svhsub",  "UcUsUiUl", "aarch64_sve_uhsub",  "aarch64_sve_uhsub">;
-defm SVHSUBR_S : SInstZPZZ<"svhsubr", "csli",     "aarch64_sve_shsubr", "aarch64_sve_shsubr">;
-defm SVHSUBR_U : SInstZPZZ<"svhsubr", "UcUsUiUl", "aarch64_sve_uhsubr", "aarch64_sve_uhsubr">;
+defm SVHSUB_S  : SInstZPZZ<"svhsub",  "csli",     "aarch64_sve_shsub",  "aarch64_sve_shsub_u">;
+defm SVHSUB_U  : SInstZPZZ<"svhsub",  "UcUsUiUl", "aarch64_sve_uhsub",  "aarch64_sve_uhsub_u">;
+defm SVHSUBR_S : SInstZPZZ<"svhsubr", "csli",     "aarch64_sve_shsubr", "aarch64_sve_shsub_u", [ReverseMergeAnyBinOp]>;
+defm SVHSUBR_U : SInstZPZZ<"svhsubr", "UcUsUiUl", "aarch64_sve_uhsubr", "aarch64_sve_uhsub_u", [ReverseMergeAnyBinOp]>;
 
 defm SVQABS   : SInstZPZ<"svqabs",   "csil", "aarch64_sve_sqabs">;
 defm SVQNEG   : SInstZPZ<"svqneg",   "csil", "aarch64_sve_sqneg">;
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsub.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsub.c
index 22c11466a7288..e101dd2fe3399 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsub.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsub.c
@@ -297,12 +297,12 @@ svuint64_t test_svhsub_u64_m(svbool_t pg, svuint64_t op1, 
svuint64_t op2)
 
 // CHECK-LABEL: @test_svhsub_s8_x(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svhsub_s8_xu10__SVBool_tu10__SVInt8_tS0_(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svint8_t test_svhsub_s8_x(svbool_t pg, svint8_t op1, svint8_t op2)
@@ -313,13 +313,13 @@ svint8_t test_svhsub_s8_x(svbool_t pg, svint8_t op1, 
svint8_t op2)
 // CHECK-LABEL: @test_svhsub_s16_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsub_s16_xu10__SVBool_tu11__SVInt16_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svint16_t test_svhsub_s16_x(svbool_t pg, svint16_t op1, svint16_t op2)
@@ -330,13 +330,13 @@ svint16_t test_svhsub_s16_x(svbool_t pg, svint16_t op1, 
svint16_t op2)
 // CHECK-LABEL: @test_svhsub_s32_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsub_s32_xu10__SVBool_tu11__SVInt32_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svint32_t test_svhsub_s32_x(svbool_t pg, svint32_t op1, svint32_t op2)
@@ -347,13 +347,13 @@ svint32_t test_svhsub_s32_x(svbool_t pg, svint32_t op1, 
svint32_t op2)
 // CHECK-LABEL: @test_svhsub_s64_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsub_s64_xu10__SVBool_tu11__SVInt64_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svint64_t test_svhsub_s64_x(svbool_t pg, svint64_t op1, svint64_t op2)
@@ -363,12 +363,12 @@ svint64_t test_svhsub_s64_x(svbool_t pg, svint64_t op1, 
svint64_t op2)
 
 // CHECK-LABEL: @test_svhsub_u8_x(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svhsub_u8_xu10__SVBool_tu11__SVUint8_tS0_(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svuint8_t test_svhsub_u8_x(svbool_t pg, svuint8_t op1, svuint8_t op2)
@@ -379,13 +379,13 @@ svuint8_t test_svhsub_u8_x(svbool_t pg, svuint8_t op1, 
svuint8_t op2)
 // CHECK-LABEL: @test_svhsub_u16_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsub_u16_xu10__SVBool_tu12__SVUint16_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svuint16_t test_svhsub_u16_x(svbool_t pg, svuint16_t op1, svuint16_t op2)
@@ -396,13 +396,13 @@ svuint16_t test_svhsub_u16_x(svbool_t pg, svuint16_t op1, 
svuint16_t op2)
 // CHECK-LABEL: @test_svhsub_u32_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsub_u32_xu10__SVBool_tu12__SVUint32_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svuint32_t test_svhsub_u32_x(svbool_t pg, svuint32_t op1, svuint32_t op2)
@@ -413,13 +413,13 @@ svuint32_t test_svhsub_u32_x(svbool_t pg, svuint32_t op1, 
svuint32_t op2)
 // CHECK-LABEL: @test_svhsub_u64_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsub_u64_xu10__SVBool_tu12__SVUint64_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svuint64_t test_svhsub_u64_x(svbool_t pg, svuint64_t op1, svuint64_t op2)
@@ -775,14 +775,14 @@ svuint64_t test_svhsub_n_u64_m(svbool_t pg, svuint64_t 
op1, uint64_t op2)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> 
poison, i8 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsub_n_s8_xu10__SVBool_tu10__SVInt8_ta(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x 
i8> poison, i8 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svint8_t test_svhsub_n_s8_x(svbool_t pg, svint8_t op1, int8_t op2)
@@ -795,7 +795,7 @@ svint8_t test_svhsub_n_s8_x(svbool_t pg, svint8_t op1, 
int8_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> 
poison, i16 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsub_n_s16_xu10__SVBool_tu11__SVInt16_ts(
@@ -803,7 +803,7 @@ svint8_t test_svhsub_n_s8_x(svbool_t pg, svint8_t op1, 
int8_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x 
i16> poison, i16 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svint16_t test_svhsub_n_s16_x(svbool_t pg, svint16_t op1, int16_t op2)
@@ -816,7 +816,7 @@ svint16_t test_svhsub_n_s16_x(svbool_t pg, svint16_t op1, 
int16_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> 
poison, i32 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsub_n_s32_xu10__SVBool_tu11__SVInt32_ti(
@@ -824,7 +824,7 @@ svint16_t test_svhsub_n_s16_x(svbool_t pg, svint16_t op1, 
int16_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x 
i32> poison, i32 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svint32_t test_svhsub_n_s32_x(svbool_t pg, svint32_t op1, int32_t op2)
@@ -837,7 +837,7 @@ svint32_t test_svhsub_n_s32_x(svbool_t pg, svint32_t op1, 
int32_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> 
poison, i64 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> 
[[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsub_n_s64_xu10__SVBool_tu11__SVInt64_tl(
@@ -845,7 +845,7 @@ svint32_t test_svhsub_n_s32_x(svbool_t pg, svint32_t op1, 
int32_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x 
i64> poison, i64 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> 
[[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svint64_t test_svhsub_n_s64_x(svbool_t pg, svint64_t op1, int64_t op2)
@@ -857,14 +857,14 @@ svint64_t test_svhsub_n_s64_x(svbool_t pg, svint64_t op1, 
int64_t op2)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> 
poison, i8 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsub_n_u8_xu10__SVBool_tu11__SVUint8_th(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x 
i8> poison, i8 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svuint8_t test_svhsub_n_u8_x(svbool_t pg, svuint8_t op1, uint8_t op2)
@@ -877,7 +877,7 @@ svuint8_t test_svhsub_n_u8_x(svbool_t pg, svuint8_t op1, 
uint8_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> 
poison, i16 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsub_n_u16_xu10__SVBool_tu12__SVUint16_tt(
@@ -885,7 +885,7 @@ svuint8_t test_svhsub_n_u8_x(svbool_t pg, svuint8_t op1, 
uint8_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x 
i16> poison, i16 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svuint16_t test_svhsub_n_u16_x(svbool_t pg, svuint16_t op1, uint16_t op2)
@@ -898,7 +898,7 @@ svuint16_t test_svhsub_n_u16_x(svbool_t pg, svuint16_t op1, 
uint16_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> 
poison, i32 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsub_n_u32_xu10__SVBool_tu12__SVUint32_tj(
@@ -906,7 +906,7 @@ svuint16_t test_svhsub_n_u16_x(svbool_t pg, svuint16_t op1, 
uint16_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x 
i32> poison, i32 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svuint32_t test_svhsub_n_u32_x(svbool_t pg, svuint32_t op1, uint32_t op2)
@@ -919,7 +919,7 @@ svuint32_t test_svhsub_n_u32_x(svbool_t pg, svuint32_t op1, 
uint32_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> 
poison, i64 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> 
[[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsub_n_u64_xu10__SVBool_tu12__SVUint64_tm(
@@ -927,7 +927,7 @@ svuint32_t test_svhsub_n_u32_x(svbool_t pg, svuint32_t op1, 
uint32_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x 
i64> poison, i64 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> 
[[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svuint64_t test_svhsub_n_u64_x(svbool_t pg, svuint64_t op1, uint64_t op2)
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsubr.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsubr.c
index 236ee3b5320e1..0f4fd0d84aff4 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsubr.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_hsubr.c
@@ -297,12 +297,12 @@ svuint64_t test_svhsubr_u64_m(svbool_t pg, svuint64_t 
op1, svuint64_t op2)
 
 // CHECK-LABEL: @test_svhsubr_s8_x(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsubr_s8_xu10__SVBool_tu10__SVInt8_tS0_(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svint8_t test_svhsubr_s8_x(svbool_t pg, svint8_t op1, svint8_t op2)
@@ -313,13 +313,13 @@ svint8_t test_svhsubr_s8_x(svbool_t pg, svint8_t op1, 
svint8_t op2)
 // CHECK-LABEL: @test_svhsubr_s16_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsubr_s16_xu10__SVBool_tu11__SVInt16_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svint16_t test_svhsubr_s16_x(svbool_t pg, svint16_t op1, svint16_t op2)
@@ -330,13 +330,13 @@ svint16_t test_svhsubr_s16_x(svbool_t pg, svint16_t op1, 
svint16_t op2)
 // CHECK-LABEL: @test_svhsubr_s32_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsubr_s32_xu10__SVBool_tu11__SVInt32_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svint32_t test_svhsubr_s32_x(svbool_t pg, svint32_t op1, svint32_t op2)
@@ -347,13 +347,13 @@ svint32_t test_svhsubr_s32_x(svbool_t pg, svint32_t op1, 
svint32_t op2)
 // CHECK-LABEL: @test_svhsubr_s64_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsubr_s64_xu10__SVBool_tu11__SVInt64_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svint64_t test_svhsubr_s64_x(svbool_t pg, svint64_t op1, svint64_t op2)
@@ -363,12 +363,12 @@ svint64_t test_svhsubr_s64_x(svbool_t pg, svint64_t op1, 
svint64_t op2)
 
 // CHECK-LABEL: @test_svhsubr_u8_x(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z17test_svhsubr_u8_xu10__SVBool_tu11__SVUint8_tS0_(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svuint8_t test_svhsubr_u8_x(svbool_t pg, svuint8_t op1, svuint8_t op2)
@@ -379,13 +379,13 @@ svuint8_t test_svhsubr_u8_x(svbool_t pg, svuint8_t op1, 
svuint8_t op2)
 // CHECK-LABEL: @test_svhsubr_u16_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsubr_u16_xu10__SVBool_tu12__SVUint16_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svuint16_t test_svhsubr_u16_x(svbool_t pg, svuint16_t op1, svuint16_t op2)
@@ -396,13 +396,13 @@ svuint16_t test_svhsubr_u16_x(svbool_t pg, svuint16_t 
op1, svuint16_t op2)
 // CHECK-LABEL: @test_svhsubr_u32_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsubr_u32_xu10__SVBool_tu12__SVUint32_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svuint32_t test_svhsubr_u32_x(svbool_t pg, svuint32_t op1, svuint32_t op2)
@@ -413,13 +413,13 @@ svuint32_t test_svhsubr_u32_x(svbool_t pg, svuint32_t 
op1, svuint32_t op2)
 // CHECK-LABEL: @test_svhsubr_u64_x(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svhsubr_u64_xu10__SVBool_tu12__SVUint64_tS0_(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svuint64_t test_svhsubr_u64_x(svbool_t pg, svuint64_t op1, svuint64_t op2)
@@ -775,14 +775,14 @@ svuint64_t test_svhsubr_n_u64_m(svbool_t pg, svuint64_t 
op1, uint64_t op2)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> 
poison, i8 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[DOTSPLAT]], <vscale x 16 x i8> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsubr_n_s8_xu10__SVBool_tu10__SVInt8_ta(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x 
i8> poison, i8 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[DOTSPLAT]], <vscale x 16 x i8> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svint8_t test_svhsubr_n_s8_x(svbool_t pg, svint8_t op1, int8_t op2)
@@ -795,7 +795,7 @@ svint8_t test_svhsubr_n_s8_x(svbool_t pg, svint8_t op1, 
int8_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> 
poison, i16 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[DOTSPLAT]], <vscale x 8 x i16> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svhsubr_n_s16_xu10__SVBool_tu11__SVInt16_ts(
@@ -803,7 +803,7 @@ svint8_t test_svhsubr_n_s8_x(svbool_t pg, svint8_t op1, 
int8_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x 
i16> poison, i16 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[DOTSPLAT]], <vscale x 8 x i16> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svint16_t test_svhsubr_n_s16_x(svbool_t pg, svint16_t op1, int16_t op2)
@@ -816,7 +816,7 @@ svint16_t test_svhsubr_n_s16_x(svbool_t pg, svint16_t op1, 
int16_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> 
poison, i32 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[DOTSPLAT]], <vscale x 4 x i32> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svhsubr_n_s32_xu10__SVBool_tu11__SVInt32_ti(
@@ -824,7 +824,7 @@ svint16_t test_svhsubr_n_s16_x(svbool_t pg, svint16_t op1, 
int16_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x 
i32> poison, i32 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[DOTSPLAT]], <vscale x 4 x i32> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svint32_t test_svhsubr_n_s32_x(svbool_t pg, svint32_t op1, int32_t op2)
@@ -837,7 +837,7 @@ svint32_t test_svhsubr_n_s32_x(svbool_t pg, svint32_t op1, 
int32_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> 
poison, i64 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> 
[[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[DOTSPLAT]], <vscale x 2 x i64> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svhsubr_n_s64_xu10__SVBool_tu11__SVInt64_tl(
@@ -845,7 +845,7 @@ svint32_t test_svhsubr_n_s32_x(svbool_t pg, svint32_t op1, 
int32_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x 
i64> poison, i64 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> 
[[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[DOTSPLAT]], <vscale x 2 x i64> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svint64_t test_svhsubr_n_s64_x(svbool_t pg, svint64_t op1, int64_t op2)
@@ -857,14 +857,14 @@ svint64_t test_svhsubr_n_s64_x(svbool_t pg, svint64_t 
op1, int64_t op2)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> 
poison, i8 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[DOTSPLAT]], <vscale x 16 x i8> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svhsubr_n_u8_xu10__SVBool_tu11__SVUint8_th(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x 
i8> poison, i8 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsubr.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[OP1:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x 
i8> [[DOTSPLAT]], <vscale x 16 x i8> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
 svuint8_t test_svhsubr_n_u8_x(svbool_t pg, svuint8_t op1, uint8_t op2)
@@ -877,7 +877,7 @@ svuint8_t test_svhsubr_n_u8_x(svbool_t pg, svuint8_t op1, 
uint8_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> 
poison, i16 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[DOTSPLAT]], <vscale x 8 x i16> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svhsubr_n_u16_xu10__SVBool_tu12__SVUint16_tt(
@@ -885,7 +885,7 @@ svuint8_t test_svhsubr_n_u8_x(svbool_t pg, svuint8_t op1, 
uint8_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x 
i16> poison, i16 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> 
[[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsubr.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> 
[[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x 
i16> [[DOTSPLAT]], <vscale x 8 x i16> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
 svuint16_t test_svhsubr_n_u16_x(svbool_t pg, svuint16_t op1, uint16_t op2)
@@ -898,7 +898,7 @@ svuint16_t test_svhsubr_n_u16_x(svbool_t pg, svuint16_t 
op1, uint16_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> 
poison, i32 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[DOTSPLAT]], <vscale x 4 x i32> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svhsubr_n_u32_xu10__SVBool_tu12__SVUint32_tj(
@@ -906,7 +906,7 @@ svuint16_t test_svhsubr_n_u16_x(svbool_t pg, svuint16_t 
op1, uint16_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x 
i32> poison, i32 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> 
[[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> 
zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> 
[[OP1:%.*]], <vscale x 4 x i32> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> 
@llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x 
i32> [[DOTSPLAT]], <vscale x 4 x i32> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
 svuint32_t test_svhsubr_n_u32_x(svbool_t pg, svuint32_t op1, uint32_t op2)
@@ -919,7 +919,7 @@ svuint32_t test_svhsubr_n_u32_x(svbool_t pg, svuint32_t 
op1, uint32_t op2)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> 
poison, i64 [[OP2:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> 
[[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> 
zeroinitializer
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> 
[[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> 
@llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x 
i64> [[DOTSPLAT]], <vscale x 2 x i64> [[OP1:%.*]])
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svhsubr_n_u64_xu10__SVBool_tu12__SVUint64_tm(
@@ -927,7 +927,7 @@ svuint32_t test_svhsubr_n_u32_x(svbool_t pg, svuint32_t op1, uint32_t op2)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OP2:%.*]], i64 0
 // CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uhsubr.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[OP1:%.*]], <vscale x 2 x i64> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[DOTSPLAT]], <vscale x 2 x i64> [[OP1:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
 svuint64_t test_svhsubr_n_u64_x(svbool_t pg, svuint64_t op1, uint64_t op2)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b8f87a0517efa..170eff4b7db90 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2435,6 +2435,7 @@ def int_aarch64_sve_stnt1_scatter_scalar_offset  : AdvSIMD_ScatterStore_VS_Intri
 def int_aarch64_sve_saba          : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_shadd         : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_shsub         : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_shsub_u       : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_shsubr        : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_sli           : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_sqabs         : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
@@ -2467,6 +2468,7 @@ def int_aarch64_sve_suqadd        : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpecul
 def int_aarch64_sve_uaba          : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_uhadd         : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_uhsub         : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uhsub_u       : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_uhsubr        : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_uqadd         : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_uqrshl        : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bbcffb0c44e85..32eede90e42b8 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3838,12 +3838,15 @@ let Predicates = [HasSVE2_or_SME] in {
   // SVE2 integer halving add/subtract (predicated)
   defm SHADD_ZPmZ  : sve2_int_arith_pred<0b100000, "shadd",  AArch64shadd>;
   defm UHADD_ZPmZ  : sve2_int_arith_pred<0b100010, "uhadd",  AArch64uhadd>;
-  defm SHSUB_ZPmZ  : sve2_int_arith_pred<0b100100, "shsub",  int_aarch64_sve_shsub>;
-  defm UHSUB_ZPmZ  : sve2_int_arith_pred<0b100110, "uhsub",  int_aarch64_sve_uhsub>;
+  defm SHSUB_ZPmZ  : sve2_int_arith_pred<0b100100, "shsub",  int_aarch64_sve_shsub, "SHSUB_ZPZZ", DestructiveBinaryCommWithRev, "SHSUBR_ZPmZ">;
+  defm UHSUB_ZPmZ  : sve2_int_arith_pred<0b100110, "uhsub",  int_aarch64_sve_uhsub, "UHSUB_ZPZZ", DestructiveBinaryCommWithRev, "UHSUBR_ZPmZ">;
   defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", AArch64srhadd>;
   defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", AArch64urhadd>;
-  defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
-  defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;
+  defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr, "SHSUBR_ZPZZ", DestructiveBinaryCommWithRev, "SHSUB_ZPmZ", /*isReverseInstr*/ 1>;
+  defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr, "UHSUBR_ZPZZ", DestructiveBinaryCommWithRev, "UHSUB_ZPmZ", /*isReverseInstr*/ 1>;
+
+  defm SHSUB_ZPZZ  : sve_int_bin_pred_bhsd<int_aarch64_sve_shsub_u>;
+  defm UHSUB_ZPZZ  : sve_int_bin_pred_bhsd<int_aarch64_sve_uhsub_u>;
 
   // SVE2 integer pairwise add and accumulate long
   defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index 66f49f040ad12..6acac222a81c2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -652,7 +652,7 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>],
                         "^(ADD|SUB|SUBR)_ZI_[BHSD]",
                         "^ADR_[SU]XTW_ZZZ_D_[0123]",
                         "^ADR_LSL_ZZZ_[SD]_[0123]",
-                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>;
+                        "^[SU]H(ADD|SUB|SUBR)_(ZPmZ|ZPZZ)_[BHSD]")>;
 def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>],
              (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
                         "^SADDLBT_ZZZ_[HSD]",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index a02130f8390a7..dee6e50f960b1 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -1662,7 +1662,7 @@ def : InstRW<[N2Write_2c_1V],
                         "^ADR_LSL_ZZZ_[SD]_[0123]",
                         "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
                         "^SADDLBT_ZZZ_[HSD]",
-                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^[SU]H(ADD|SUB|SUBR)_(ZPmZ|ZPZZ)_[BHSD]",
                         "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
 
 // Arithmetic, complex
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index 22e6d1107a337..817ddcbdca6f2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -1752,7 +1752,7 @@ def : InstRW<[N3Write_2c_1V],
                         "^ADR_LSL_ZZZ_[SD]_[0123]",
                         "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
                         "^SADDLBT_ZZZ_[HSD]",
-                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^[SU]H(ADD|SUB|SUBR)_(ZPmZ|ZPZZ)_[BHSD]",
                         "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
 
 // Arithmetic, complex
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 2387f176f3051..94861253e22d3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -2125,7 +2125,7 @@ def : InstRW<[V2Write_2c_1V],
                         "^ADR_LSL_ZZZ_[SD]_[0123]",
                         "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
                         "^SADDLBT_ZZZ_[HSD]",
-                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^[SU]H(ADD|SUB|SUBR)_(ZPmZ|ZPZZ)_[BHSD]",
                         "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
 
 // Arithmetic, complex
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b2a9f9cb75910..1df0a38983261 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1446,6 +1446,10 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
   case Intrinsic::aarch64_sve_orr:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
         .setMatchingIROpcode(Instruction::Or);
+  case Intrinsic::aarch64_sve_shsub:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
+  case Intrinsic::aarch64_sve_shsubr:
+    return SVEIntrinsicInfo::defaultMergingOp();
   case Intrinsic::aarch64_sve_sqrshl:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
   case Intrinsic::aarch64_sve_sqshl:
@@ -1454,6 +1458,10 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
   case Intrinsic::aarch64_sve_srshl:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
+  case Intrinsic::aarch64_sve_uhsub:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
+  case Intrinsic::aarch64_sve_uhsubr:
+    return SVEIntrinsicInfo::defaultMergingOp();
   case Intrinsic::aarch64_sve_uqrshl:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
   case Intrinsic::aarch64_sve_uqshl:
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp-undef.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp-undef.ll
index a471625cd5ad8..5356bba94a30b 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp-undef.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp-undef.ll
@@ -1,6 +1,154 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
 
+;
+; SHSUB
+;
+
+define <vscale x 16 x i8> @shsub_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: shsub_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsub z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                   <vscale x 16 x i8> %a,
+                                                                   <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @shsub_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: shsub_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x i16> %a,
+                                                                   <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @shsub_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: shsub_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x i32> %a,
+                                                                   <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @shsub_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: shsub_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; SHSUB (swapped operands)
+;
+
+define <vscale x 16 x i8> @shsub_i8_swapped_operands(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: shsub_i8_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsubr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                   <vscale x 16 x i8> %b,
+                                                                   <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @shsub_i16_swapped_operands(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: shsub_i16_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsubr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x i16> %b,
+                                                                   <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @shsub_i32_swapped_operands(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: shsub_i32_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsubr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x i32> %b,
+                                                                   <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @shsub_i64_swapped_operands(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: shsub_i64_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shsubr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x i64> %b,
+                                                                   <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; SHSUB (movprfx)
+;
+
+define <vscale x 16 x i8> @shsub_i8_movprfx(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: shsub_i8_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    shsub z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.shsub.u.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                   <vscale x 16 x i8> %a,
+                                                                   <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @shsub_i16_movprfx(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %unused, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: shsub_i16_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    shsub z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.shsub.u.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x i16> %a,
+                                                                   <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @shsub_i32_movprfx(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %unused, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: shsub_i32_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    shsub z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x i32> %a,
+                                                                   <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @shsub_i64_movprfx(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %unused, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: shsub_i64_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    shsub z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.shsub.u.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; SQRSHL
 ;
@@ -641,6 +789,154 @@ define <vscale x 2 x i64> @srshl_i64_movprfx(<vscale x 2 x i1> %pg, <vscale x 2
   ret <vscale x 2 x i64> %out
 }
 
+;
+; UHSUB
+;
+
+define <vscale x 16 x i8> @uhsub_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uhsub_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsub z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                   <vscale x 16 x i8> %a,
+                                                                   <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uhsub_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uhsub_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x i16> %a,
+                                                                   <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uhsub_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uhsub_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x i32> %a,
+                                                                   <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uhsub_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uhsub_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsub z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; UHSUB (swapped operands)
+;
+
+define <vscale x 16 x i8> @uhsub_i8_swapped_operands(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uhsub_i8_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsubr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                   <vscale x 16 x i8> %b,
+                                                                   <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uhsub_i16_swapped_operands(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uhsub_i16_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsubr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x i16> %b,
+                                                                   <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uhsub_i32_swapped_operands(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uhsub_i32_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsubr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x i32> %b,
+                                                                   <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uhsub_i64_swapped_operands(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uhsub_i64_swapped_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhsubr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x i64> %b,
+                                                                   <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %out
+}
+
+;
+; UHSUB (movprfx)
+;
+
+define <vscale x 16 x i8> @uhsub_i8_movprfx(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uhsub_i8_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    uhsub z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uhsub.u.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                   <vscale x 16 x i8> %a,
+                                                                   <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uhsub_i16_movprfx(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %unused, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uhsub_i16_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    uhsub z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uhsub.u.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                   <vscale x 8 x i16> %a,
+                                                                   <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uhsub_i32_movprfx(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %unused, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uhsub_i32_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    uhsub z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                   <vscale x 4 x i32> %a,
+                                                                   <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @uhsub_i64_movprfx(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %unused, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uhsub_i64_movprfx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    uhsub z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uhsub.u.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                   <vscale x 2 x i64> %a,
+                                                                   <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %out
+}
+
 ;
 ; UQRSHL
 ;
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes.ll
index b5420e9111746..f082bddbf9fec 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes.ll
@@ -741,8 +741,7 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1>, <v
 define <vscale x 4 x i32> @simplify_shsub_intrinsic(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @simplify_shsub_intrinsic
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
 ;
   %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %r
@@ -752,8 +751,7 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1>, <
 define <vscale x 4 x i32> @simplify_shsubr_intrinsic(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @simplify_shsubr_intrinsic
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
 ;
   %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %r
@@ -1017,8 +1015,7 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1>, <v
 define <vscale x 4 x i32> @simplify_uhsub_intrinsic(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @simplify_uhsub_intrinsic
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
 ;
   %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %r
@@ -1028,8 +1025,7 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1>, <
 define <vscale x 4 x i32> @simplify_uhsubr_intrinsic(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @simplify_uhsubr_intrinsic
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
 ;
   %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %r
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-to-u-form.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-to-u-form.ll
index 96ac0efde8764..097ed87279eda 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-to-u-form.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-to-u-form.ll
@@ -324,6 +324,28 @@ define <vscale x 4 x i32> @replace_sdiv_intrinsic_i32(<vscale x 4 x i32> %a, <vs
   ret <vscale x 4 x i32> %r
 }
 
+declare <vscale x 4 x i32> @llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+define <vscale x 4 x i32> @replace_shsub_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @replace_shsub_intrinsic_i32
+; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsub.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsub.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %r
+}
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+define <vscale x 4 x i32> @replace_shsubr_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @replace_shsubr_intrinsic_i32
+; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.shsubr.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %r
+}
+
 declare <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 define <vscale x 4 x i32> @replace_smax_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_smax_intrinsic_i32
@@ -431,6 +453,28 @@ define <vscale x 4 x i32> @replace_udiv_intrinsic_i32(<vscale x 4 x i32> %a, <vs
   ret <vscale x 4 x i32> %r
 }
 
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+define <vscale x 4 x i32> @replace_uhsub_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @replace_uhsub_intrinsic_i32
+; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsub.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %r
+}
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+define <vscale x 4 x i32> @replace_uhsubr_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @replace_uhsubr_intrinsic_i32
+; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uhsubr.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %r
+}
+
 declare <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 define <vscale x 4 x i32> @replace_umax_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_umax_intrinsic_i32
