https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/189987
>From d4cae3a092effbdb0db752b2757b4dfb31504f76 Mon Sep 17 00:00:00 2001 From: Marian Lukac <[email protected]> Date: Wed, 1 Apr 2026 15:51:56 +0000 Subject: [PATCH 1/6] [AArch64] Add intrinsic support for Fdot instr. --- clang/include/clang/Basic/arm_neon.td | 8 ++ clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 28 +++++ .../CodeGen/AArch64/f16f32dot-intrinsics.c | 112 ++++++++++++++++++ .../aarch64-neon-immediate-ranges/dotprod.c | 24 +++- llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 +- .../lib/Target/AArch64/AArch64InstrFormats.td | 19 +++ .../AArch64/aarch64-f16f32dot-intrinsics.ll | 65 ++++++++++ 7 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c create mode 100644 llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index e91d7ce975d31..ed4879a3dbd24 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -1903,6 +1903,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "d def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ>; } +let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in { + def VFDOT_F16 : SInst<"vfdot", "..<<", "fQf">; + def VFDOT_LANE_F16 : SInst<"vfdot_lane", "..<(<q)I", "fQf", + [ImmCheck<3, ImmCheck0_1, 0>]>; + def VFDOT_LANEQ_F16 : SInst<"vfdot_laneq", "..<(<Q)I", "fQf", + [ImmCheck<3, ImmCheck0_3, 0>]>; +} + // v8.2-A FP16 fused multiply-add long instructions. let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fp16fml,neon" in { def VFMLAL_LOW : SInst<"vfmlal_low", ">>..", "hQh">; diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 8ec2f5b83085c..c1731d1d8c100 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -7154,6 +7154,34 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane, ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane"); + case NEON::BI__builtin_neon_vfdot_f32: + case NEON::BI__builtin_neon_vfdotq_f32: { + llvm::Type *InputTy = + llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); + llvm::Type *Tys[2] = {Ty, InputTy}; + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys), + Ops, "vfdot"); + } + + case NEON::BI__builtin_neon_vfdot_lane_f32: + case NEON::BI__builtin_neon_vfdot_laneq_f32: + case NEON::BI__builtin_neon_vfdotq_lane_f32: + case NEON::BI__builtin_neon_vfdotq_laneq_f32: { + llvm::FixedVectorType *InputTy = + llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); + llvm::FixedVectorType *LaneTy = llvm::FixedVectorType::get( + HalfTy, Ops[2]->getType()->getPrimitiveSizeInBits() / 16); + // Treat the lane argument as a splat and use non-lane version of the + // intrinsic. + Ops[2] = Builder.CreateBitCast(Ops[2], LaneTy); + Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]), + InputTy->getElementCount()); + llvm::Type *Tys[2] = {Ty, InputTy}; + Ops.pop_back(); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys), + Ops, "vfdot"); + } + case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm: return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb, {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E, diff --git a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c new file mode 100644 index 0000000000000..aa5fe3a056d27 --- /dev/null +++ b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c @@ -0,0 +1,112 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +f16f32dot -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa,instcombine | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +f16f32dot -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa,instcombine | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +f16f32dot -O3 -S -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include <arm_neon.h> + +// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_f32( +// CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VFDOT3_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z14test_vfdot_f3213__Float32x2_t13__Float16x4_tS0_( +// CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) +// CHECK-CXX-NEXT: ret <2 x float> [[VFDOT3_I]] +// +float32x2_t test_vfdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) { + return vfdot_f32(r, a, b); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_f32( +// CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VFDOT3_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z15test_vfdotq_f3213__Float32x4_t13__Float16x8_tS0_( +// CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) +// CHECK-CXX-NEXT: ret <4 x float> [[VFDOT3_I]] +// +float32x4_t test_vfdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { + return vfdotq_f32(r, a, b); +} + +// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_lane_f32( +// CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-NEXT: ret <2 x float> [[VFDOT2]] +// +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z19test_vfdot_lane_f3213__Float32x2_t13__Float16x4_tS0_( +// CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <2 x float> [[VFDOT2]] +// +float32x2_t test_vfdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) { + return vfdot_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_laneq_f32( +// CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-NEXT: ret <2 x float> [[VFDOT2]] +// +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z20test_vfdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t( +// CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <2 x float> [[VFDOT2]] +// +float32x2_t test_vfdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) { + return vfdot_laneq_f32(r, a, b, 3); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_lane_f32( +// CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-NEXT: ret <4 x float> [[VFDOT2]] +// +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z20test_vfdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t( +// CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <4 x float> [[VFDOT2]] +// +float32x4_t test_vfdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) { + return vfdotq_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_laneq_f32( +// CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-NEXT: ret <4 x float> [[VFDOT2]] +// +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z21test_vfdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_( +// CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <4 x float> [[VFDOT2]] +// +float32x4_t test_vfdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { + return vfdotq_laneq_f32(r, a, b, 3); +} diff --git a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c index 11f2c660a8ff2..7f1947e5d9d07 100644 --- a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c +++ b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +v8.2a -target-feature +dotprod -ffreestanding -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +v8.2a -target-feature +dotprod -target-feature +f16f32dot -ffreestanding -fsyntax-only -verify %s #include <arm_neon.h> // REQUIRES: aarch64-registered-target @@ -48,3 +48,25 @@ void test_dot_product_s32(int32x2_t arg_i32x2, int8x16_t arg_i8x16, int8x8_t arg vdotq_lane_s32(arg_i32x4, arg_i8x16, arg_i8x8, 2); // expected-error-re {{argument value {{.*}} is outside the valid range}} } + +void test_dot_product_f32(float32x2_t r2, float32x4_t r4, float16x4_t h4, float16x8_t h8) { + (void)vfdot_lane_f32(r2, h4, h4, -1); +// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}} + (void)vfdot_lane_f32(r2, h4, h4, 2); +// expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}} + + (void)vfdot_laneq_f32(r2, h4, h8, -1); +// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}} + (void)vfdot_laneq_f32(r2, h4, h8, 4); +// expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}} + + (void)vfdotq_lane_f32(r4, h8, h4, -1); +// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}} + (void)vfdotq_lane_f32(r4, h8, h4, 2); +// expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}} + + (void)vfdotq_laneq_f32(r4, h8, h8, -1); +// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}} + (void)vfdotq_laneq_f32(r4, h8, h8, 4); +// expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 63500beaa6521..8765842833ce9 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -521,6 +521,7 @@ let TargetPrefix = "aarch64" in { def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic; + def int_aarch64_neon_fdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfmmla : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], @@ -4296,4 +4297,3 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty], [llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>; } - diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 19312d34609ce..94249db0c1ed3 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -6699,6 +6699,13 @@ multiclass SIMDThreeSameVectorFDot<string asm, SDPatternOperator OpNode = null_f v2f32, v4f16, OpNode>; def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<1, 0, 0b10, 0b1111, asm, ".4s", ".8h", V128, v4f32, v8f16, OpNode>; + + def : Pat<(v2f32 (int_aarch64_neon_fdot (v2f32 V64:$Rd), + (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (!cast<Instruction>(NAME # "v4f16_v2f32") $Rd, $Rn, $Rm)>; + def : Pat<(v4f32 (int_aarch64_neon_fdot (v4f32 V128:$Rd), + (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (!cast<Instruction>(NAME # "v8f16_v4f32") $Rd, $Rn, $Rm)>; } // FP8 assembly/disassembly classes @@ -9373,6 +9380,18 @@ multiclass SIMDThreeSameVectorFDOTIndex<string asm> { V64, v2f32, v4f16, VectorIndexS, null_frag>; def v8f16_v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b01, 0b1001, asm, ".4s", ".8h",".2h", V128, v4f32, v8f16, VectorIndexS, null_frag>; + + def : Pat<(v2f32 (int_aarch64_neon_fdot + (v2f32 V64:$Rd), (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128:$Rm), VectorIndexS:$Idx)))), + (!cast<Instruction>(NAME # "v4f16_v2f32") $Rd, $Rn, $Rm, + VectorIndexS:$Idx)>; + + def : Pat<(v4f32 (int_aarch64_neon_fdot + (v4f32 V128:$Rd), (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128:$Rm), VectorIndexS:$Idx)))), + (!cast<Instruction>(NAME # "v8f16_v4f32") $Rd, $Rn, $Rm, + VectorIndexS:$Idx)>; } //---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll new file mode 100644 index 0000000000000..9f31a241198a6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple aarch64 -mattr=+f16f32dot %s -o - | FileCheck %s + +define <2 x float> @test_vfdot_f32(<2 x float> %r, <4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vfdot_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.2s, v1.4h, v2.4h +; CHECK-NEXT: ret + %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, <4 x half> %a, <4 x half> %b) + ret <2 x float> %res +} + +define <4 x float> @test_vfdotq_f32(<4 x float> %r, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vfdotq_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, <8 x half> %a, <8 x half> %b) + ret <4 x float> %res +} + +define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vfdot_lane_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fdot v0.2s, v1.4h, v2.2h[0] +; CHECK-NEXT: ret + %lane = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> zeroinitializer + %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, <4 x half> %a, <4 x half> %lane) + ret <2 x float> %res +} + +define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vfdotq_laneq_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.4s, v1.8h, v2.2h[3] +; CHECK-NEXT: ret + %lane = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, <8 x half> %a, <8 x half> %lane) + ret <4 x float> %res +} + +define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vfdot_laneq_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.2s, v1.4h, v2.2h[3] +; CHECK-NEXT: ret + %lane = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, <4 x half> %a, <4 x half> %lane) + ret <2 x float> %res +} + +define <4 x float> @test_vfdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vfdotq_lane_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fdot v0.4s, v1.8h, v2.2h[0] +; CHECK-NEXT: ret + %lane = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> zeroinitializer + %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, <8 x half> %a, <8 x half> %lane) + ret <4 x float> %res +} + +declare <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>) +declare <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) >From 4ab762fd08a1a30855550a7d5c1dadee91a57d7e Mon Sep 17 00:00:00 2001 From: Marian Lukac <[email protected]> Date: Wed, 1 Apr 2026 16:08:34 +0000 Subject: [PATCH 2/6] Remove undef --- llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll index 9f31a241198a6..d7167c152955a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll @@ -25,7 +25,7 @@ define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x half ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: fdot v0.2s, v1.4h, v2.2h[0] ; CHECK-NEXT: ret - %lane = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> zeroinitializer + %lane = shufflevector <4 x half> %b, <4 x half> poison, <4 x i32> zeroinitializer %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, <4 x half> %a, <4 x half> %lane) ret <2 x float> %res } @@ -35,7 +35,7 @@ define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x ha ; CHECK: // %bb.0: ; CHECK-NEXT: fdot v0.4s, v1.8h, v2.2h[3] ; CHECK-NEXT: ret - %lane = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %lane = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, <8 x half> %a, <8 x half> %lane) ret <4 x float> %res } @@ -45,7 +45,7 @@ define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x hal ; CHECK: // %bb.0: ; CHECK-NEXT: fdot v0.2s, v1.4h, v2.2h[3] ; CHECK-NEXT: ret - %lane = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %lane = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, <4 x half> %a, <4 x half> %lane) ret <2 x float> %res } @@ -56,7 +56,7 @@ define <4 x float> @test_vfdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x hal ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: fdot v0.4s, v1.8h, v2.2h[0] ; CHECK-NEXT: ret - %lane = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> zeroinitializer + %lane = shufflevector <4 x half> %b, <4 x half> poison, <8 x i32> zeroinitializer %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, <8 x half> %a, <8 x half> %lane) ret <4 x float> %res } >From 7f14f5eb3c854518dd6df07fffff3b41b7739628 Mon Sep 17 00:00:00 2001 From: Marian Lukac <[email protected]> Date: Thu, 16 Apr 2026 10:20:43 +0000 Subject: [PATCH 3/6] Rename to vdot --- clang/include/clang/Basic/arm_neon.td | 10 +- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 16 ++-- .../CodeGen/AArch64/f16f32dot-intrinsics.c | 96 +++++++++---------- .../aarch64-neon-immediate-ranges/dotprod.c | 16 ++-- .../AArch64/aarch64-f16f32dot-intrinsics.ll | 24 ++--- 5 files changed, 81 insertions(+), 81 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index ed4879a3dbd24..64ee23a082370 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -1904,11 +1904,11 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "d } let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in { - def VFDOT_F16 : SInst<"vfdot", "..<<", "fQf">; - def VFDOT_LANE_F16 : SInst<"vfdot_lane", "..<(<q)I", "fQf", - [ImmCheck<3, ImmCheck0_1, 0>]>; - def VFDOT_LANEQ_F16 : SInst<"vfdot_laneq", "..<(<Q)I", "fQf", - [ImmCheck<3, ImmCheck0_3, 0>]>; + def VDOT_F16 : SInst<"vdot", "..<<", "fQf">; + def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf", + [ImmCheck<3, ImmCheck0_1, 0>]>; + def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf", + [ImmCheck<3, ImmCheck0_3, 0>]>; } // v8.2-A FP16 fused multiply-add long instructions. diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index c1731d1d8c100..9cc5fea42ae67 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -7154,19 +7154,19 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane, ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane"); - case NEON::BI__builtin_neon_vfdot_f32: - case NEON::BI__builtin_neon_vfdotq_f32: { + case NEON::BI__builtin_neon_vdot_f32: + case NEON::BI__builtin_neon_vdotq_f32: { llvm::Type *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = {Ty, InputTy}; return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys), - Ops, "vfdot"); + Ops, "vdot"); } - case NEON::BI__builtin_neon_vfdot_lane_f32: - case NEON::BI__builtin_neon_vfdot_laneq_f32: - case NEON::BI__builtin_neon_vfdotq_lane_f32: - case NEON::BI__builtin_neon_vfdotq_laneq_f32: { + case NEON::BI__builtin_neon_vdot_lane_f32: + case NEON::BI__builtin_neon_vdot_laneq_f32: + case NEON::BI__builtin_neon_vdotq_lane_f32: + case NEON::BI__builtin_neon_vdotq_laneq_f32: { llvm::FixedVectorType *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::FixedVectorType *LaneTy = llvm::FixedVectorType::get( @@ -7179,7 +7179,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, llvm::Type *Tys[2] = {Ty, InputTy}; Ops.pop_back(); return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys), - Ops, "vfdot"); + Ops, "vdot"); } case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm: diff --git a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c index aa5fe3a056d27..7ae61460e40df 100644 --- a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c +++ b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c @@ -7,106 +7,106 @@ #include <arm_neon.h> -// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_f32( +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32( // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) -// CHECK-NEXT: ret <2 x float> [[VFDOT3_I]] +// CHECK-NEXT: [[VDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VDOT3_I]] // -// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z14test_vfdot_f3213__Float32x2_t13__Float16x4_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z13test_vdot_f3213__Float32x2_t13__Float16x4_tS0_( // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[VFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) -// CHECK-CXX-NEXT: ret <2 x float> [[VFDOT3_I]] +// CHECK-CXX-NEXT: [[VDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) +// CHECK-CXX-NEXT: ret <2 x float> [[VDOT3_I]] // -float32x2_t test_vfdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) { - return vfdot_f32(r, a, b); +float32x2_t test_vdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) { + return vdot_f32(r, a, b); } -// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_f32( +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32( // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) -// CHECK-NEXT: ret <4 x float> [[VFDOT3_I]] +// CHECK-NEXT: [[VDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VDOT3_I]] // -// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z15test_vfdotq_f3213__Float32x4_t13__Float16x8_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z14test_vdotq_f3213__Float32x4_t13__Float16x8_tS0_( // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[VFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) -// CHECK-CXX-NEXT: ret <4 x float> [[VFDOT3_I]] +// CHECK-CXX-NEXT: [[VDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) +// CHECK-CXX-NEXT: ret <4 x float> [[VDOT3_I]] // -float32x4_t test_vfdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { - return vfdotq_f32(r, a, b); +float32x4_t test_vdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { + return vdotq_f32(r, a, b); } -// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_lane_f32( +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32( // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) -// CHECK-NEXT: ret <2 x float> [[VFDOT2]] +// CHECK-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-NEXT: ret <2 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z19test_vfdot_lane_f3213__Float32x2_t13__Float16x4_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z18test_vdot_lane_f3213__Float32x2_t13__Float16x4_tS0_( // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> zeroinitializer -// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) -// CHECK-CXX-NEXT: ret <2 x float> [[VFDOT2]] +// CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <2 x float> [[VDOT2]] // -float32x2_t test_vfdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) { - return vfdot_lane_f32(r, a, b, 0); +float32x2_t test_vdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) { + return vdot_lane_f32(r, a, b, 0); } -// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_laneq_f32( +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32( // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) -// CHECK-NEXT: ret <2 x float> [[VFDOT2]] +// CHECK-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-NEXT: ret <2 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z20test_vfdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t( +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z19test_vdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t( // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) -// CHECK-CXX-NEXT: ret <2 x float> [[VFDOT2]] +// CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <2 x float> [[VDOT2]] // -float32x2_t test_vfdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) { - return vfdot_laneq_f32(r, a, b, 3); +float32x2_t test_vdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) { + return vdot_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_lane_f32( +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32( // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> zeroinitializer -// CHECK-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) -// CHECK-NEXT: ret <4 x float> [[VFDOT2]] +// CHECK-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-NEXT: ret <4 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z20test_vfdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t( +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t( // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> zeroinitializer -// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) -// CHECK-CXX-NEXT: ret <4 x float> [[VFDOT2]] +// CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <4 x float> [[VDOT2]] // -float32x4_t test_vfdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) { - return vfdotq_lane_f32(r, a, b, 0); +float32x4_t test_vdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) { + return vdotq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_laneq_f32( +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32( // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) -// CHECK-NEXT: ret <4 x float> [[VFDOT2]] +// CHECK-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-NEXT: ret <4 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z21test_vfdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z20test_vdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_( // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-CXX-NEXT: [[VFDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) -// CHECK-CXX-NEXT: ret <4 x float> [[VFDOT2]] +// CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-CXX-NEXT: ret <4 x float> [[VDOT2]] // -float32x4_t test_vfdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { - return vfdotq_laneq_f32(r, a, b, 3); +float32x4_t test_vdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { + return vdotq_laneq_f32(r, a, b, 3); } diff --git a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c index 7f1947e5d9d07..e3d3cec1cfe5d 100644 --- a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c +++ b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c @@ -50,23 +50,23 @@ void test_dot_product_s32(int32x2_t arg_i32x2, int8x16_t arg_i8x16, int8x8_t arg } void test_dot_product_f32(float32x2_t r2, float32x4_t r4, float16x4_t h4, float16x8_t h8) { - (void)vfdot_lane_f32(r2, h4, h4, -1); + (void)vdot_lane_f32(r2, h4, h4, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}} - (void)vfdot_lane_f32(r2, h4, h4, 2); + (void)vdot_lane_f32(r2, h4, h4, 2); // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}} - (void)vfdot_laneq_f32(r2, h4, h8, -1); + (void)vdot_laneq_f32(r2, h4, h8, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}} - (void)vfdot_laneq_f32(r2, h4, h8, 4); + (void)vdot_laneq_f32(r2, h4, h8, 4); // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}} - (void)vfdotq_lane_f32(r4, h8, h4, -1); + (void)vdotq_lane_f32(r4, h8, h4, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}} - (void)vfdotq_lane_f32(r4, h8, h4, 2); + (void)vdotq_lane_f32(r4, h8, h4, 2); // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}} - (void)vfdotq_laneq_f32(r4, h8, h8, -1); + (void)vdotq_laneq_f32(r4, h8, h8, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}} - (void)vfdotq_laneq_f32(r4, h8, h8, 4); + (void)vdotq_laneq_f32(r4, h8, h8, 4); // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}} } diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll index d7167c152955a..44687ef5102fd 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple aarch64 -mattr=+f16f32dot %s -o - | FileCheck %s -define <2 x float> @test_vfdot_f32(<2 x float> %r, <4 x half> %a, <4 x half> %b) { -; CHECK-LABEL: test_vfdot_f32: +define <2 x float> @test_vdot_f32(<2 x float> %r, <4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vdot_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fdot v0.2s, v1.4h, v2.4h ; CHECK-NEXT: ret @@ -10,8 +10,8 @@ define <2 x float> @test_vfdot_f32(<2 x float> %r, <4 x half> %a, <4 x half> %b) ret <2 x float> %res } -define <4 x float> @test_vfdotq_f32(<4 x float> %r, <8 x half> %a, <8 x half> %b) { -; CHECK-LABEL: test_vfdotq_f32: +define <4 x float> @test_vdotq_f32(<4 x float> %r, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vdotq_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fdot v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret @@ -19,8 +19,8 @@ define <4 x float> @test_vfdotq_f32(<4 x float> %r, <8 x half> %a, <8 x half> %b ret <4 x float> %res } -define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x half> %b) { -; CHECK-LABEL: test_vfdot_lane_f32: +define <2 x float> @test_vdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vdot_lane_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: fdot v0.2s, v1.4h, v2.2h[0] @@ -30,8 +30,8 @@ define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x half ret <2 x float> %res } -define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x half> %b) { -; CHECK-LABEL: test_vfdotq_laneq_f32: +define <4 x float> @test_vdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vdotq_laneq_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fdot v0.4s, v1.8h, v2.2h[3] ; CHECK-NEXT: ret @@ -40,8 +40,8 @@ define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x ha ret <4 x float> %res } -define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x half> %b) { -; CHECK-LABEL: test_vfdot_laneq_f32: +define <2 x float> @test_vdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x half> %b) { +; CHECK-LABEL: test_vdot_laneq_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fdot v0.2s, v1.4h, v2.2h[3] ; CHECK-NEXT: ret @@ -50,8 +50,8 @@ define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x hal ret <2 x float> %res } -define <4 x float> @test_vfdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x half> %b) { -; CHECK-LABEL: test_vfdotq_lane_f32: +define <4 x float> @test_vdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x half> %b) { +; CHECK-LABEL: test_vdotq_lane_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: fdot v0.4s, v1.8h, v2.2h[0] >From c18b5b3ba1bc01976d24eeb0034f7c5d98e9ba2b Mon Sep 17 00:00:00 2001 From: Marian Lukac <[email protected]> Date: Thu, 23 Apr 2026 15:07:55 +0000 Subject: [PATCH 4/6] Adjust the Imm check --- clang/include/clang/Basic/arm_neon.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 64ee23a082370..484b905786f92 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -1906,9 +1906,9 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "d let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in { def VDOT_F16 : SInst<"vdot", "..<<", "fQf">; def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf", - [ImmCheck<3, ImmCheck0_1, 0>]>; + [ImmCheck<3, ImmCheckLaneIndex, 0>]>; def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf", - [ImmCheck<3, ImmCheck0_3, 0>]>; + [ImmCheck<3, ImmCheckLaneIndex, 0>]>; } // v8.2-A FP16 fused multiply-add long instructions. >From 862d01af4dc10a48c36a4bdd9c04a408610a4851 Mon Sep 17 00:00:00 2001 From: Marian Lukac <[email protected]> Date: Thu, 23 Apr 2026 16:18:56 +0000 Subject: [PATCH 5/6] Fix imm check --- clang/include/clang/Basic/arm_neon.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 484b905786f92..17914620b2cfa 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -1906,9 +1906,9 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "d let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in { def VDOT_F16 : SInst<"vdot", "..<<", "fQf">; def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf", - [ImmCheck<3, ImmCheckLaneIndex, 0>]>; + [ImmCheck<3, ImmCheckLaneIndex, 2>]>; def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf", - [ImmCheck<3, ImmCheckLaneIndex, 0>]>; + [ImmCheck<3, ImmCheckLaneIndex, 2>]>; } // v8.2-A FP16 fused multiply-add long instructions. >From bd656a23c3c0e0ef39dfe1c5adc0b28543961263 Mon Sep 17 00:00:00 2001 From: Marian Lukac <[email protected]> Date: Thu, 28 May 2026 10:20:19 +0000 Subject: [PATCH 6/6] rename intrinsics to correct name --- clang/include/clang/Basic/arm_neon.td | 6 +-- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 12 ++--- .../CodeGen/AArch64/f16f32dot-intrinsics.c | 48 +++++++++---------- .../aarch64-neon-immediate-ranges/dotprod.c | 18 +++---- .../AArch64/aarch64-f16f32dot-intrinsics.ll | 5 +- 5 files changed, 43 insertions(+), 46 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 17914620b2cfa..3bf61c1eed6aa 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -1904,10 +1904,10 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "d } let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in { - def VDOT_F16 : SInst<"vdot", "..<<", "fQf">; - def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf", + def VDOT_F16 : SInst<"vdot_f32", ">>..", "hQh">; + def VDOT_LANE_F16 : SInst<"vdot_lane_f32", ">>.qI", "hQh", [ImmCheck<3, ImmCheckLaneIndex, 2>]>; - def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf", + def VDOT_LANEQ_F16 : SInst<"vdot_laneq_f32", ">>.QI", "hQh", [ImmCheck<3, ImmCheckLaneIndex, 2>]>; } diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 9cc5fea42ae67..e432996429ea5 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -7154,8 +7154,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane, ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane"); - case NEON::BI__builtin_neon_vdot_f32: - case NEON::BI__builtin_neon_vdotq_f32: { + case NEON::BI__builtin_neon_vdot_f32_f16: + case NEON::BI__builtin_neon_vdotq_f32_f16: { llvm::Type *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = {Ty, InputTy}; @@ -7163,10 +7163,10 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops, "vdot"); } - case NEON::BI__builtin_neon_vdot_lane_f32: - case NEON::BI__builtin_neon_vdot_laneq_f32: - case NEON::BI__builtin_neon_vdotq_lane_f32: - case NEON::BI__builtin_neon_vdotq_laneq_f32: { + case NEON::BI__builtin_neon_vdot_lane_f32_f16: + case NEON::BI__builtin_neon_vdot_laneq_f32_f16: + case NEON::BI__builtin_neon_vdotq_lane_f32_f16: + case NEON::BI__builtin_neon_vdotq_laneq_f32_f16: { llvm::FixedVectorType *InputTy = llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16); llvm::FixedVectorType *LaneTy = llvm::FixedVectorType::get( diff --git a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c index 7ae61460e40df..763a8228680f9 100644 --- a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c +++ b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c @@ -7,106 +7,106 @@ #include <arm_neon.h> -// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32( +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32_f16( // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) // CHECK-NEXT: ret <2 x float> [[VDOT3_I]] // -// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z13test_vdot_f3213__Float32x2_t13__Float16x4_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z17test_vdot_f32_f1613__Float32x2_t13__Float16x4_tS0_( // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[VDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[B]]) // CHECK-CXX-NEXT: ret <2 x float> [[VDOT3_I]] // -float32x2_t test_vdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) { - return vdot_f32(r, a, b); +float32x2_t test_vdot_f32_f16(float32x2_t r, float16x4_t a, float16x4_t b) { + return vdot_f32_f16(r, a, b); } -// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32( +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32_f16( // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) // CHECK-NEXT: ret <4 x float> [[VDOT3_I]] // -// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z14test_vdotq_f3213__Float32x4_t13__Float16x8_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z18test_vdotq_f32_f1613__Float32x4_t13__Float16x8_tS0_( // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[VDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[B]]) // CHECK-CXX-NEXT: ret <4 x float> [[VDOT3_I]] // -float32x4_t test_vdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { - return vdotq_f32(r, a, b); +float32x4_t test_vdotq_f32_f16(float32x4_t r, float16x8_t a, float16x8_t b) { + return vdotq_f32_f16(r, a, b); } -// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32( +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32_f16( // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) // CHECK-NEXT: ret <2 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z18test_vdot_lane_f3213__Float32x2_t13__Float16x4_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z22test_vdot_lane_f32_f1613__Float32x2_t13__Float16x4_tS0_( // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> zeroinitializer // CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) // CHECK-CXX-NEXT: ret <2 x float> [[VDOT2]] // -float32x2_t test_vdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) { - return vdot_lane_f32(r, a, b, 0); +float32x2_t test_vdot_lane_f32_f16(float32x2_t r, float16x4_t a, float16x4_t b) { + return vdot_lane_f32_f16(r, a, b, 0); } -// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32( +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32_f16( // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) // CHECK-NEXT: ret <2 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z19test_vdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t( +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z23test_vdot_laneq_f32_f1613__Float32x2_t13__Float16x4_t13__Float16x8_t( // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x half> [[LANE]]) // CHECK-CXX-NEXT: ret <2 x float> [[VDOT2]] // -float32x2_t test_vdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) { - return vdot_laneq_f32(r, a, b, 3); +float32x2_t test_vdot_laneq_f32_f16(float32x2_t r, float16x4_t a, float16x8_t b) { + return vdot_laneq_f32_f16(r, a, b, 3); } -// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32( +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32_f16( // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) // CHECK-NEXT: ret <4 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t( +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z23test_vdotq_lane_f32_f1613__Float32x4_t13__Float16x8_t13__Float16x4_t( // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> zeroinitializer // CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) // CHECK-CXX-NEXT: ret <4 x float> [[VDOT2]] // -float32x4_t test_vdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) { - return vdotq_lane_f32(r, a, b, 0); +float32x4_t test_vdotq_lane_f32_f16(float32x4_t r, float16x8_t a, float16x4_t b) { + return vdotq_lane_f32_f16(r, a, b, 0); } -// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32( +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32_f16( // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) // CHECK-NEXT: ret <4 x float> [[VDOT2]] // -// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z20test_vdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_( +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z24test_vdotq_laneq_f32_f1613__Float32x4_t13__Float16x8_tS0_( // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> // CHECK-CXX-NEXT: [[VDOT2:%.*]] = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x half> [[LANE]]) // CHECK-CXX-NEXT: ret <4 x float> [[VDOT2]] // -float32x4_t test_vdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) { - return vdotq_laneq_f32(r, a, b, 3); +float32x4_t test_vdotq_laneq_f32_f16(float32x4_t r, float16x8_t a, float16x8_t b) { + return vdotq_laneq_f32_f16(r, a, b, 3); } diff --git a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c index e3d3cec1cfe5d..c99d69d7caf41 100644 --- a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c +++ b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c @@ -49,24 +49,24 @@ void test_dot_product_s32(int32x2_t arg_i32x2, int8x16_t arg_i8x16, int8x8_t arg } -void test_dot_product_f32(float32x2_t r2, float32x4_t r4, float16x4_t h4, float16x8_t h8) { - (void)vdot_lane_f32(r2, h4, h4, -1); +void test_dot_product_f32_f16(float32x2_t r2, float32x4_t r4, float16x4_t h4, float16x8_t h8) { + (void)vdot_lane_f32_f16(r2, h4, h4, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}} - (void)vdot_lane_f32(r2, h4, h4, 2); + (void)vdot_lane_f32_f16(r2, h4, h4, 2); // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}} - (void)vdot_laneq_f32(r2, h4, h8, -1); + (void)vdot_laneq_f32_f16(r2, h4, h8, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}} - (void)vdot_laneq_f32(r2, h4, h8, 4); + (void)vdot_laneq_f32_f16(r2, h4, h8, 4); // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}} - (void)vdotq_lane_f32(r4, h8, h4, -1); + (void)vdotq_lane_f32_f16(r4, h8, h4, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}} - (void)vdotq_lane_f32(r4, h8, h4, 2); + (void)vdotq_lane_f32_f16(r4, h8, h4, 2); // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}} - (void)vdotq_laneq_f32(r4, h8, h8, -1); + (void)vdotq_laneq_f32_f16(r4, h8, h8, -1); // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}} - (void)vdotq_laneq_f32(r4, h8, h8, 4); + (void)vdotq_laneq_f32_f16(r4, h8, h8, 4); // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}} } diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll index 44687ef5102fd..7da0076ea9b4b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll @@ -59,7 +59,4 @@ define <4 x float> @test_vdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x half %lane = shufflevector <4 x half> %b, <4 x half> poison, <8 x i32> zeroinitializer %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, <8 x half> %a, <8 x half> %lane) ret <4 x float> %res -} - -declare <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>) -declare <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) +} \ No newline at end of file _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
