[clang] [llvm] [AArch64] Add intrinsic support for Fdot instr. (PR #189987)

via cfe-commits Fri, 29 May 2026 02:39:19 -0700

https://github.com/Lukacma updated 
https://github.com/llvm/llvm-project/pull/189987


>From d4cae3a092effbdb0db752b2757b4dfb31504f76 Mon Sep 17 00:00:00 2001
From: Marian Lukac <[email protected]>
Date: Wed, 1 Apr 2026 15:51:56 +0000
Subject: [PATCH 1/6] [AArch64] Add intrinsic support for Fdot instr.

---
 clang/include/clang/Basic/arm_neon.td         |   8 ++
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  28 +++++
 .../CodeGen/AArch64/f16f32dot-intrinsics.c    | 112 ++++++++++++++++++
 .../aarch64-neon-immediate-ranges/dotprod.c   |  24 +++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |   2 +-
 .../lib/Target/AArch64/AArch64InstrFormats.td |  19 +++
 .../AArch64/aarch64-f16f32dot-intrinsics.ll   |  65 ++++++++++
 7 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll

diff --git a/clang/include/clang/Basic/arm_neon.td 
b/clang/include/clang/Basic/arm_neon.td
index e91d7ce975d31..ed4879a3dbd24 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1903,6 +1903,14 @@ let ArchGuard = "defined(__aarch64__) || 
defined(__arm64ec__)", TargetGuard = "d
   def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", 
OP_DOT_LNQ>;
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in {
+  def VFDOT_F16 : SInst<"vfdot", "..<<", "fQf">;
+  def VFDOT_LANE_F16 : SInst<"vfdot_lane", "..<(<q)I", "fQf",
+                            [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VFDOT_LANEQ_F16 : SInst<"vfdot_laneq", "..<(<Q)I", "fQf",
+                             [ImmCheck<3, ImmCheck0_3, 0>]>;
+}
+
 // v8.2-A FP16 fused multiply-add long instructions.
 let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = 
"fp16fml,neon" in {
   def VFMLAL_LOW  : SInst<"vfmlal_low",  ">>..", "hQh">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 8ec2f5b83085c..c1731d1d8c100 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7154,6 +7154,34 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
                                ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");
 
+  case NEON::BI__builtin_neon_vfdot_f32:
+  case NEON::BI__builtin_neon_vfdotq_f32: {
+    llvm::Type *InputTy =
+        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = {Ty, InputTy};
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys),
+                        Ops, "vfdot");
+  }
+
+  case NEON::BI__builtin_neon_vfdot_lane_f32:
+  case NEON::BI__builtin_neon_vfdot_laneq_f32:
+  case NEON::BI__builtin_neon_vfdotq_lane_f32:
+  case NEON::BI__builtin_neon_vfdotq_laneq_f32: {
+    llvm::FixedVectorType *InputTy =
+        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::FixedVectorType *LaneTy = llvm::FixedVectorType::get(
+        HalfTy, Ops[2]->getType()->getPrimitiveSizeInBits() / 16);
+    // Treat the lane argument as a splat and use non-lane version of the
+    // intrinsic.
+    Ops[2] = Builder.CreateBitCast(Ops[2], LaneTy);
+    Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]),
+                           InputTy->getElementCount());
+    llvm::Type *Tys[2] = {Ty, InputTy};
+    Ops.pop_back();
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys),
+                        Ops, "vfdot");
+  }
+
   case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
                            {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
diff --git a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c 
b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
new file mode 100644
index 0000000000000..aa5fe3a056d27
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
@@ -0,0 +1,112 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +f16f32dot -disable-O0-optnone -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa,instcombine | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +f16f32dot -disable-O0-optnone -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa,instcombine | FileCheck %s -check-prefix CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +f16f32dot -O3 -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_f32(
+// CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VFDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
+// CHECK-NEXT:    ret <2 x float> [[VFDOT3_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z14test_vfdot_f3213__Float32x2_t13__Float16x4_tS0_(
+// CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VFDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
+// CHECK-CXX-NEXT:    ret <2 x float> [[VFDOT3_I]]
+//
+float32x2_t test_vfdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
+  return vfdot_f32(r, a, b);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_f32(
+// CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VFDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
+// CHECK-NEXT:    ret <4 x float> [[VFDOT3_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z15test_vfdotq_f3213__Float32x4_t13__Float16x8_tS0_(
+// CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VFDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VFDOT3_I]]
+//
+float32x4_t test_vfdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) {
+  return vfdotq_f32(r, a, b);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> 
poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <2 x float> [[VFDOT2]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z19test_vfdot_lane_f3213__Float32x2_t13__Float16x4_tS0_(
+// CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x 
half> poison, <4 x i32> zeroinitializer
+// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <2 x float> [[VFDOT2]]
+//
+float32x2_t test_vfdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
+  return vfdot_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> 
poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <2 x float> [[VFDOT2]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z20test_vfdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t(
+// CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x 
half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <2 x float> [[VFDOT2]]
+//
+float32x2_t test_vfdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) {
+  return vfdot_laneq_f32(r, a, b, 3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> 
poison, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <4 x float> [[VFDOT2]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z20test_vfdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t(
+// CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x 
half> poison, <8 x i32> zeroinitializer
+// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VFDOT2]]
+//
+float32x4_t test_vfdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) {
+  return vfdotq_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> 
poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <4 x float> [[VFDOT2]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z21test_vfdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_(
+// CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x 
half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VFDOT2]]
+//
+float32x4_t test_vfdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) 
{
+  return vfdotq_laneq_f32(r, a, b, 3);
+}
diff --git a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c 
b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
index 11f2c660a8ff2..7f1947e5d9d07 100644
--- a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
+++ b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon 
-target-feature +v8.2a -target-feature +dotprod -ffreestanding -fsyntax-only 
-verify %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon 
-target-feature +v8.2a -target-feature +dotprod -target-feature +f16f32dot 
-ffreestanding -fsyntax-only -verify %s
 
 #include <arm_neon.h>
 // REQUIRES: aarch64-registered-target
@@ -48,3 +48,25 @@ void test_dot_product_s32(int32x2_t arg_i32x2, int8x16_t 
arg_i8x16, int8x8_t arg
        vdotq_lane_s32(arg_i32x4, arg_i8x16, arg_i8x8, 2); // expected-error-re 
{{argument value {{.*}} is outside the valid range}}
 
 }
+
+void test_dot_product_f32(float32x2_t r2, float32x4_t r4, float16x4_t h4, 
float16x8_t h8) {
+  (void)vfdot_lane_f32(r2, h4, h4, -1);
+// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+  (void)vfdot_lane_f32(r2, h4, h4, 2);
+// expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
+
+  (void)vfdot_laneq_f32(r2, h4, h8, -1);
+// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void)vfdot_laneq_f32(r2, h4, h8, 4);
+// expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}}
+
+  (void)vfdotq_lane_f32(r4, h8, h4, -1);
+// expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+  (void)vfdotq_lane_f32(r4, h8, h4, 2);
+// expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
+
+  (void)vfdotq_laneq_f32(r4, h8, h8, -1);
+// expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void)vfdotq_laneq_f32(r4, h8, h8, 4);
+// expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 63500beaa6521..8765842833ce9 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -521,6 +521,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
+  def int_aarch64_neon_fdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfmmla
     : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
                 [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
@@ -4296,4 +4297,3 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, 
llvm_nxv2i64_ty],
       [llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], 
[IntrNoMem]>;
 }
-
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td 
b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 19312d34609ce..94249db0c1ed3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6699,6 +6699,13 @@ multiclass SIMDThreeSameVectorFDot<string asm, 
SDPatternOperator OpNode = null_f
                                          v2f32, v4f16, OpNode>;
   def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<1, 0, 0b10, 0b1111, asm, ".4s", 
".8h", V128,
                                          v4f32, v8f16, OpNode>;
+
+  def : Pat<(v2f32 (int_aarch64_neon_fdot (v2f32 V64:$Rd),
+                    (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+            (!cast<Instruction>(NAME # "v4f16_v2f32") $Rd, $Rn, $Rm)>;
+  def : Pat<(v4f32 (int_aarch64_neon_fdot (v4f32 V128:$Rd),
+                    (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v8f16_v4f32") $Rd, $Rn, $Rm)>;
 }
 
 // FP8 assembly/disassembly classes
@@ -9373,6 +9380,18 @@ multiclass SIMDThreeSameVectorFDOTIndex<string asm> {
                                            V64, v2f32, v4f16, VectorIndexS, 
null_frag>;
   def v8f16_v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b01, 0b1001, asm, 
".4s", ".8h",".2h",
                                             V128, v4f32, v8f16, VectorIndexS, 
null_frag>;
+
+  def : Pat<(v2f32 (int_aarch64_neon_fdot
+              (v2f32 V64:$Rd), (v4f16 V64:$Rn),
+              (v4f16 (AArch64duplane16 (v8f16 V128:$Rm), VectorIndexS:$Idx)))),
+            (!cast<Instruction>(NAME # "v4f16_v2f32") $Rd, $Rn, $Rm,
+              VectorIndexS:$Idx)>;
+
+  def : Pat<(v4f32 (int_aarch64_neon_fdot
+              (v4f32 V128:$Rd), (v8f16 V128:$Rn),
+              (v8f16 (AArch64duplane16 (v8f16 V128:$Rm), VectorIndexS:$Idx)))),
+            (!cast<Instruction>(NAME # "v8f16_v4f32") $Rd, $Rn, $Rm,
+              VectorIndexS:$Idx)>;
 }
 
 //----------------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll 
b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
new file mode 100644
index 0000000000000..9f31a241198a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64 -mattr=+f16f32dot %s -o - | FileCheck %s
+
+define <2 x float> @test_vfdot_f32(<2 x float> %r, <4 x half> %a, <4 x half> 
%b) {
+; CHECK-LABEL: test_vfdot_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.4h
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, 
<4 x half> %a, <4 x half> %b)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_vfdotq_f32(<4 x float> %r, <8 x half> %a, <8 x half> 
%b) {
+; CHECK-LABEL: test_vfdotq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.8h
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, 
<8 x half> %a, <8 x half> %b)
+  ret <4 x float> %res
+}
+
+define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x 
half> %b) {
+; CHECK-LABEL: test_vfdot_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.2h[0]
+; CHECK-NEXT:    ret
+  %lane = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> 
zeroinitializer
+  %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, 
<4 x half> %a, <4 x half> %lane)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x 
half> %b) {
+; CHECK-LABEL: test_vfdotq_laneq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.2h[3]
+; CHECK-NEXT:    ret
+  %lane = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> <i32 3, i32 
3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, 
<8 x half> %a, <8 x half> %lane)
+  ret <4 x float> %res
+}
+
+define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x 
half> %b) {
+; CHECK-LABEL: test_vfdot_laneq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.2h[3]
+; CHECK-NEXT:    ret
+  %lane = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> <i32 3, i32 
3, i32 3, i32 3>
+  %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, 
<4 x half> %a, <4 x half> %lane)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_vfdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x 
half> %b) {
+; CHECK-LABEL: test_vfdotq_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.2h[0]
+; CHECK-NEXT:    ret
+  %lane = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> 
zeroinitializer
+  %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, 
<8 x half> %a, <8 x half> %lane)
+  ret <4 x float> %res
+}
+
+declare <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float>, <4 x 
half>, <4 x half>)
+declare <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float>, <8 x 
half>, <8 x half>)

>From 4ab762fd08a1a30855550a7d5c1dadee91a57d7e Mon Sep 17 00:00:00 2001
From: Marian Lukac <[email protected]>
Date: Wed, 1 Apr 2026 16:08:34 +0000
Subject: [PATCH 2/6] Remove undef

---
 llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll 
b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
index 9f31a241198a6..d7167c152955a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
@@ -25,7 +25,7 @@ define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x 
half> %a, <4 x half
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.2h[0]
 ; CHECK-NEXT:    ret
-  %lane = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> 
zeroinitializer
+  %lane = shufflevector <4 x half> %b, <4 x half> poison, <4 x i32> 
zeroinitializer
   %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, 
<4 x half> %a, <4 x half> %lane)
   ret <2 x float> %res
 }
@@ -35,7 +35,7 @@ define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 
x half> %a, <8 x ha
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.2h[3]
 ; CHECK-NEXT:    ret
-  %lane = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> <i32 3, i32 
3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %lane = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> <i32 3, 
i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, 
<8 x half> %a, <8 x half> %lane)
   ret <4 x float> %res
 }
@@ -45,7 +45,7 @@ define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x 
half> %a, <8 x hal
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.2h[3]
 ; CHECK-NEXT:    ret
-  %lane = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> <i32 3, i32 
3, i32 3, i32 3>
+  %lane = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> <i32 3, 
i32 3, i32 3, i32 3>
   %res = call <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> %r, 
<4 x half> %a, <4 x half> %lane)
   ret <2 x float> %res
 }
@@ -56,7 +56,7 @@ define <4 x float> @test_vfdotq_lane_f32(<4 x float> %r, <8 x 
half> %a, <4 x hal
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.2h[0]
 ; CHECK-NEXT:    ret
-  %lane = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> 
zeroinitializer
+  %lane = shufflevector <4 x half> %b, <4 x half> poison, <8 x i32> 
zeroinitializer
   %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, 
<8 x half> %a, <8 x half> %lane)
   ret <4 x float> %res
 }

>From 7f14f5eb3c854518dd6df07fffff3b41b7739628 Mon Sep 17 00:00:00 2001
From: Marian Lukac <[email protected]>
Date: Thu, 16 Apr 2026 10:20:43 +0000
Subject: [PATCH 3/6] Rename to vdot

---
 clang/include/clang/Basic/arm_neon.td         | 10 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 16 ++--
 .../CodeGen/AArch64/f16f32dot-intrinsics.c    | 96 +++++++++----------
 .../aarch64-neon-immediate-ranges/dotprod.c   | 16 ++--
 .../AArch64/aarch64-f16f32dot-intrinsics.ll   | 24 ++---
 5 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td 
b/clang/include/clang/Basic/arm_neon.td
index ed4879a3dbd24..64ee23a082370 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1904,11 +1904,11 @@ let ArchGuard = "defined(__aarch64__) || 
defined(__arm64ec__)", TargetGuard = "d
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in {
-  def VFDOT_F16 : SInst<"vfdot", "..<<", "fQf">;
-  def VFDOT_LANE_F16 : SInst<"vfdot_lane", "..<(<q)I", "fQf",
-                            [ImmCheck<3, ImmCheck0_1, 0>]>;
-  def VFDOT_LANEQ_F16 : SInst<"vfdot_laneq", "..<(<Q)I", "fQf",
-                             [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOT_F16 : SInst<"vdot", "..<<", "fQf">;
+  def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf",
+                           [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf",
+                            [ImmCheck<3, ImmCheck0_3, 0>]>;
 }
 
 // v8.2-A FP16 fused multiply-add long instructions.
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index c1731d1d8c100..9cc5fea42ae67 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7154,19 +7154,19 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
                                ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");
 
-  case NEON::BI__builtin_neon_vfdot_f32:
-  case NEON::BI__builtin_neon_vfdotq_f32: {
+  case NEON::BI__builtin_neon_vdot_f32:
+  case NEON::BI__builtin_neon_vdotq_f32: {
     llvm::Type *InputTy =
         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::Type *Tys[2] = {Ty, InputTy};
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys),
-                        Ops, "vfdot");
+                        Ops, "vdot");
   }
 
-  case NEON::BI__builtin_neon_vfdot_lane_f32:
-  case NEON::BI__builtin_neon_vfdot_laneq_f32:
-  case NEON::BI__builtin_neon_vfdotq_lane_f32:
-  case NEON::BI__builtin_neon_vfdotq_laneq_f32: {
+  case NEON::BI__builtin_neon_vdot_lane_f32:
+  case NEON::BI__builtin_neon_vdot_laneq_f32:
+  case NEON::BI__builtin_neon_vdotq_lane_f32:
+  case NEON::BI__builtin_neon_vdotq_laneq_f32: {
     llvm::FixedVectorType *InputTy =
         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::FixedVectorType *LaneTy = llvm::FixedVectorType::get(
@@ -7179,7 +7179,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::Type *Tys[2] = {Ty, InputTy};
     Ops.pop_back();
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_fdot, Tys),
-                        Ops, "vfdot");
+                        Ops, "vdot");
   }
 
   case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
diff --git a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c 
b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
index aa5fe3a056d27..7ae61460e40df 100644
--- a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
@@ -7,106 +7,106 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_f32(
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
 // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VFDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
-// CHECK-NEXT:    ret <2 x float> [[VFDOT3_I]]
+// CHECK-NEXT:    [[VDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
+// CHECK-NEXT:    ret <2 x float> [[VDOT3_I]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z14test_vfdot_f3213__Float32x2_t13__Float16x4_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z13test_vdot_f3213__Float32x2_t13__Float16x4_tS0_(
 // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VFDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
-// CHECK-CXX-NEXT:    ret <2 x float> [[VFDOT3_I]]
+// CHECK-CXX-NEXT:    [[VDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
+// CHECK-CXX-NEXT:    ret <2 x float> [[VDOT3_I]]
 //
-float32x2_t test_vfdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
-  return vfdot_f32(r, a, b);
+float32x2_t test_vdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
+  return vdot_f32(r, a, b);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_f32(
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
 // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VFDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
-// CHECK-NEXT:    ret <4 x float> [[VFDOT3_I]]
+// CHECK-NEXT:    [[VDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
+// CHECK-NEXT:    ret <4 x float> [[VDOT3_I]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z15test_vfdotq_f3213__Float32x4_t13__Float16x8_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z14test_vdotq_f3213__Float32x4_t13__Float16x8_tS0_(
 // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VFDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
-// CHECK-CXX-NEXT:    ret <4 x float> [[VFDOT3_I]]
+// CHECK-CXX-NEXT:    [[VDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VDOT3_I]]
 //
-float32x4_t test_vfdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) {
-  return vfdotq_f32(r, a, b);
+float32x4_t test_vdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) {
+  return vdotq_f32(r, a, b);
 }
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_lane_f32(
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
 // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> 
poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
-// CHECK-NEXT:    ret <2 x float> [[VFDOT2]]
+// CHECK-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <2 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z19test_vfdot_lane_f3213__Float32x2_t13__Float16x4_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z18test_vdot_lane_f3213__Float32x2_t13__Float16x4_tS0_(
 // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x 
half> poison, <4 x i32> zeroinitializer
-// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
-// CHECK-CXX-NEXT:    ret <2 x float> [[VFDOT2]]
+// CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <2 x float> [[VDOT2]]
 //
-float32x2_t test_vfdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
-  return vfdot_lane_f32(r, a, b, 0);
+float32x2_t test_vdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
+  return vdot_lane_f32(r, a, b, 0);
 }
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vfdot_laneq_f32(
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
 // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> 
poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
-// CHECK-NEXT:    ret <2 x float> [[VFDOT2]]
+// CHECK-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <2 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z20test_vfdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t(
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z19test_vdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t(
 // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x 
half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
-// CHECK-CXX-NEXT:    ret <2 x float> [[VFDOT2]]
+// CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <2 x float> [[VDOT2]]
 //
-float32x2_t test_vfdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) {
-  return vfdot_laneq_f32(r, a, b, 3);
+float32x2_t test_vdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) {
+  return vdot_laneq_f32(r, a, b, 3);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_lane_f32(
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
 // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> 
poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
-// CHECK-NEXT:    ret <4 x float> [[VFDOT2]]
+// CHECK-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <4 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z20test_vfdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t(
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z19test_vdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t(
 // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x 
half> poison, <8 x i32> zeroinitializer
-// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
-// CHECK-CXX-NEXT:    ret <4 x float> [[VFDOT2]]
+// CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VDOT2]]
 //
-float32x4_t test_vfdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) {
-  return vfdotq_lane_f32(r, a, b, 0);
+float32x4_t test_vdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) {
+  return vdotq_lane_f32(r, a, b, 0);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vfdotq_laneq_f32(
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
 // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> 
poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
-// CHECK-NEXT:    ret <4 x float> [[VFDOT2]]
+// CHECK-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-NEXT:    ret <4 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z21test_vfdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z20test_vdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_(
 // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x 
half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-CXX-NEXT:    [[VFDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
-// CHECK-CXX-NEXT:    ret <4 x float> [[VFDOT2]]
+// CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VDOT2]]
 //
-float32x4_t test_vfdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) {
-  return vfdotq_laneq_f32(r, a, b, 3);
+float32x4_t test_vdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) {
+  return vdotq_laneq_f32(r, a, b, 3);
 }
diff --git a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
index 7f1947e5d9d07..e3d3cec1cfe5d 100644
--- a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
+++ b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
@@ -50,23 +50,23 @@ void test_dot_product_s32(int32x2_t arg_i32x2, int8x16_t arg_i8x16, int8x8_t arg
 }
 
 void test_dot_product_f32(float32x2_t r2, float32x4_t r4, float16x4_t h4, float16x8_t h8) {
-  (void)vfdot_lane_f32(r2, h4, h4, -1);
+  (void)vdot_lane_f32(r2, h4, h4, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
-  (void)vfdot_lane_f32(r2, h4, h4, 2);
+  (void)vdot_lane_f32(r2, h4, h4, 2);
 // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
 
-  (void)vfdot_laneq_f32(r2, h4, h8, -1);
+  (void)vdot_laneq_f32(r2, h4, h8, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
-  (void)vfdot_laneq_f32(r2, h4, h8, 4);
+  (void)vdot_laneq_f32(r2, h4, h8, 4);
 // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}}
 
-  (void)vfdotq_lane_f32(r4, h8, h4, -1);
+  (void)vdotq_lane_f32(r4, h8, h4, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
-  (void)vfdotq_lane_f32(r4, h8, h4, 2);
+  (void)vdotq_lane_f32(r4, h8, h4, 2);
 // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
 
-  (void)vfdotq_laneq_f32(r4, h8, h8, -1);
+  (void)vdotq_laneq_f32(r4, h8, h8, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
-  (void)vfdotq_laneq_f32(r4, h8, h8, 4);
+  (void)vdotq_laneq_f32(r4, h8, h8, 4);
 // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}}
 }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
index d7167c152955a..44687ef5102fd 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
 ; RUN: llc -mtriple aarch64 -mattr=+f16f32dot %s -o - | FileCheck %s
 
-define <2 x float> @test_vfdot_f32(<2 x float> %r, <4 x half> %a, <4 x half> 
%b) {
-; CHECK-LABEL: test_vfdot_f32:
+define <2 x float> @test_vdot_f32(<2 x float> %r, <4 x half> %a, <4 x half> 
%b) {
+; CHECK-LABEL: test_vdot_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
@@ -10,8 +10,8 @@ define <2 x float> @test_vfdot_f32(<2 x float> %r, <4 x half> 
%a, <4 x half> %b)
   ret <2 x float> %res
 }
 
-define <4 x float> @test_vfdotq_f32(<4 x float> %r, <8 x half> %a, <8 x half> 
%b) {
-; CHECK-LABEL: test_vfdotq_f32:
+define <4 x float> @test_vdotq_f32(<4 x float> %r, <8 x half> %a, <8 x half> 
%b) {
+; CHECK-LABEL: test_vdotq_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
@@ -19,8 +19,8 @@ define <4 x float> @test_vfdotq_f32(<4 x float> %r, <8 x 
half> %a, <8 x half> %b
   ret <4 x float> %res
 }
 
-define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x 
half> %b) {
-; CHECK-LABEL: test_vfdot_lane_f32:
+define <2 x float> @test_vdot_lane_f32(<2 x float> %r, <4 x half> %a, <4 x 
half> %b) {
+; CHECK-LABEL: test_vdot_lane_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.2h[0]
@@ -30,8 +30,8 @@ define <2 x float> @test_vfdot_lane_f32(<2 x float> %r, <4 x 
half> %a, <4 x half
   ret <2 x float> %res
 }
 
-define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x 
half> %b) {
-; CHECK-LABEL: test_vfdotq_laneq_f32:
+define <4 x float> @test_vdotq_laneq_f32(<4 x float> %r, <8 x half> %a, <8 x 
half> %b) {
+; CHECK-LABEL: test_vdotq_laneq_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.2h[3]
 ; CHECK-NEXT:    ret
@@ -40,8 +40,8 @@ define <4 x float> @test_vfdotq_laneq_f32(<4 x float> %r, <8 
x half> %a, <8 x ha
   ret <4 x float> %res
 }
 
-define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x 
half> %b) {
-; CHECK-LABEL: test_vfdot_laneq_f32:
+define <2 x float> @test_vdot_laneq_f32(<2 x float> %r, <4 x half> %a, <8 x 
half> %b) {
+; CHECK-LABEL: test_vdot_laneq_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fdot v0.2s, v1.4h, v2.2h[3]
 ; CHECK-NEXT:    ret
@@ -50,8 +50,8 @@ define <2 x float> @test_vfdot_laneq_f32(<2 x float> %r, <4 x 
half> %a, <8 x hal
   ret <2 x float> %res
 }
 
-define <4 x float> @test_vfdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x 
half> %b) {
-; CHECK-LABEL: test_vfdotq_lane_f32:
+define <4 x float> @test_vdotq_lane_f32(<4 x float> %r, <8 x half> %a, <4 x 
half> %b) {
+; CHECK-LABEL: test_vdotq_lane_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-NEXT:    fdot v0.4s, v1.8h, v2.2h[0]

>From c18b5b3ba1bc01976d24eeb0034f7c5d98e9ba2b Mon Sep 17 00:00:00 2001
From: Marian Lukac <[email protected]>
Date: Thu, 23 Apr 2026 15:07:55 +0000
Subject: [PATCH 4/6] Adjust the Imm check

---
 clang/include/clang/Basic/arm_neon.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 64ee23a082370..484b905786f92 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1906,9 +1906,9 @@ let ArchGuard = "defined(__aarch64__) || 
defined(__arm64ec__)", TargetGuard = "d
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in {
   def VDOT_F16 : SInst<"vdot", "..<<", "fQf">;
   def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf",
-                           [ImmCheck<3, ImmCheck0_1, 0>]>;
+                           [ImmCheck<3, ImmCheckLaneIndex, 0>]>;
   def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf",
-                            [ImmCheck<3, ImmCheck0_3, 0>]>;
+                            [ImmCheck<3, ImmCheckLaneIndex, 0>]>;
 }
 
 // v8.2-A FP16 fused multiply-add long instructions.

>From 862d01af4dc10a48c36a4bdd9c04a408610a4851 Mon Sep 17 00:00:00 2001
From: Marian Lukac <[email protected]>
Date: Thu, 23 Apr 2026 16:18:56 +0000
Subject: [PATCH 5/6] Fix imm check

---
 clang/include/clang/Basic/arm_neon.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 484b905786f92..17914620b2cfa 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1906,9 +1906,9 @@ let ArchGuard = "defined(__aarch64__) || 
defined(__arm64ec__)", TargetGuard = "d
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in {
   def VDOT_F16 : SInst<"vdot", "..<<", "fQf">;
   def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf",
-                           [ImmCheck<3, ImmCheckLaneIndex, 0>]>;
+                           [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
   def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf",
-                            [ImmCheck<3, ImmCheckLaneIndex, 0>]>;
+                            [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
 }
 
 // v8.2-A FP16 fused multiply-add long instructions.

>From bd656a23c3c0e0ef39dfe1c5adc0b28543961263 Mon Sep 17 00:00:00 2001
From: Marian Lukac <[email protected]>
Date: Thu, 28 May 2026 10:20:19 +0000
Subject: [PATCH 6/6] rename intrinsics to correct name

---
 clang/include/clang/Basic/arm_neon.td         |  6 +--
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 12 ++---
 .../CodeGen/AArch64/f16f32dot-intrinsics.c    | 48 +++++++++----------
 .../aarch64-neon-immediate-ranges/dotprod.c   | 18 +++----
 .../AArch64/aarch64-f16f32dot-intrinsics.ll   |  5 +-
 5 files changed, 43 insertions(+), 46 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 17914620b2cfa..3bf61c1eed6aa 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1904,10 +1904,10 @@ let ArchGuard = "defined(__aarch64__) || 
defined(__arm64ec__)", TargetGuard = "d
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "f16f32dot,neon" in {
-  def VDOT_F16 : SInst<"vdot", "..<<", "fQf">;
-  def VDOT_LANE_F16 : SInst<"vdot_lane", "..<(<q)I", "fQf",
+  def VDOT_F16 : SInst<"vdot_f32", ">>..", "hQh">;
+  def VDOT_LANE_F16 : SInst<"vdot_lane_f32", ">>.qI", "hQh",
                            [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-  def VDOT_LANEQ_F16 : SInst<"vdot_laneq", "..<(<Q)I", "fQf",
+  def VDOT_LANEQ_F16 : SInst<"vdot_laneq_f32", ">>.QI", "hQh",
                             [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
 }
 
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 9cc5fea42ae67..e432996429ea5 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7154,8 +7154,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
                                ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");
 
-  case NEON::BI__builtin_neon_vdot_f32:
-  case NEON::BI__builtin_neon_vdotq_f32: {
+  case NEON::BI__builtin_neon_vdot_f32_f16:
+  case NEON::BI__builtin_neon_vdotq_f32_f16: {
     llvm::Type *InputTy =
         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::Type *Tys[2] = {Ty, InputTy};
@@ -7163,10 +7163,10 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
                         Ops, "vdot");
   }
 
-  case NEON::BI__builtin_neon_vdot_lane_f32:
-  case NEON::BI__builtin_neon_vdot_laneq_f32:
-  case NEON::BI__builtin_neon_vdotq_lane_f32:
-  case NEON::BI__builtin_neon_vdotq_laneq_f32: {
+  case NEON::BI__builtin_neon_vdot_lane_f32_f16:
+  case NEON::BI__builtin_neon_vdot_laneq_f32_f16:
+  case NEON::BI__builtin_neon_vdotq_lane_f32_f16:
+  case NEON::BI__builtin_neon_vdotq_laneq_f32_f16: {
     llvm::FixedVectorType *InputTy =
         llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::FixedVectorType *LaneTy = llvm::FixedVectorType::get(
diff --git a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
index 7ae61460e40df..763a8228680f9 100644
--- a/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/f16f32dot-intrinsics.c
@@ -7,106 +7,106 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32_f16(
 // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
 // CHECK-NEXT:    ret <2 x float> [[VDOT3_I]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z13test_vdot_f3213__Float32x2_t13__Float16x4_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z17test_vdot_f32_f1613__Float32x2_t13__Float16x4_tS0_(
 // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[VDOT3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[B]])
 // CHECK-CXX-NEXT:    ret <2 x float> [[VDOT3_I]]
 //
-float32x2_t test_vdot_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
-  return vdot_f32(r, a, b);
+float32x2_t test_vdot_f32_f16(float32x2_t r, float16x4_t a, float16x4_t b) {
+  return vdot_f32_f16(r, a, b);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32_f16(
 // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
 // CHECK-NEXT:    ret <4 x float> [[VDOT3_I]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z14test_vdotq_f3213__Float32x4_t13__Float16x8_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z18test_vdotq_f32_f1613__Float32x4_t13__Float16x8_tS0_(
 // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[VDOT3_I:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[B]])
 // CHECK-CXX-NEXT:    ret <4 x float> [[VDOT3_I]]
 //
-float32x4_t test_vdotq_f32(float32x4_t r, float16x8_t a, float16x8_t b) {
-  return vdotq_f32(r, a, b);
+float32x4_t test_vdotq_f32_f16(float32x4_t r, float16x8_t a, float16x8_t b) {
+  return vdotq_f32_f16(r, a, b);
 }
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32_f16(
 // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> 
poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
 // CHECK-NEXT:    ret <2 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z18test_vdot_lane_f3213__Float32x2_t13__Float16x4_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z22test_vdot_lane_f32_f1613__Float32x2_t13__Float16x4_tS0_(
 // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x 
half> poison, <4 x i32> zeroinitializer
 // CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
 // CHECK-CXX-NEXT:    ret <2 x float> [[VDOT2]]
 //
-float32x2_t test_vdot_lane_f32(float32x2_t r, float16x4_t a, float16x4_t b) {
-  return vdot_lane_f32(r, a, b, 0);
+float32x2_t test_vdot_lane_f32_f16(float32x2_t r, float16x4_t a, float16x4_t b) {
+  return vdot_lane_f32_f16(r, a, b, 0);
 }
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32_f16(
 // CHECK-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> 
poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
 // CHECK-NEXT:    ret <2 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z19test_vdot_laneq_f3213__Float32x2_t13__Float16x4_t13__Float16x8_t(
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z23test_vdot_laneq_f32_f1613__Float32x2_t13__Float16x4_t13__Float16x8_t(
 // CHECK-CXX-SAME: <2 x float> noundef [[R:%.*]], <4 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x 
half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float> [[R]], <4 x half> [[A]], <4 x 
half> [[LANE]])
 // CHECK-CXX-NEXT:    ret <2 x float> [[VDOT2]]
 //
-float32x2_t test_vdot_laneq_f32(float32x2_t r, float16x4_t a, float16x8_t b) {
-  return vdot_laneq_f32(r, a, b, 3);
+float32x2_t test_vdot_laneq_f32_f16(float32x2_t r, float16x4_t a, float16x8_t b) {
+  return vdot_laneq_f32_f16(r, a, b, 3);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32_f16(
 // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <4 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> 
poison, <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
 // CHECK-NEXT:    ret <4 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z19test_vdotq_lane_f3213__Float32x4_t13__Float16x8_t13__Float16x4_t(
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z23test_vdotq_lane_f32_f1613__Float32x4_t13__Float16x8_t13__Float16x4_t(
 // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x 
half> poison, <8 x i32> zeroinitializer
 // CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
 // CHECK-CXX-NEXT:    ret <4 x float> [[VDOT2]]
 //
-float32x4_t test_vdotq_lane_f32(float32x4_t r, float16x8_t a, float16x4_t b) {
-  return vdotq_lane_f32(r, a, b, 0);
+float32x4_t test_vdotq_lane_f32_f16(float32x4_t r, float16x8_t a, float16x4_t b) {
+  return vdotq_lane_f32_f16(r, a, b, 0);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32_f16(
 // CHECK-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef [[A:%.*]], <8 
x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> 
poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
 // CHECK-NEXT:    ret <4 x float> [[VDOT2]]
 //
-// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z20test_vdotq_laneq_f3213__Float32x4_t13__Float16x8_tS0_(
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z24test_vdotq_laneq_f32_f1613__Float32x4_t13__Float16x8_tS0_(
 // CHECK-CXX-SAME: <4 x float> noundef [[R:%.*]], <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x 
half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-CXX-NEXT:    [[VDOT2:%.*]] = call <4 x float> 
@llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> [[R]], <8 x half> [[A]], <8 x 
half> [[LANE]])
 // CHECK-CXX-NEXT:    ret <4 x float> [[VDOT2]]
 //
-float32x4_t test_vdotq_laneq_f32(float32x4_t r, float16x8_t a, float16x8_t b) {
-  return vdotq_laneq_f32(r, a, b, 3);
+float32x4_t test_vdotq_laneq_f32_f16(float32x4_t r, float16x8_t a, float16x8_t b) {
+  return vdotq_laneq_f32_f16(r, a, b, 3);
 }
diff --git a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
index e3d3cec1cfe5d..c99d69d7caf41 100644
--- a/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
+++ b/clang/test/Sema/aarch64-neon-immediate-ranges/dotprod.c
@@ -49,24 +49,24 @@ void test_dot_product_s32(int32x2_t arg_i32x2, int8x16_t arg_i8x16, int8x8_t arg
 
 }
 
-void test_dot_product_f32(float32x2_t r2, float32x4_t r4, float16x4_t h4, float16x8_t h8) {
-  (void)vdot_lane_f32(r2, h4, h4, -1);
+void test_dot_product_f32_f16(float32x2_t r2, float32x4_t r4, float16x4_t h4, float16x8_t h8) {
+  (void)vdot_lane_f32_f16(r2, h4, h4, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
-  (void)vdot_lane_f32(r2, h4, h4, 2);
+  (void)vdot_lane_f32_f16(r2, h4, h4, 2);
 // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
 
-  (void)vdot_laneq_f32(r2, h4, h8, -1);
+  (void)vdot_laneq_f32_f16(r2, h4, h8, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
-  (void)vdot_laneq_f32(r2, h4, h8, 4);
+  (void)vdot_laneq_f32_f16(r2, h4, h8, 4);
 // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}}
 
-  (void)vdotq_lane_f32(r4, h8, h4, -1);
+  (void)vdotq_lane_f32_f16(r4, h8, h4, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
-  (void)vdotq_lane_f32(r4, h8, h4, 2);
+  (void)vdotq_lane_f32_f16(r4, h8, h4, 2);
 // expected-error@-1 {{argument value 2 is outside the valid range [0, 1]}}
 
-  (void)vdotq_laneq_f32(r4, h8, h8, -1);
+  (void)vdotq_laneq_f32_f16(r4, h8, h8, -1);
 // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
-  (void)vdotq_laneq_f32(r4, h8, h8, 4);
+  (void)vdotq_laneq_f32_f16(r4, h8, h8, 4);
 // expected-error@-1 {{argument value 4 is outside the valid range [0, 3]}}
 }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
index 44687ef5102fd..7da0076ea9b4b 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-f16f32dot-intrinsics.ll
@@ -59,7 +59,4 @@ define <4 x float> @test_vdotq_lane_f32(<4 x float> %r, <8 x 
half> %a, <4 x half
   %lane = shufflevector <4 x half> %b, <4 x half> poison, <8 x i32> 
zeroinitializer
   %res = call <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float> %r, 
<8 x half> %a, <8 x half> %lane)
   ret <4 x float> %res
-}
-
-declare <2 x float> @llvm.aarch64.neon.fdot.v2f32.v4f16(<2 x float>, <4 x 
half>, <4 x half>)
-declare <4 x float> @llvm.aarch64.neon.fdot.v4f32.v8f16(<4 x float>, <8 x 
half>, <8 x half>)
+}
\ No newline at end of file

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [AArch64] Add intrinsic support for Fdot instr. (PR #189987)

Reply via email to