[clang] [CIR][AArch64] Lower vfmaq_lane_v and vfma_laneq_v (PR #197084)

Yair Ben Avraham via cfe-commits Sun, 17 May 2026 07:15:04 -0700

https://github.com/yairbenavraham updated 
https://github.com/llvm/llvm-project/pull/197084


>From dadd87ced2e7b534fd7b581796f7634981a46be1 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 12 May 2026 05:25:12 +0300
Subject: [PATCH 1/4] [CIR][AArch64] Lower vfmaq_lane_v builtin

Lower BI__builtin_neon_vfmaq_lane_v by bitcasting the addend and
multiplicand, splatting the selected lane operand, and emitting the fma
LLVM intrinsic through the shared constrained-call helper.

Move the existing vfmaq_lane tests into the CIR-enabled neon
fused-multiply files and remove the duplicated legacy coverage. The
non-q lane and laneq variants remain explicit NYI cases.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 16 ++++-
 clang/test/CodeGen/AArch64/neon-2velem.c      | 58 -----------------
 .../AArch64/neon/fused-multiple-fullfp16.c    | 25 +++++++-
 .../CodeGen/AArch64/neon/fused-multiply.c     | 64 ++++++++++++++++++-
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 20 ------
 5 files changed, 101 insertions(+), 82 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index c142b69f6be6e..346dd82580198 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2567,7 +2567,21 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return builder.createBitcast(ops[0], ty);
   }
   case NEON::BI__builtin_neon_vfma_lane_v:
-  case NEON::BI__builtin_neon_vfmaq_lane_v:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmaq_lane_v: {
+    mlir::Value addend = builder.createBitcast(ops[0], ty);
+    mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
+    cir::VectorType sourceTy =
+        cir::VectorType::get(ty.getElementType(), ty.getSize() / 2);
+    mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
+    laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());
+
+    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, laneSource, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vfma_laneq_v:
   case NEON::BI__builtin_neon_vfmaq_laneq_v:
   case NEON::BI__builtin_neon_vfmah_lane_f16:
diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c 
b/clang/test/CodeGen/AArch64/neon-2velem.c
index 2bc7212cde9f8..eab6452196956 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -424,25 +424,6 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t 
b, float32x2_t v) {
   return vfma_lane_f32(a, b, v, 1);
 }
 
-// CHECK-LABEL: @test_vfmaq_lane_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
-//
-float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  return vfmaq_lane_f32(a, b, v, 1);
-}
-
 // CHECK-LABEL: @test_vfma_laneq_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -561,26 +542,6 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, 
float32x4_t b, float32x4_t v) {
   return vfmsq_laneq_f32(a, b, v, 3);
 }
 
-// CHECK-LABEL: @test_vfmaq_lane_f64(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> 
[[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x 
double> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x 
double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x double> [[FMLA2]]
-//
-float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  return vfmaq_lane_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfmaq_laneq_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
@@ -2567,25 +2528,6 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, 
float32x2_t b, float32x2_t v) {
   return vfma_lane_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: @test_vfmaq_lane_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
-//
-float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) 
{
-  return vfmaq_lane_f32(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfma_laneq_f32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index af9330865796d..13473e374f3c3 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -12,8 +12,8 @@
 // This file contains fullfp16 tests that were originally located in:
 //  * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
-// This file currently covers the f16 wrapper that lowers through
-// BI__builtin_neon_vfmaq_v.
+// This file currently covers the f16 wrappers that lower through
+// BI__builtin_neon_vfmaq_v and BI__builtin_neon_vfmaq_lane_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * 
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2
@@ -45,3 +45,24 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
 // LLVM-NEXT: ret <8 x half> [[FMA]]
   return vfmaq_f16(a, b, c);
 }
+
+// LLVM-LABEL: @test_vfmaq_lane_f16(
+// CIR-LABEL: @test_vfmaq_lane_f16(
+float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
+                                 float16x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.f16>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, 
#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : 
!s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i16> [[C_I]] to <8 x i8>
+// LLVM:      [[C_CAST:%.*]] = bitcast <8 x i8> [[C_BYTES]] to <4 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C_CAST]], <4 x half> 
{{.*}}, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// LLVM:      [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
%{{.*}}, <8 x half> [[LANE]], <8 x half> %{{.*}})
+// LLVM:      ret <8 x half> [[FMA]]
+  return vfmaq_lane_f16(a, b, c, 3);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 2501f54fb5427..142e736318241 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -13,7 +13,7 @@
 //  * clang/test/CodeGen/AArch64/neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f32/f64 wrappers that lower through
-// BI__builtin_neon_vfmaq_v.
+// BI__builtin_neon_vfmaq_v and BI__builtin_neon_vfmaq_lane_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * 
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
@@ -65,3 +65,65 @@ float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, 
float64x2_t c) {
 // LLVM-NEXT: ret <2 x double> [[FMA]]
   return vfmaq_f64(a, b, c);
 }
+
+// LLVM-LABEL: @test_vfmaq_lane_f32(
+// CIR-LABEL: @test_vfmaq_lane_f32(
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x 
!cir.float>) -> !cir.vector<4 x !cir.float>
+
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x float> [[V]] to <2 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i32> [[V_I]] to <8 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <2 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V_CAST]], <2 x float> 
{{.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// LLVM:      [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
%{{.*}}, <4 x float> [[LANE]], <4 x float> %{{.*}})
+// LLVM:      ret <4 x float> [[FMA]]
+  return vfmaq_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f64(
+// CIR-LABEL: @test_vfmaq_lane_f64(
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<1 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x 
!cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x 
!cir.double>) -> !cir.vector<2 x !cir.double>
+
+// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <1 x double> [[V]] to i64
+// LLVM-NEXT: [[V_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[V_I]], 
i32 0
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <1 x i64> [[V_INSERT]] to <8 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <1 x double>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V_CAST]], <1 x 
double> {{.*}}, <2 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> 
%{{.*}}, <2 x double> [[LANE]], <2 x double> %{{.*}})
+// LLVM:      ret <2 x double> [[FMA]]
+  return vfmaq_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f32_0(
+// CIR-LABEL: @test_vfmaq_lane_f32_0(
+float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b,
+                                   float32x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, 
#cir.int<0> : !s32i] : !cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x 
!cir.float>) -> !cir.vector<4 x !cir.float>
+
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x float> [[V]] to <2 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i32> [[V_I]] to <8 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <2 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V_CAST]], <2 x float> 
{{.*}}, <4 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
%{{.*}}, <4 x float> [[LANE]], <4 x float> %{{.*}})
+// LLVM:      ret <4 x float> [[FMA]]
+  return vfmaq_lane_f32(a, b, v, 0);
+}
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index ff1c206fc6350..f1d5891052e8f 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1681,26 +1681,6 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, 
float16x4_t b, float16x4_t c) {
   return vfma_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> 
[[TMP6]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
-// CHECK-NEXT:    ret <8 x half> [[FMLA2]]
-//
-float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
-  return vfmaq_lane_f16(a, b, c, 3);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

>From e02e5593fda9347b8b278261776e27cd2c8cfc4b Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Wed, 13 May 2026 04:50:32 +0300
Subject: [PATCH 2/4] [CIR][AArch64] Lower vfma_laneq_v

Handle BI__builtin_neon_vfma_laneq_v in CIRGen by selecting the
requested lane from the wide multiplier operand and emitting an fma call.
Keep scalar lane/laneq wrappers outside this patch as explicit NYI cases.

Move existing ACLE coverage for vfma_laneq_f16/f32/f64 into the AArch64
neon CIR tests, covering direct LLVM, CIR-to-LLVM, and CIR output
together.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 29 +++++++-
 clang/test/CodeGen/AArch64/neon-2velem.c      | 38 -----------
 .../AArch64/neon-scalar-x-indexed-elem.c      | 23 -------
 .../AArch64/neon/fused-multiple-fullfp16.c    | 24 ++++++-
 .../CodeGen/AArch64/neon/fused-multiply.c     | 68 ++++++++++++++++++-
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 20 ------
 6 files changed, 118 insertions(+), 84 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 346dd82580198..76fb785b19e77 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2582,7 +2582,34 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, laneSource, addend};
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
-  case NEON::BI__builtin_neon_vfma_laneq_v:
+  case NEON::BI__builtin_neon_vfma_laneq_v: {
+    if (ty.getElementType() == cgm.doubleTy) {
+      mlir::Value addend = builder.createBitcast(ops[0], cgm.doubleTy);
+      mlir::Value multiplicand = builder.createBitcast(ops[1], cgm.doubleTy);
+      cir::VectorType sourceTy = cir::VectorType::get(cgm.doubleTy, 2);
+      mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
+      laneSource = builder.createExtractElement(
+          loc, laneSource,
+          static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+      llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, laneSource,
+                                               addend};
+      return builder.createBitcast(
+          emitCallMaybeConstrainedBuiltin(builder, loc, "fma", cgm.doubleTy,
+                                          fmaOps),
+          ty);
+    }
+
+    mlir::Value addend = builder.createBitcast(ops[0], ty);
+    mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
+    cir::VectorType sourceTy =
+        cir::VectorType::get(ty.getElementType(), ty.getSize() * 2);
+    mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
+    laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());
+
+    llvm::SmallVector<mlir::Value> fmaOps = {laneSource, multiplicand, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmaq_laneq_v:
   case NEON::BI__builtin_neon_vfmah_lane_f16:
   case NEON::BI__builtin_neon_vfmas_lane_f32:
diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c 
b/clang/test/CodeGen/AArch64/neon-2velem.c
index eab6452196956..89fdb979d8a98 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -424,25 +424,6 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t 
b, float32x2_t v) {
   return vfma_lane_f32(a, b, v, 1);
 }
 
-// CHECK-LABEL: @test_vfma_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x 
float> [[TMP8]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  return vfma_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfmaq_laneq_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
@@ -2528,25 +2509,6 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, 
float32x2_t b, float32x2_t v) {
   return vfma_lane_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: @test_vfma_laneq_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x 
float> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) 
{
-  return vfma_laneq_f32(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfmaq_laneq_f32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c 
b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 9b98126500444..b464bccdbf9ec 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -240,29 +240,6 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t 
b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> 
[[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> 
[[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-// CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], 
double [[EXTRACT]], double [[TMP6]])
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP10]]
-//
-float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
-  return vfma_laneq_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 13473e374f3c3..525a3c54a5673 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -13,7 +13,8 @@
 //  * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f16 wrappers that lower through
-// BI__builtin_neon_vfmaq_v and BI__builtin_neon_vfmaq_lane_v.
+// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and
+// BI__builtin_neon_vfma_laneq_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * 
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2
@@ -66,3 +67,24 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
 // LLVM:      ret <8 x half> [[FMA]]
   return vfmaq_lane_f16(a, b, c, 3);
 }
+
+// LLVM-LABEL: @test_vfma_laneq_f16(
+// CIR-LABEL: @test_vfma_laneq_f16(
+float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
+                                 float16x8_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<4 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i16> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i16> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM:      [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[LANE]], <4 x half> %{{.*}}, <4 x half> %{{.*}})
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_laneq_f16(a, b, c, 7);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 142e736318241..7a08ad6b1716a 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -13,7 +13,8 @@
 //  * clang/test/CodeGen/AArch64/neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f32/f64 wrappers that lower through
-// BI__builtin_neon_vfmaq_v and BI__builtin_neon_vfmaq_lane_v.
+// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and
+// BI__builtin_neon_vfma_laneq_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * 
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
@@ -127,3 +128,68 @@ float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, 
float32x4_t b,
 // LLVM:      ret <4 x float> [[FMA]]
   return vfmaq_lane_f32(a, b, v, 0);
 }
+
+// LLVM-LABEL: @test_vfma_laneq_f32(
+// CIR-LABEL: @test_vfma_laneq_f32(
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x 
!cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <4 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i32> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i32> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> 
{{.*}}, <2 x i32> <i32 3, i32 3>
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[LANE]], <2 x float> %{{.*}}, <2 x float> %{{.*}})
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f64(
+// CIR-LABEL: @test_vfma_laneq_f64(
+float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b,
+                                 float64x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.double, !cir.double, !cir.double) -> !cir.double
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <2 
x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <1 x double> [[A]] to i64
+// LLVM-NEXT: [[A_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[A_I]], 
i32 0
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <1 x double> [[B]] to i64
+// LLVM-NEXT: [[B_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[B_I]], 
i32 0
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <1 x i64> [[A_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <1 x i64> [[B_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double>
+// LLVM-NEXT: [[LANE:%.*]] = extractelement <2 x double> [[V_CAST]], 
i{{32|64}} 0
+// LLVM:      [[FMA:%.*]] = call double @llvm.fma.f64(double %{{.*}}, double 
[[LANE]], double %{{.*}})
+// LLVM:      [[RESULT:%.*]] = bitcast double [[FMA]] to <1 x double>
+// LLVM:      ret <1 x double> [[RESULT]]
+  return vfma_laneq_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f32_0(
+// CIR-LABEL: @test_vfma_laneq_f32_0(
+float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b,
+                                   float32x4_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x 
!cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <4 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i32> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i32> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> 
{{.*}}, <2 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[LANE]], <2 x float> %{{.*}}, <2 x float> %{{.*}})
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_laneq_f32(a, b, v, 0);
+}
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index f1d5891052e8f..e8f1eead2a0d5 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1681,26 +1681,6 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, 
float16x4_t b, float16x4_t c) {
   return vfma_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> 
[[TMP8]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]])
-// CHECK-NEXT:    ret <4 x half> [[TMP9]]
-//
-float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
-  return vfma_laneq_f16(a, b, c, 7);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

>From bc01de0b8a88cd2bf68ddfe63fc29418c7eb4166 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Thu, 14 May 2026 06:23:51 +0300
Subject: [PATCH 3/4] [CIR][AArch64] Address vfma lane review

Add comments explaining the vfma_laneq_f64 scalar fma special case and
the float64x2_t lane source width.

Remove the duplicate vfmaq_lane_f32 lane-zero test now that the lane-one
case covers the splat lowering.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  3 ++
 .../AArch64/neon/fused-multiple-fullfp16.c    |  6 ++--
 .../CodeGen/AArch64/neon/fused-multiply.c     | 36 +++----------------
 3 files changed, 10 insertions(+), 35 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 76fb785b19e77..af2472b34949f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2583,9 +2583,12 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
   case NEON::BI__builtin_neon_vfma_laneq_v: {
+    // v1f64 fma should be mapped to Neon scalar f64 fma.
     if (ty.getElementType() == cgm.doubleTy) {
       mlir::Value addend = builder.createBitcast(ops[0], cgm.doubleTy);
       mlir::Value multiplicand = builder.createBitcast(ops[1], cgm.doubleTy);
+      // The laneq source operand is float64x2_t, so the source vector has two
+      // double lanes.
       cir::VectorType sourceTy = cir::VectorType::get(cgm.doubleTy, 2);
       mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
       laneSource = builder.createExtractElement(
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 525a3c54a5673..944d0322f4d53 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -47,8 +47,7 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
   return vfmaq_f16(a, b, c);
 }
 
-// LLVM-LABEL: @test_vfmaq_lane_f16(
-// CIR-LABEL: @test_vfmaq_lane_f16(
+// ALL-LABEL: @test_vfmaq_lane_f16(
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
                                  float16x4_t c) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.f16>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, 
#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : 
!s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.f16>
@@ -68,8 +67,7 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
-// LLVM-LABEL: @test_vfma_laneq_f16(
-// CIR-LABEL: @test_vfma_laneq_f16(
+// ALL-LABEL: @test_vfma_laneq_f16(
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
                                  float16x8_t c) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<4 x !cir.f16>
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 7a08ad6b1716a..183b906d05a0c 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -67,8 +67,7 @@ float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, 
float64x2_t c) {
   return vfmaq_f64(a, b, c);
 }
 
-// LLVM-LABEL: @test_vfmaq_lane_f32(
-// CIR-LABEL: @test_vfmaq_lane_f32(
+// ALL-LABEL: @test_vfmaq_lane_f32(
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x 
!cir.float>) -> !cir.vector<4 x !cir.float>
@@ -87,8 +86,7 @@ float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, 
float32x2_t v) {
   return vfmaq_lane_f32(a, b, v, 1);
 }
 
-// LLVM-LABEL: @test_vfmaq_lane_f64(
-// CIR-LABEL: @test_vfmaq_lane_f64(
+// ALL-LABEL: @test_vfmaq_lane_f64(
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<1 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x 
!cir.double>
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x 
!cir.double>) -> !cir.vector<2 x !cir.double>
@@ -108,29 +106,7 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t 
b, float64x1_t v) {
   return vfmaq_lane_f64(a, b, v, 0);
 }
 
-// LLVM-LABEL: @test_vfmaq_lane_f32_0(
-// CIR-LABEL: @test_vfmaq_lane_f32_0(
-float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b,
-                                   float32x2_t v) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, 
#cir.int<0> : !s32i] : !cir.vector<4 x !cir.float>
-// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x 
!cir.float>) -> !cir.vector<4 x !cir.float>
-
-// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
-// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x float> [[V]] to <2 x i32>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i32> [[V_I]] to <8 x i8>
-// LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <2 x float>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V_CAST]], <2 x float> 
{{.*}}, <4 x i32> zeroinitializer
-// LLVM:      [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
%{{.*}}, <4 x float> [[LANE]], <4 x float> %{{.*}})
-// LLVM:      ret <4 x float> [[FMA]]
-  return vfmaq_lane_f32(a, b, v, 0);
-}
-
-// LLVM-LABEL: @test_vfma_laneq_f32(
-// CIR-LABEL: @test_vfma_laneq_f32(
+// ALL-LABEL: @test_vfma_laneq_f32(
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x 
!cir.float>
 // CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
@@ -149,8 +125,7 @@ float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t 
b, float32x4_t v) {
   return vfma_laneq_f32(a, b, v, 3);
 }
 
-// LLVM-LABEL: @test_vfma_laneq_f64(
-// CIR-LABEL: @test_vfma_laneq_f64(
+// ALL-LABEL: @test_vfma_laneq_f64(
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b,
                                  float64x2_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.double>
@@ -173,8 +148,7 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t 
b,
   return vfma_laneq_f64(a, b, v, 0);
 }
 
-// LLVM-LABEL: @test_vfma_laneq_f32_0(
-// CIR-LABEL: @test_vfma_laneq_f32_0(
+// ALL-LABEL: @test_vfma_laneq_f32_0(
 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b,
                                    float32x4_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x 
!cir.float>

>From 187116e8b280b4265b170c9d8b5fb9d0c050d395 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Fri, 15 May 2026 12:17:45 +0300
Subject: [PATCH 4/4] [CIR][AArch64] Document vfma lane widths

Add comments explaining the source vector lane counts used by vfmaq_lane_v
and vfma_laneq_v lowering.
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index af2472b34949f..23a2bcab7798a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2574,6 +2574,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vfmaq_lane_v: {
     mlir::Value addend = builder.createBitcast(ops[0], ty);
     mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
+    // The lane source operand is the non-quad vector, so it has half as many
+    // lanes as the quad result vector.
     cir::VectorType sourceTy =
         cir::VectorType::get(ty.getElementType(), ty.getSize() / 2);
     mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
@@ -2605,6 +2607,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
 
     mlir::Value addend = builder.createBitcast(ops[0], ty);
     mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
+    // The laneq source operand is the quad vector, so it has twice as many
+    // lanes as the non-quad result vector.
     cir::VectorType sourceTy =
         cir::VectorType::get(ty.getElementType(), ty.getSize() * 2);
     mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR][AArch64] Lower vfmaq_lane_v and vfma_laneq_v (PR #197084)

Reply via email to