[clang] [CIR][AArch64] Lower vfma lane builtins (PR #188190)

via cfe-commits Tue, 24 Mar 2026 00:33:10 -0700

llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clangir

Author: Yair Ben Avraham (yairbenavraham)

<details>
<summary>Changes</summary>

This PR implements the AArch64 NEON ClangIR lowering for the vfma lane/laneq 
builtins and adds CIR-enabled regression tests.

Covered scope:
  - vector lane/laneq forms
  - scalar lane/laneq forms
  - includes the vfmaq_laneq_v family called out in #<!-- -->185382

Validation:
  - clean build from scratch
  - post-build sanity check
  - focused llvm-lit validation for the touched AArch64 NEON tests

Part of #<!-- -->185382 

---
Full diff: https://github.com/llvm/llvm-project/pull/188190.diff


6 Files Affected:

- (modified) clang/include/clang/CIR/Dialect/IR/CMakeLists.txt (+1-1) 
- (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp (+53-14) 
- (modified) clang/lib/CIR/Lowering/CMakeLists.txt (+3) 
- (modified) clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt (+1) 
- (added) clang/test/CodeGen/AArch64/neon/vfma-lane.c (+136) 
- (added) clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c (+77) 


``````````diff
diff --git a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt 
b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
index 870f9e3f5d052..1388e5bc612f2 100644
--- a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
+++ b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
@@ -27,5 +27,5 @@ clang_tablegen(CIRLowering.inc -gen-cir-lowering
 set(LLVM_TARGET_DEFINITIONS CIRTypeConstraints.td)
 mlir_tablegen(CIRTypeConstraints.h.inc -gen-type-constraint-decls)
 mlir_tablegen(CIRTypeConstraints.cpp.inc -gen-type-constraint-defs)
-add_public_tablegen_target(MLIRCIRTypeConstraintsIncGen)
+add_mlir_generic_tablegen_target(MLIRCIRTypeConstraintsIncGen)
 add_dependencies(mlir-headers MLIRCIRTypeConstraintsIncGen)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 5d7b8d839fa84..26560b2ab3447 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -801,11 +801,10 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, 
NeonTypeFlags typeFlags,
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
     [[fallthrough]];
   case NeonTypeFlags::Float16:
-    if (hasLegalHalfType)
+    if (!hasLegalHalfType)
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    else
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    [[fallthrough]];
+    return cir::VectorType::get(cgf->getCIRGenModule().fP16Ty,
+                                v1Ty ? 1 : (4 << isQuad));
   case NeonTypeFlags::Int32:
     return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
                                                        : cgf->sInt32Ty,
@@ -2848,6 +2847,23 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
+  switch (builtinID) {
+  case NEON::BI__builtin_neon_vfmah_lane_f16:
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmah_laneq_f16:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]);
+    mlir::Type scalarTy = convertType(expr->getType());
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy,
+                                           fmaOps);
+  }
+  default:
+    break;
+  }
+
   cir::VectorType ty = getNeonType(this, type, loc);
   if (!ty)
     return nullptr;
@@ -2859,16 +2875,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return std::nullopt;
   case NEON::BI__builtin_neon_vbsl_v:
   case NEON::BI__builtin_neon_vbslq_v:
-  case NEON::BI__builtin_neon_vfma_lane_v:
-  case NEON::BI__builtin_neon_vfmaq_lane_v:
-  case NEON::BI__builtin_neon_vfma_laneq_v:
-  case NEON::BI__builtin_neon_vfmaq_laneq_v:
-  case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
-  case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
-  case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64:
   case NEON::BI__builtin_neon_vmull_v:
   case NEON::BI__builtin_neon_vmax_v:
   case NEON::BI__builtin_neon_vmaxq_v:
@@ -2886,6 +2892,39 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     if (cir::isFPOrVectorOfFPType(ty))
       intrName = "aarch64.neon.fabd";
     return emitNeonCall(cgm, builder, {ty, ty}, ops, intrName, ty, loc);
+  case NEON::BI__builtin_neon_vfma_lane_v:
+  case NEON::BI__builtin_neon_vfmaq_lane_v:
+  case NEON::BI__builtin_neon_vfma_laneq_v:
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    mlir::Value addend = ops[0];
+    mlir::Value multiplicand = ops[1];
+    mlir::Value laneSource = ops[2];
+    auto vecTy = mlir::cast<cir::VectorType>(ty);
+    auto elemTy = vecTy.getElementType();
+    auto numElts = vecTy.getSize();
+
+    if (addend.getType() != ty)
+      addend = builder.createBitcast(loc, addend, ty);
+    if (multiplicand.getType() != ty)
+      multiplicand = builder.createBitcast(loc, multiplicand, ty);
+
+    cir::VectorType sourceTy = ty;
+    if (builtinID == NEON::BI__builtin_neon_vfmaq_lane_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts / 2);
+    else if (builtinID == NEON::BI__builtin_neon_vfma_laneq_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts * 2);
+
+    if (laneSource.getType() != sourceTy)
+      laneSource = builder.createBitcast(loc, laneSource, sourceTy);
+
+    int64_t lane =
+        expr->getArg(3)->EvaluateKnownConstInt(getContext()).getSExtValue();
+    llvm::SmallVector<int64_t> mask(numElts, lane);
+    mlir::Value splat = builder.createVecShuffle(loc, laneSource, mask);
+
+    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, splat, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vpadal_v:
   case NEON::BI__builtin_neon_vpadalq_v:
   case NEON::BI__builtin_neon_vpmin_v:
diff --git a/clang/lib/CIR/Lowering/CMakeLists.txt 
b/clang/lib/CIR/Lowering/CMakeLists.txt
index 28ec3c551018c..77d28ef72d11d 100644
--- a/clang/lib/CIR/Lowering/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/CMakeLists.txt
@@ -9,6 +9,9 @@ add_clang_library(clangCIRLoweringCommon
   CIRPasses.cpp
   LoweringHelpers.cpp
 
+  DEPENDS
+  MLIRCIRTypeConstraintsIncGen
+
   LINK_LIBS
   clangCIR
   ${dialect_libs}
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt 
b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
index c7467fe40ba30..5b197ddca12c0 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
@@ -13,6 +13,7 @@ add_clang_library(clangCIRLoweringDirectToLLVM
   MLIRCIREnumsGen
   MLIRCIROpsIncGen
   MLIRCIROpInterfacesIncGen
+  MLIRCIRTypeConstraintsIncGen
 
   LINK_LIBS
   clangCIRLoweringCommon
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-lane.c 
b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
new file mode 100644
index 0000000000000..955ab411793b9
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
@@ -0,0 +1,136 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:   -target-feature +fullfp16 -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfma_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+  return vfma_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
+  return vfmaq_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
+  return vfma_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  return vfmaq_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  return vfma_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  return vfmaq_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <1 x double> @llvm.fma.v1f64(
+// CIR-LABEL: @test_vfma_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+  return vfma_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  return vfmaq_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f64(
+// LLVM: @llvm.fma
+// CIR-LABEL: @test_vfma_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
+  return vfma_laneq_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f64(
+// LLVM: shufflevector <2 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  return vfmaq_laneq_f64(a, b, v, 1);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c 
b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
new file mode 100644
index 0000000000000..53fc9761e01a0
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
@@ -0,0 +1,77 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:   -target-feature +fullfp16 -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfmah_lane_f16(
+// LLVM: extractelement <4 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_lane_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
+  return vfmah_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmah_laneq_f16(
+// LLVM: extractelement <8 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_laneq_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
+  return vfmah_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmas_lane_f32(
+// LLVM: extractelement <2 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_lane_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// LLVM-LABEL: @test_vfmas_laneq_f32(
+// LLVM: extractelement <4 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_laneq_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmad_lane_f64(
+// LLVM: extractelement <1 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_lane_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
+  return vfmad_lane_f64(a, b, c, 0);
+}
+
+// LLVM-LABEL: @test_vfmad_laneq_f64(
+// LLVM: extractelement <2 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_laneq_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+  return vfmad_laneq_f64(a, b, c, 1);
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/188190
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR][AArch64] Lower vfma lane builtins (PR #188190)

Reply via email to