llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clangir Author: Yair Ben Avraham (yairbenavraham) <details> <summary>Changes</summary> This PR implements the AArch64 NEON ClangIR lowering for the vfma lane/laneq builtins and adds CIR-enabled regression tests. Covered scope: - vector lane/laneq forms - scalar lane/laneq forms - includes the vfmaq_laneq_v family called out in #<!-- -->185382 Validation: - clean build from scratch - post-build sanity check - focused llvm-lit validation for the touched AArch64 NEON tests Part of #<!-- -->185382 --- Full diff: https://github.com/llvm/llvm-project/pull/188190.diff 6 Files Affected: - (modified) clang/include/clang/CIR/Dialect/IR/CMakeLists.txt (+1-1) - (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp (+53-14) - (modified) clang/lib/CIR/Lowering/CMakeLists.txt (+3) - (modified) clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt (+1) - (added) clang/test/CodeGen/AArch64/neon/vfma-lane.c (+136) - (added) clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c (+77) ``````````diff diff --git a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt index 870f9e3f5d052..1388e5bc612f2 100644 --- a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt +++ b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt @@ -27,5 +27,5 @@ clang_tablegen(CIRLowering.inc -gen-cir-lowering set(LLVM_TARGET_DEFINITIONS CIRTypeConstraints.td) mlir_tablegen(CIRTypeConstraints.h.inc -gen-type-constraint-decls) mlir_tablegen(CIRTypeConstraints.cpp.inc -gen-type-constraint-defs) -add_public_tablegen_target(MLIRCIRTypeConstraintsIncGen) +add_mlir_generic_tablegen_target(MLIRCIRTypeConstraintsIncGen) add_dependencies(mlir-headers MLIRCIRTypeConstraintsIncGen) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 5d7b8d839fa84..26560b2ab3447 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -801,11 +801,10 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags, cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16")); [[fallthrough]]; case NeonTypeFlags::Float16: - if (hasLegalHalfType) + if (!hasLegalHalfType) cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16")); - else - cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16")); - [[fallthrough]]; + return cir::VectorType::get(cgf->getCIRGenModule().fP16Ty, + v1Ty ? 1 : (4 << isQuad)); case NeonTypeFlags::Int32: return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty : cgf->sInt32Ty, @@ -2848,6 +2847,23 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, return mlir::Value{}; } + switch (builtinID) { + case NEON::BI__builtin_neon_vfmah_lane_f16: + case NEON::BI__builtin_neon_vfmas_lane_f32: + case NEON::BI__builtin_neon_vfmah_laneq_f16: + case NEON::BI__builtin_neon_vfmas_laneq_f32: + case NEON::BI__builtin_neon_vfmad_lane_f64: + case NEON::BI__builtin_neon_vfmad_laneq_f64: { + mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]); + mlir::Type scalarTy = convertType(expr->getType()); + llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]}; + return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy, + fmaOps); + } + default: + break; + } + cir::VectorType ty = getNeonType(this, type, loc); if (!ty) return nullptr; @@ -2859,16 +2875,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, return std::nullopt; case NEON::BI__builtin_neon_vbsl_v: case NEON::BI__builtin_neon_vbslq_v: - case NEON::BI__builtin_neon_vfma_lane_v: - case NEON::BI__builtin_neon_vfmaq_lane_v: - case NEON::BI__builtin_neon_vfma_laneq_v: - case NEON::BI__builtin_neon_vfmaq_laneq_v: - case NEON::BI__builtin_neon_vfmah_lane_f16: - case NEON::BI__builtin_neon_vfmas_lane_f32: - case 
NEON::BI__builtin_neon_vfmah_laneq_f16: - case NEON::BI__builtin_neon_vfmas_laneq_f32: - case NEON::BI__builtin_neon_vfmad_lane_f64: - case NEON::BI__builtin_neon_vfmad_laneq_f64: case NEON::BI__builtin_neon_vmull_v: case NEON::BI__builtin_neon_vmax_v: case NEON::BI__builtin_neon_vmaxq_v: @@ -2886,6 +2892,39 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, if (cir::isFPOrVectorOfFPType(ty)) intrName = "aarch64.neon.fabd"; return emitNeonCall(cgm, builder, {ty, ty}, ops, intrName, ty, loc); + case NEON::BI__builtin_neon_vfma_lane_v: + case NEON::BI__builtin_neon_vfmaq_lane_v: + case NEON::BI__builtin_neon_vfma_laneq_v: + case NEON::BI__builtin_neon_vfmaq_laneq_v: { + mlir::Value addend = ops[0]; + mlir::Value multiplicand = ops[1]; + mlir::Value laneSource = ops[2]; + auto vecTy = mlir::cast<cir::VectorType>(ty); + auto elemTy = vecTy.getElementType(); + auto numElts = vecTy.getSize(); + + if (addend.getType() != ty) + addend = builder.createBitcast(loc, addend, ty); + if (multiplicand.getType() != ty) + multiplicand = builder.createBitcast(loc, multiplicand, ty); + + cir::VectorType sourceTy = ty; + if (builtinID == NEON::BI__builtin_neon_vfmaq_lane_v) + sourceTy = cir::VectorType::get(elemTy, numElts / 2); + else if (builtinID == NEON::BI__builtin_neon_vfma_laneq_v) + sourceTy = cir::VectorType::get(elemTy, numElts * 2); + + if (laneSource.getType() != sourceTy) + laneSource = builder.createBitcast(loc, laneSource, sourceTy); + + int64_t lane = + expr->getArg(3)->EvaluateKnownConstInt(getContext()).getSExtValue(); + llvm::SmallVector<int64_t> mask(numElts, lane); + mlir::Value splat = builder.createVecShuffle(loc, laneSource, mask); + + llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, splat, addend}; + return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps); + } case NEON::BI__builtin_neon_vpadal_v: case NEON::BI__builtin_neon_vpadalq_v: case NEON::BI__builtin_neon_vpmin_v: diff --git 
a/clang/lib/CIR/Lowering/CMakeLists.txt b/clang/lib/CIR/Lowering/CMakeLists.txt index 28ec3c551018c..77d28ef72d11d 100644 --- a/clang/lib/CIR/Lowering/CMakeLists.txt +++ b/clang/lib/CIR/Lowering/CMakeLists.txt @@ -9,6 +9,9 @@ add_clang_library(clangCIRLoweringCommon CIRPasses.cpp LoweringHelpers.cpp + DEPENDS + MLIRCIRTypeConstraintsIncGen + LINK_LIBS clangCIR ${dialect_libs} diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt index c7467fe40ba30..5b197ddca12c0 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt +++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt @@ -13,6 +13,7 @@ add_clang_library(clangCIRLoweringDirectToLLVM MLIRCIREnumsGen MLIRCIROpsIncGen MLIRCIROpInterfacesIncGen + MLIRCIRTypeConstraintsIncGen LINK_LIBS clangCIRLoweringCommon diff --git a/clang/test/CodeGen/AArch64/neon/vfma-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-lane.c new file mode 100644 index 0000000000000..955ab411793b9 --- /dev/null +++ b/clang/test/CodeGen/AArch64/neon/vfma-lane.c @@ -0,0 +1,136 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ +// RUN: -target-feature +fullfp16 -disable-O0-optnone \ +// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | \ +// RUN: opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM +// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \ +// RUN: -target-feature +neon -target-feature +fullfp16 \ +// RUN: -disable-O0-optnone -flax-vector-conversions=none \ +// RUN: -fclangir -emit-llvm -o - %s | \ +// RUN: opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %} +// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \ +// RUN: -target-feature +neon -target-feature +fullfp16 \ +// RUN: -disable-O0-optnone -flax-vector-conversions=none \ +// RUN: -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %} + +#include <arm_neon.h> + +// LLVM-LABEL: 
@test_vfma_lane_f16( +// LLVM: shufflevector <4 x half> +// LLVM: call <4 x half> @llvm.fma.v4f16( +// CIR-LABEL: @test_vfma_lane_f16( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { + return vfma_lane_f16(a, b, c, 3); +} + +// LLVM-LABEL: @test_vfmaq_lane_f16( +// LLVM: shufflevector <4 x half> +// LLVM: call <8 x half> @llvm.fma.v8f16( +// CIR-LABEL: @test_vfmaq_lane_f16( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { + return vfmaq_lane_f16(a, b, c, 3); +} + +// LLVM-LABEL: @test_vfma_laneq_f16( +// LLVM: shufflevector <8 x half> +// LLVM: call <4 x half> @llvm.fma.v4f16( +// CIR-LABEL: @test_vfma_laneq_f16( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { + return vfma_laneq_f16(a, b, c, 7); +} + +// LLVM-LABEL: @test_vfmaq_laneq_f16( +// LLVM: shufflevector <8 x half> +// LLVM: call <8 x half> @llvm.fma.v8f16( +// CIR-LABEL: @test_vfmaq_laneq_f16( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { + return vfmaq_laneq_f16(a, b, c, 7); +} + +// LLVM-LABEL: @test_vfma_lane_f32( +// LLVM: shufflevector <2 x float> +// LLVM: call <2 x float> @llvm.fma.v2f32( +// CIR-LABEL: @test_vfma_lane_f32( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { + return vfma_lane_f32(a, b, v, 1); +} + +// LLVM-LABEL: @test_vfmaq_lane_f32( +// LLVM: shufflevector <2 x float> +// LLVM: call <4 x float> @llvm.fma.v4f32( +// CIR-LABEL: @test_vfmaq_lane_f32( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { + return 
vfmaq_lane_f32(a, b, v, 1); +} + +// LLVM-LABEL: @test_vfma_laneq_f32( +// LLVM: shufflevector <4 x float> +// LLVM: call <2 x float> @llvm.fma.v2f32( +// CIR-LABEL: @test_vfma_laneq_f32( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { + return vfma_laneq_f32(a, b, v, 3); +} + +// LLVM-LABEL: @test_vfmaq_laneq_f32( +// LLVM: shufflevector <4 x float> +// LLVM: call <4 x float> @llvm.fma.v4f32( +// CIR-LABEL: @test_vfmaq_laneq_f32( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { + return vfmaq_laneq_f32(a, b, v, 3); +} + +// LLVM-LABEL: @test_vfma_lane_f64( +// LLVM: shufflevector <1 x double> +// LLVM: call <1 x double> @llvm.fma.v1f64( +// CIR-LABEL: @test_vfma_lane_f64( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { + return vfma_lane_f64(a, b, v, 0); +} + +// LLVM-LABEL: @test_vfmaq_lane_f64( +// LLVM: shufflevector <1 x double> +// LLVM: call <2 x double> @llvm.fma.v2f64( +// CIR-LABEL: @test_vfmaq_lane_f64( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { + return vfmaq_lane_f64(a, b, v, 0); +} + +// LLVM-LABEL: @test_vfma_laneq_f64( +// LLVM: @llvm.fma +// CIR-LABEL: @test_vfma_laneq_f64( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) { + return vfma_laneq_f64(a, b, v, 0); +} + +// LLVM-LABEL: @test_vfmaq_laneq_f64( +// LLVM: shufflevector <2 x double> +// LLVM: call <2 x double> @llvm.fma.v2f64( +// CIR-LABEL: @test_vfmaq_laneq_f64( +// CIR: cir.vec.shuffle +// CIR: cir.call_llvm_intrinsic "fma" +float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { + return 
vfmaq_laneq_f64(a, b, v, 1); +} diff --git a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c new file mode 100644 index 0000000000000..53fc9761e01a0 --- /dev/null +++ b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c @@ -0,0 +1,77 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ +// RUN: -target-feature +fullfp16 -disable-O0-optnone \ +// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | \ +// RUN: opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM +// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \ +// RUN: -target-feature +neon -target-feature +fullfp16 \ +// RUN: -disable-O0-optnone -flax-vector-conversions=none \ +// RUN: -fclangir -emit-llvm -o - %s | \ +// RUN: opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %} +// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \ +// RUN: -target-feature +neon -target-feature +fullfp16 \ +// RUN: -disable-O0-optnone -flax-vector-conversions=none \ +// RUN: -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %} + +#include <arm_neon.h> + +// LLVM-LABEL: @test_vfmah_lane_f16( +// LLVM: extractelement <4 x half> +// LLVM: call half @llvm.fma.f16( +// CIR-LABEL: @test_vfmah_lane_f16( +// CIR: cir.vec.extract +// CIR: cir.call_llvm_intrinsic "fma" +float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) { + return vfmah_lane_f16(a, b, c, 3); +} + +// LLVM-LABEL: @test_vfmah_laneq_f16( +// LLVM: extractelement <8 x half> +// LLVM: call half @llvm.fma.f16( +// CIR-LABEL: @test_vfmah_laneq_f16( +// CIR: cir.vec.extract +// CIR: cir.call_llvm_intrinsic "fma" +float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) { + return vfmah_laneq_f16(a, b, c, 7); +} + +// LLVM-LABEL: @test_vfmas_lane_f32( +// LLVM: extractelement <2 x float> +// LLVM: call float @llvm.fma.f32( +// CIR-LABEL: @test_vfmas_lane_f32( 
+// CIR: cir.vec.extract +// CIR: cir.call_llvm_intrinsic "fma" +float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) { + return vfmas_lane_f32(a, b, c, 1); +} + +// LLVM-LABEL: @test_vfmas_laneq_f32( +// LLVM: extractelement <4 x float> +// LLVM: call float @llvm.fma.f32( +// CIR-LABEL: @test_vfmas_laneq_f32( +// CIR: cir.vec.extract +// CIR: cir.call_llvm_intrinsic "fma" +float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) { + return vfmas_laneq_f32(a, b, c, 3); +} + +// LLVM-LABEL: @test_vfmad_lane_f64( +// LLVM: extractelement <1 x double> +// LLVM: call double @llvm.fma.f64( +// CIR-LABEL: @test_vfmad_lane_f64( +// CIR: cir.vec.extract +// CIR: cir.call_llvm_intrinsic "fma" +float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) { + return vfmad_lane_f64(a, b, c, 0); +} + +// LLVM-LABEL: @test_vfmad_laneq_f64( +// LLVM: extractelement <2 x double> +// LLVM: call double @llvm.fma.f64( +// CIR-LABEL: @test_vfmad_laneq_f64( +// CIR: cir.vec.extract +// CIR: cir.call_llvm_intrinsic "fma" +float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) { + return vfmad_laneq_f64(a, b, c, 1); +} `````````` </details> https://github.com/llvm/llvm-project/pull/188190 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
