Author: Amina Chabane
Date: 2025-12-05T16:08:25Z
New Revision: 7f2e6f128d2e8bc20bb95ddd9af9e23ea8fea5d2

URL: https://github.com/llvm/llvm-project/commit/7f2e6f128d2e8bc20bb95ddd9af9e23ea8fea5d2
DIFF: https://github.com/llvm/llvm-project/commit/7f2e6f128d2e8bc20bb95ddd9af9e23ea8fea5d2.diff

LOG: [Clang][AArch64] Implement widening FMMLA intrinsics (#165282)

Proposed in [this ACLE proposal](https://github.com/ARM-software/acle/pull/409),
this PR implements the widening FMMLA intrinsics:
- F16 to F32
- MF8 to F32
- MF8 to F16

Additional changes:
- The IsOverloadCvt flag is renamed to IsOverloadFirstandLast for clarity,
  since the old name implied a conversion. The implementation is unchanged.

Added:
    clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
    clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
    clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
    llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
    llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
    llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll

Modified:
    clang/include/clang/Basic/TargetBuiltins.h
    clang/include/clang/Basic/arm_sve.td
    clang/include/clang/Basic/arm_sve_sme_incl.td
    clang/lib/CodeGen/TargetBuiltins/ARM.cpp
    clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c
    clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td

Removed:



################################################################################
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 9bd514349d31d..d5e8299f8b080 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -397,7 +397,9 @@ namespace clang {
   }
   bool isOverloadDefault() const { return !(Flags & OverloadKindMask); }
   bool isOverloadWhileRW() const { return Flags & IsOverloadWhileRW; }
-  bool isOverloadCvt() const { return Flags & IsOverloadCvt; }
+  bool isOverloadFirstandLast() const {
+    return Flags & IsOverloadFirstandLast;
+  }
   bool isPrefetch() const { return Flags & IsPrefetch; }
   bool isReverseCompare() const { return Flags & ReverseCompare; }
   bool isAppendSVALL() const { return Flags & IsAppendSVALL; }
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 41179207dd060..86f2f21fabc48 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -883,56 +883,56 @@ multiclass SInstCvtMX<string name, string m_types, string xz_types,
 }
 
 // svcvt_s##_f16
-defm SVFCVTZS_S16_F16 : SInstCvtMXZ<"svcvt_s16[_f16]", "ddPO", "dPO", "s", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
+defm SVFCVTZS_S16_F16 : SInstCvtMXZ<"svcvt_s16[_f16]", "ddPO", "dPO", "s", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
 defm SVFCVTZS_S32_F16 : SInstCvtMXZ<"svcvt_s32[_f16]", "ddPO", "dPO", "i", "aarch64_sve_fcvtzs_i32f16">;
 defm SVFCVTZS_S64_F16 : SInstCvtMXZ<"svcvt_s64[_f16]", "ddPO", "dPO", "l", "aarch64_sve_fcvtzs_i64f16">;
 
 // svcvt_s##_f32
-defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
+defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
 defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l", "aarch64_sve_fcvtzs_i64f32">;
 
 // svcvt_s##_f64
 defm SVFCVTZS_S32_F64 : SInstCvtMXZ<"svcvt_s32[_f64]", "ttPd", "tPd", "d", "aarch64_sve_fcvtzs_i32f64">;
-defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
+defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
 
 // svcvt_u##_f16
-defm SVFCVTZU_U16_F16 : SInstCvtMXZ<"svcvt_u16[_f16]", "ddPO", "dPO", "Us", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
+defm SVFCVTZU_U16_F16 : SInstCvtMXZ<"svcvt_u16[_f16]", "ddPO", "dPO", "Us", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
 defm SVFCVTZU_U32_F16 : SInstCvtMXZ<"svcvt_u32[_f16]", "ddPO", "dPO", "Ui", "aarch64_sve_fcvtzu_i32f16">;
 defm SVFCVTZU_U64_F16 : SInstCvtMXZ<"svcvt_u64[_f16]", "ddPO", "dPO", "Ul", "aarch64_sve_fcvtzu_i64f16">;
 
 // svcvt_u##_f32
-defm SVFCVTZU_U32_F32 : SInstCvtMXZ<"svcvt_u32[_f32]", "ddPM", "dPM", "Ui", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
+defm SVFCVTZU_U32_F32 : SInstCvtMXZ<"svcvt_u32[_f32]", "ddPM", "dPM", "Ui", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
 defm SVFCVTZU_U64_F32 : SInstCvtMXZ<"svcvt_u64[_f32]", "ddPM", "dPM", "Ul", "aarch64_sve_fcvtzu_i64f32">;
 
 // svcvt_u##_f64
 defm SVFCVTZU_U32_F64 : SInstCvtMXZ<"svcvt_u32[_f64]", "zzPd", "zPd", "d", "aarch64_sve_fcvtzu_i32f64">;
-defm SVFCVTZU_U64_F64 : SInstCvtMXZ<"svcvt_u64[_f64]", "ddPN", "dPN", "Ul", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
+defm SVFCVTZU_U64_F64 : SInstCvtMXZ<"svcvt_u64[_f64]", "ddPN", "dPN", "Ul", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
 
 // svcvt_f16_s##
-defm SVFCVTZS_F16_S16 : SInstCvtMXZ<"svcvt_f16[_s16]", "OOPd", "OPd", "s", "aarch64_sve_scvtf", [IsOverloadCvt]>;
+defm SVFCVTZS_F16_S16 : SInstCvtMXZ<"svcvt_f16[_s16]", "OOPd", "OPd", "s", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
 defm SVFCVTZS_F16_S32 : SInstCvtMXZ<"svcvt_f16[_s32]", "OOPd", "OPd", "i", "aarch64_sve_scvtf_f16i32">;
 defm SVFCVTZS_F16_S64 : SInstCvtMXZ<"svcvt_f16[_s64]", "OOPd", "OPd", "l", "aarch64_sve_scvtf_f16i64">;
 
 // svcvt_f32_s##
-defm SVFCVTZS_F32_S32 : SInstCvtMXZ<"svcvt_f32[_s32]", "MMPd", "MPd", "i", "aarch64_sve_scvtf", [IsOverloadCvt]>;
+defm SVFCVTZS_F32_S32 : SInstCvtMXZ<"svcvt_f32[_s32]", "MMPd", "MPd", "i", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
 defm SVFCVTZS_F32_S64 : SInstCvtMXZ<"svcvt_f32[_s64]", "MMPd", "MPd", "l", "aarch64_sve_scvtf_f32i64">;
 
 // svcvt_f64_s##
 defm SVFCVTZS_F64_S32 : SInstCvtMXZ<"svcvt_f64[_s32]", "ddPt", "dPt", "d", "aarch64_sve_scvtf_f64i32">;
-defm SVFCVTZS_F64_S64 : SInstCvtMXZ<"svcvt_f64[_s64]", "NNPd", "NPd", "l", "aarch64_sve_scvtf", [IsOverloadCvt]>;
+defm SVFCVTZS_F64_S64 : SInstCvtMXZ<"svcvt_f64[_s64]", "NNPd", "NPd", "l", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
 
 // svcvt_f16_u##
-defm SVFCVTZU_F16_U16 : SInstCvtMXZ<"svcvt_f16[_u16]", "OOPd", "OPd", "Us", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
+defm SVFCVTZU_F16_U16 : SInstCvtMXZ<"svcvt_f16[_u16]", "OOPd", "OPd", "Us", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
 defm SVFCVTZU_F16_U32 : SInstCvtMXZ<"svcvt_f16[_u32]", "OOPd", "OPd", "Ui", "aarch64_sve_ucvtf_f16i32">;
 defm SVFCVTZU_F16_U64 : SInstCvtMXZ<"svcvt_f16[_u64]", "OOPd", "OPd", "Ul", "aarch64_sve_ucvtf_f16i64">;
 
 // svcvt_f32_u##
-defm SVFCVTZU_F32_U32 : SInstCvtMXZ<"svcvt_f32[_u32]", "MMPd", "MPd", "Ui", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
+defm SVFCVTZU_F32_U32 : SInstCvtMXZ<"svcvt_f32[_u32]", "MMPd", "MPd", "Ui", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
 defm SVFCVTZU_F32_U64 : SInstCvtMXZ<"svcvt_f32[_u64]", "MMPd", "MPd", "Ul", "aarch64_sve_ucvtf_f32i64">;
 
 // svcvt_f64_u##
 defm SVFCVTZU_F64_U32 : SInstCvtMXZ<"svcvt_f64[_u32]", "ddPz", "dPz", "d", "aarch64_sve_ucvtf_f64i32">;
-defm SVFCVTZU_F64_U64 : SInstCvtMXZ<"svcvt_f64[_u64]", "NNPd", "NPd", "Ul", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
+defm SVFCVTZU_F64_U64 : SInstCvtMXZ<"svcvt_f64[_u64]", "NNPd", "NPd", "Ul", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
 
 // svcvt_f16_f##
 defm SVFCVT_F16_F32 : SInstCvtMXZ<"svcvt_f16[_f32]", "OOPd", "OPd", "f", "aarch64_sve_fcvt_f16f32">;
@@ -1190,11 +1190,23 @@ def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarc
 }
 
 let SVETargetGuard = "f32mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla">;
+def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
 }
 
 let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
+def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
+
+let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "ddhh", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
+}
+
+let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmmla">;
+}
+
+let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
+  def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmmla">;
+}
 
 def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
 def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">;
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 13e7cf45471c2..7e60e87b12a4d 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -214,7 +214,7 @@ def IsZExtReturn : FlagType<0x00080000>; // Return value is s
 def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types.
 def IsOverloadWhileOrMultiVecCvt : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types.
 def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types.
-def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
+def IsOverloadFirstandLast : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
 def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type.
 def IsByteIndexed : FlagType<0x01000000>;
 def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand.
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index d4b0b81d3d87f..352dc53c01b2f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4473,7 +4473,7 @@ CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
   if (TypeFlags.isOverloadWhileRW())
     return {getSVEPredType(TypeFlags), Ops[0]->getType()};
 
-  if (TypeFlags.isOverloadCvt())
+  if (TypeFlags.isOverloadFirstandLast())
     return {Ops[0]->getType(), Ops.back()->getType()};
 
   if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
new file mode 100644
index 0000000000000..ef74024f7b091
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
@@ -0,0 +1,33 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
+  return SVE_ACLE_FUNC(svmmla, _f32_f16)(acc, a, b);
+}
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c
index 10442f4e31153..7d1efb7b6d954 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c
@@ -17,12 +17,12 @@
 
 // CHECK-LABEL: @test_svmmla_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
 // CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z15test_svmmla_f32u13__SVFloat32_tS_S_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
 // CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_svmmla_f32(svfloat32_t x, svfloat32_t y, svfloat32_t z) {
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c
index 8586a65fa240f..da211c4fba324 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c
@@ -17,12 +17,12 @@
 
 // CHECK-LABEL: @test_svmmla_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
 // CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z15test_svmmla_f64u13__SVFloat64_tS_S_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
 // CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
 //
 svfloat64_t test_svmmla_f64(svfloat64_t x, svfloat64_t y, svfloat64_t z) {
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
new file mode 100644
index 0000000000000..5054821d7168e
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+  return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm)(acc, a, b, fpmr);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
new file mode 100644
index 0000000000000..e1f52e1b423c5
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
@@ -0,0 +1,36 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+  return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm)(acc, a, b, fpmr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b8f87a0517efa..c905357d16fe4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2797,7 +2797,16 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculat
 //
 // SVE ACLE: 7.4/5. FP64/FP32 matrix multiply extensions
 //
-def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
+
+def int_aarch64_sve_fmmla
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+                            [IntrNoMem]>;
+
+def int_aarch64_sve_fp8_fmmla
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
 
 //
 // SVE ACLE: 7.2. BFloat16 extensions
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bbcffb0c44e85..c923b6e600008 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3707,7 +3707,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
 } // End HasSVE, HasMatMulFP32
 
 let Predicates = [HasSVE_F16F32MM] in {
-  def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>;
+  defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla, nxv4f32, nxv8f16>;
 } // End HasSVE_F16F32MM
 
 let Predicates = [HasSVE, HasMatMulFP64] in {
@@ -4770,11 +4770,11 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_
 } // End HasSSVE_FP8FMA
 
 let Predicates = [HasSVE2, HasF8F32MM] in {
-  def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">;
+  defm FMMLA_ZZZ_BtoS : sve2_fp8_fmmla<0b0, ZPR32, "fmmla", nxv4f32>;
 }
 
 let Predicates = [HasSVE2, HasF8F16MM] in {
-  def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">;
+  defm FMMLA_ZZZ_BtoH : sve2_fp8_fmmla<0b1, ZPR16, "fmmla", nxv8f16>;
 }
 
 let Predicates = [HasSSVE_FP8DOT2] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c63ae8660cad2..1f031f91f70e7 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11166,6 +11166,12 @@ class sve2_fp8_mmla<bit opc, ZPRRegOp dst_ty, string mnemonic>
   let Uses = [FPMR, FPCR];
 }
 
+multiclass sve2_fp8_fmmla<bits<1> opc, ZPRRegOp zprty, string mnemonic, ValueType ResVT> {
+  def NAME : sve2_fp8_mmla<opc, zprty, mnemonic>;
+  def : Pat<(ResVT (int_aarch64_sve_fp8_fmmla ResVT:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+            (!cast<Instruction>(NAME) $acc, $zn, $zm)>;
+}
+
 class sve_fp8_dot_indexed<bits<4> opc, ZPRRegOp dst_ty, Operand iop_ty, string mnemonic>
 : I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, iop_ty:$iop),
   mnemonic, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
new file mode 100644
index 0000000000000..aa856a557d1ec
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s
+
+define <vscale x 4 x float> @fmmla_f32f16(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fmmla_f32f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmmla z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+entry:
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 4 x float> %out
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
new file mode 100644
index 0000000000000..a7cfce3f2706f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s
+
+define <vscale x 8 x half> @fmmla_f16mf8(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %fpmr) {
+; CHECK-LABEL: fmmla_f16mf8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmmla z0.h, z1.b, z2.b
+; CHECK-NEXT: ret
+entry:
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 8 x half> %out
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
new file mode 100644
index 0000000000000..32b36f12d6268
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s
+
+define dso_local <vscale x 4 x float> @fmmla_f32mf8(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 noundef %fpmr) #0 {
+; CHECK-LABEL: fmmla_f32mf8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmmla z0.s, z1.b, z2.b
+; CHECK-NEXT: ret
+entry:
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 4 x float> %out
+}
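
For readers who want to try the new intrinsics, here is a minimal usage sketch
(not part of the patch). The intrinsic names follow the clang tests above; the
wrapper function names are illustrative only, and each call requires the
corresponding target guard from arm_sve.td ("sve-f16f32mm", "sve2,f8f32mm",
and "sve2,f8f16mm" respectively):

#include <arm_sve.h>

// F16 -> F32: widening matrix multiply-accumulate, svmmla[_f32_f16].
svfloat32_t mmla_f16_to_f32(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
  return svmmla_f32_f16(acc, a, b);
}

// MF8 -> F32: FP8 operands with an FP32 accumulator. The trailing fpm_t
// argument is written to FPMR before the operation (see the
// @llvm.aarch64.set.fpmr calls in the tests above), hence the _fpm suffix.
svfloat32_t mmla_mf8_to_f32(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b,
                            fpm_t fpm) {
  return svmmla_f32_mf8_fpm(acc, a, b, fpm);
}

// MF8 -> F16: FP8 operands with an FP16 accumulator.
svfloat16_t mmla_mf8_to_f16(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b,
                            fpm_t fpm) {
  return svmmla_f16_mf8_fpm(acc, a, b, fpm);
}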
