[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/momchil-velikov closed https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118126 >From 15478424a10e531d42f197dae5e2083e8d183c23 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 26 Nov 2024 18:01:03 + Subject: [PATCH 1/3] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add This patch adds the following intrinsics: * 8-bit floating-point multiply-add long to half-precision (bottom). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalb[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svmlalb[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long to half-precision (bottom, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalb_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long to half-precision (top). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalt[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svmlalt[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long to half-precision (top, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalt_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom bottom). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbb[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlallbb[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom bottom, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbb_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom top). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbt[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlallbt[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom top, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbt_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top bottom). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltb[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlalltb[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top bottom, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltb_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top top). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltt[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlalltt[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top top, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltt_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 31 ++ .../fp8-intrinsics/acle_sve2_fp8_fmla.c | 389 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 53 ++- llvm/include/llvm/IR/IntrinsicsAArch64.td | 19 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 24 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 22 +- llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll |
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118126 >From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 25 Nov 2024 17:21:55 + Subject: [PATCH 1/5] [AArch64] Implements FP8 SVE intrinsics for dot-product This patch adds the following intrinsics: * 8-bit floating-point dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_3, fpm_t fpm); * 8-bit floating-point dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_7, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 19 +++ clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 11 +- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 149 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 23 ++- clang/utils/TableGen/SveEmitter.cpp | 9 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 16 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 29 +++- llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 41 + 10 files changed, 293 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b9f40faf0b18e6..2c8ca8014387d3 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; } + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index de10be7bdce0db..44201b15505599 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 49a4c1ecc825e7..84048a4beac2c5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBui
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
@@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2,+fp8,+fp8fma < %s | FileCheck %s +; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define @fmla_2way_bot( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_bot: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fmla_2way_top( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_top: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fdot_2way_bot_lane( %a, %s1, %s2) { +; CHECK-LABEL: fdot_2way_bot_lane: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b[3] +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16( %a, %s1, %s2, i32 3) +ret %r +} + +define @fdot_2way_top_lane( %a, %s1, %s2) { momchil-velikov wrote: Done https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
@@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2,+fp8,+fp8fma < %s | FileCheck %s +; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define @fmla_2way_bot( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_bot: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fmla_2way_top( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_top: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fdot_2way_bot_lane( %a, %s1, %s2) { momchil-velikov wrote: Done https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118126 >From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 25 Nov 2024 17:21:55 + Subject: [PATCH 1/4] [AArch64] Implements FP8 SVE intrinsics for dot-product This patch adds the following intrinsics: * 8-bit floating-point dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_3, fpm_t fpm); * 8-bit floating-point dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_7, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 19 +++ clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 11 +- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 149 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 23 ++- clang/utils/TableGen/SveEmitter.cpp | 9 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 16 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 29 +++- llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 41 + 10 files changed, 293 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b9f40faf0b18e6..2c8ca8014387d3 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; } + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index de10be7bdce0db..44201b15505599 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 49a4c1ecc825e7..84048a4beac2c5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBui
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
@@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2,+fp8,+fp8fma < %s | FileCheck %s +; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define @fmla_2way_bot( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_bot: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fmla_2way_top( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_top: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fdot_2way_bot_lane( %a, %s1, %s2) { +; CHECK-LABEL: fdot_2way_bot_lane: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b[3] +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16( %a, %s1, %s2, i32 3) +ret %r +} + +define @fdot_2way_top_lane( %a, %s1, %s2) { CarolineConcatto wrote: s/dot/fmla https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
@@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2,+fp8,+fp8fma < %s | FileCheck %s +; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define @fmla_2way_bot( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_bot: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fmla_2way_top( %a, %s1, %s2) { +; CHECK-LABEL: fmla_2way_top: +; CHECK: // %bb.0: +; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b +; CHECK-NEXT:ret +%r = call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( %a, %s1, %s2) +ret %r +} + +define @fdot_2way_bot_lane( %a, %s1, %s2) { CarolineConcatto wrote: s/dot/fmla https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/CarolineConcatto approved this pull request. LGTM, just fix the test name in the fmla https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/CarolineConcatto edited https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118126 >From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 25 Nov 2024 17:21:55 + Subject: [PATCH 1/3] [AArch64] Implements FP8 SVE intrinsics for dot-product This patch adds the following intrinsics: * 8-bit floating-point dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_3, fpm_t fpm); * 8-bit floating-point dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_7, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 19 +++ clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 11 +- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 149 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 23 ++- clang/utils/TableGen/SveEmitter.cpp | 9 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 16 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 29 +++- llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 41 + 10 files changed, 293 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b9f40faf0b18e6..2c8ca8014387d3 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; } + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index de10be7bdce0db..44201b15505599 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 49a4c1ecc825e7..84048a4beac2c5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBui
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/SpencerAbson unassigned https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/momchil-velikov edited https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
https://github.com/jthackray approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/118126 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
llvmbot wrote: @llvm/pr-subscribers-clang Author: Momchil Velikov (momchil-velikov) Changes --- Patch is 105.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118126.diff 16 Files Affected: - (modified) clang/include/clang/Basic/arm_sve.td (+67) - (modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1) - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+10-1) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c (+173) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c (+101) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c (+149) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c (+389) - (added) clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c (+103) - (modified) clang/utils/TableGen/SveEmitter.cpp (+9-2) - (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+65) - (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+29-29) - (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+86-9) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll (+78) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+49) - (added) llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll (+41) - (added) llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll (+114) ``diff diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b36e592042da0b..1a9089c5466747 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2447,3 +2447,70 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) + def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; +} + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + +let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in { + // 8-bit floating-point multiply-add long to half-precision (bottom) + def SVFMLALB : SInst<"svmlalb[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point multiply-add long to ha_fpmlf-precision (bottom, indexed) + def SVFML
[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
llvmbot wrote: @llvm/pr-subscribers-llvm-ir Author: Momchil Velikov (momchil-velikov) Changes --- Patch is 105.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118126.diff 16 Files Affected: - (modified) clang/include/clang/Basic/arm_sve.td (+67) - (modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1) - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+10-1) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c (+173) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c (+101) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c (+149) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c (+389) - (added) clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c (+103) - (modified) clang/utils/TableGen/SveEmitter.cpp (+9-2) - (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+65) - (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+29-29) - (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+86-9) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll (+78) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+49) - (added) llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll (+41) - (added) llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll (+114) ``diff diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b36e592042da0b..1a9089c5466747 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2447,3 +2447,70 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) + def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; +} + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + +let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in { + // 8-bit floating-point multiply-add long to half-precision (bottom) + def SVFMLALB : SInst<"svmlalb[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point multiply-add long to ha_fpmlf-precision (bottom, indexed) + def SVF