[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov closed https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov edited https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -10719,7 +10719,16 @@ Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) { cast(Ty)->getElementCount(), Scalar); } -Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) { +Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) { + if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) { momchil-velikov wrote: We can force termination in release mode too, but I don't think that's the practice in LLVM. https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -10719,7 +10719,16 @@ Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) { cast(Ty)->getElementCount(), Scalar); } -Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) { +Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) { + if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) { momchil-velikov wrote: No, that would be inappropriate. It's an invariant, not an option. https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -2468,4 +2468,30 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { // 8-bit floating-point convert to BFloat16/Float16 (top) def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) momchil-velikov wrote: It comes from a stacked PR, that was committed. I've now rebased this one on top of `main` and conversions https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118125 >From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 25 Nov 2024 17:21:55 + Subject: [PATCH 1/2] [AArch64] Implements FP8 SVE intrinsics for dot-product This patch adds the following intrinsics: * 8-bit floating-point dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_3, fpm_t fpm); * 8-bit floating-point dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_7, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 19 +++ clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 11 +- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 149 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 23 ++- clang/utils/TableGen/SveEmitter.cpp | 9 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 16 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 29 +++- llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 41 + 10 files changed, 293 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b9f40faf0b18e6..2c8ca8014387d3 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; } + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index de10be7bdce0db..44201b15505599 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 49a4c1ecc825e7..84048a4beac2c5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBui
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/CarolineConcatto commented: Momchil, I believe the patch is fine, but I think you have two patches into one. FCVT nad FDOTs. Maybe we should split. https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -10719,7 +10719,16 @@ Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) { cast(Ty)->getElementCount(), Scalar); } -Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) { +Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) { + if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) { CarolineConcatto wrote: I imagine this will only be true for <1xi8>,but should we also check this here instead of only asserting if in debug mode? https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -2468,4 +2468,30 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { // 8-bit floating-point convert to BFloat16/Float16 (top) def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) CarolineConcatto wrote: Where are this CVT coming from? This is dot and cvt patch? https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/CarolineConcatto edited https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118125 >From 359e611dc5a74d4e2dfdb19119eb8f83badb1f0b Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 21 Nov 2024 11:21:29 + Subject: [PATCH 1/4] [AArch64] Implement FP8 SVE intrinsics for widening conversions This patch adds the following intrinsics: * 8-bit floating-point convert to half-precision and BFloat16. // Variants are also available for: _bf16 svfloat16_t svcvt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); * 8-bit floating-point convert to half-precision and BFloat16 (top). // Variants are also available for: _bf16 svfloat16_t svcvtlt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvtlt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 20 +- .../fp8-intrinsics/acle_sve2_fp8_cvt.c| 173 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 24 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 13 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 16 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 7 +- .../test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll | 78 7 files changed, 317 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c create mode 100644 clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 9b8a8546b072c0..7b8ecf29a9de6e 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2430,12 +2430,12 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; // Convert from FP8 to half-precision/BFloat16 multi-vector - def SVF1CVT : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVT : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVT_X2 : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVT_X2 : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>; // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector - def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVTL_X2 : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVTL_X2 : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; } let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { @@ -2451,3 +2451,15 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // SVE FP8 widening conversions + + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; +} diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c new file mode 100644 index 00..c026b8aa216f32 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c @@ -0,0 +1,173 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1-triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-featu
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -10877,24 +10923,37 @@ class sve_fp8_dot_indexed opc, ZPRRegOp dst_ty, Operand iop_ty, string m let DestructiveInstType = DestructiveOther; let hasSideEffects = 0; let mayRaiseFPException = 1; + + let mayLoad = 1; + let mayStore = 0; } // FP8 Widening Dot-Product - Indexed Group -multiclass sve2_fp8_dot_indexed_h{ - def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH, asm> { +multiclass sve2_fp8_dot_indexed_h { + def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH32b, asm> { bits<3> iop; let Inst{20-19} = iop{2-1}; let Inst{11}= iop{0}; + +let mayLoad = 1; momchil-velikov wrote: Done https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -2447,3 +2447,39 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) + def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; +} + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; momchil-velikov wrote: They were accidentally omitted, I'm adding them via https://github.com/ARM-software/acle/pull/369 https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) momchil-velikov wrote: Done https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118125 >From cdd86588c639f818909964ab49b9972da6869cb3 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 21 Nov 2024 11:21:29 + Subject: [PATCH 1/4] [AArch64] Implement FP8 SVE intrinsics for widening conversions This patch adds the following intrinsics: * 8-bit floating-point convert to half-precision and BFloat16. // Variants are also available for: _bf16 svfloat16_t svcvt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); * 8-bit floating-point convert to half-precision and BFloat16 (top). // Variants are also available for: _bf16 svfloat16_t svcvtlt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvtlt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 20 +- .../fp8-intrinsics/acle_sve2_fp8_cvt.c| 173 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 24 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 13 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 16 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 7 +- .../test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll | 78 7 files changed, 317 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c create mode 100644 clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 9b8a8546b072c0..7b8ecf29a9de6e 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2430,12 +2430,12 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; // Convert from FP8 to half-precision/BFloat16 multi-vector - def SVF1CVT : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVT : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVT_X2 : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVT_X2 : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>; // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector - def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVTL_X2 : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVTL_X2 : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; } let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { @@ -2451,3 +2451,15 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // SVE FP8 widening conversions + + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; +} diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c new file mode 100644 index 00..c026b8aa216f32 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c @@ -0,0 +1,173 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1-triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-featu
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov edited https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118125 >From cdd86588c639f818909964ab49b9972da6869cb3 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 21 Nov 2024 11:21:29 + Subject: [PATCH 1/3] [AArch64] Implement FP8 SVE intrinsics for widening conversions This patch adds the following intrinsics: * 8-bit floating-point convert to half-precision and BFloat16. // Variants are also available for: _bf16 svfloat16_t svcvt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); * 8-bit floating-point convert to half-precision and BFloat16 (top). // Variants are also available for: _bf16 svfloat16_t svcvtlt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvtlt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 20 +- .../fp8-intrinsics/acle_sve2_fp8_cvt.c| 173 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 24 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 13 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 16 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 7 +- .../test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll | 78 7 files changed, 317 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c create mode 100644 clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 9b8a8546b072c0..7b8ecf29a9de6e 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2430,12 +2430,12 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; // Convert from FP8 to half-precision/BFloat16 multi-vector - def SVF1CVT : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVT : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVT_X2 : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVT_X2 : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>; // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector - def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVTL_X2 : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVTL_X2 : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; } let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { @@ -2451,3 +2451,15 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // SVE FP8 widening conversions + + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; +} diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c new file mode 100644 index 00..c026b8aa216f32 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c @@ -0,0 +1,173 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1-triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-featu
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -10877,24 +10923,37 @@ class sve_fp8_dot_indexed opc, ZPRRegOp dst_ty, Operand iop_ty, string m let DestructiveInstType = DestructiveOther; let hasSideEffects = 0; let mayRaiseFPException = 1; + + let mayLoad = 1; + let mayStore = 0; } // FP8 Widening Dot-Product - Indexed Group -multiclass sve2_fp8_dot_indexed_h{ - def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH, asm> { +multiclass sve2_fp8_dot_indexed_h { + def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH32b, asm> { bits<3> iop; let Inst{20-19} = iop{2-1}; let Inst{11}= iop{0}; + +let mayLoad = 1; CarolineConcatto wrote: Do we need to repeat this here if the class sve_fp8_dot_indexed already have: let mayLoad = 1; let mayStore = 0; https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -2447,3 +2447,39 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) + def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; +} + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; CarolineConcatto wrote: This is probably correct, but I dont see it in the ACLE, same for the SVFDOT_N_2WAY. https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
@@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) CarolineConcatto wrote: Nit: Can you move this to line 78 to be together with the other splats. https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/jthackray approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/118125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
llvmbot wrote: @llvm/pr-subscribers-clang-codegen Author: Momchil Velikov (momchil-velikov) Changes --- Patch is 56.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118125.diff 14 Files Affected: - (modified) clang/include/clang/Basic/arm_sve.td (+36) - (modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1) - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+10-1) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c (+173) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c (+101) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c (+149) - (added) clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c (+54) - (modified) clang/utils/TableGen/SveEmitter.cpp (+9-2) - (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+47) - (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+17-17) - (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+66-7) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll (+78) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+49) - (added) llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll (+41) ``diff diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b36e592042da0b..65f98b614ecf0a 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2447,3 +2447,39 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) + def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; +} + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index de10be7bdce0db..44201b15505599 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index cb9c23b8e0a0d0..9f9beae3059cc9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10688,7 +10688
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
llvmbot wrote: @llvm/pr-subscribers-llvm-ir Author: Momchil Velikov (momchil-velikov) Changes --- Patch is 56.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118125.diff 14 Files Affected: - (modified) clang/include/clang/Basic/arm_sve.td (+36) - (modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1) - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+10-1) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c (+173) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c (+101) - (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c (+149) - (added) clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c (+54) - (modified) clang/utils/TableGen/SveEmitter.cpp (+9-2) - (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+47) - (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+17-17) - (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+66-7) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll (+78) - (added) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+49) - (added) llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll (+41) ``diff diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b36e592042da0b..65f98b614ecf0a 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2447,3 +2447,39 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; + + // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point + def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>; + + // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom) + def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; +} + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index de10be7bdce0db..44201b15505599 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index cb9c23b8e0a0d0..9f9beae3059cc9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10688,7 +10688,16 @@
[clang] [llvm] [AArch64] Implements FP8 SVE intrinsics for dot-product (PR #118125)
https://github.com/momchil-velikov created https://github.com/llvm/llvm-project/pull/118125 None >From 3a9643e6c2d61eae2e23df42c19b1410d4a5fcc5 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 21 Nov 2024 11:21:29 + Subject: [PATCH 1/3] FP8 CVT/CVTL --- clang/include/clang/Basic/arm_sve.td | 10 + .../fp8-intrinsics/acle_sve2_fp8_cvt.c| 173 ++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 24 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 17 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 16 +- llvm/lib/Target/AArch64/SVEInstrFormats.td| 7 +- .../test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll | 78 7 files changed, 316 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c create mode 100644 clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b36e592042da0b..b9d8360843aa8e 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2447,3 +2447,13 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in { defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">; defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">; } + +let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { + // 8-bit floating-point convert to BFloat16/Float16 + def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point convert to BFloat16/Float16 (top) + def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>; + def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>; +} diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c new file mode 100644 index 00..c026b8aa216f32 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c @@ -0,0 +1,173 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1-triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS-triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#ifdef __ARM_FEATURE_SME +#include +#else +#include +#endif + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +#ifdef __ARM_FEATURE_SME +#define STREAMING __arm_streaming +#else +#define STREAMING +#endif + +// CHECK-LABEL: define dso_local @test_svcvt1_bf16_mf8( +// CHECK-SAME: [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT:tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT:[[TMP0:%.*]] = tail call @llvm.aarch64.sve.fp8.cvt1.nxv8bf16( [[ZN]]) +// CHECK-NEXT:ret [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local @_Z20test_svcvt1_bf16_mf8u13__SVMfloat8_tm( +// CHECK-CXX-SAME: [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT:tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT:[[TMP0:%.*]] = tail call