[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)
https://github.com/momchil-velikov edited https://github.com/llvm/llvm-project/pull/123615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)
https://github.com/momchil-velikov updated
https://github.com/llvm/llvm-project/pull/123615
>From ffb554ea1ff638237ffc9cb9a491e5a6ad66d8f6 Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Tue, 17 Dec 2024 17:10:38 +0000
Subject: [PATCH 1/2] [AArch64] Implement NEON FP8 fused multiply-add
intrinsics (non-indexed)
This patch adds the following intrinsics:
float16x8_t vmlalbq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t,
fpm_t)
float16x8_t vmlaltq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t,
fpm_t)
float32x4_t vmlallbbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t,
fpm_t)
float32x4_t vmlallbtq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t,
fpm_t)
float32x4_t vmlalltbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t,
fpm_t)
float32x4_t vmlallttq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t,
fpm_t)
[fixup] Update intrinsics definitions
[fixup] Remove some opt passes from RUN lines
---
clang/include/clang/Basic/arm_neon.td | 10 ++
clang/lib/CodeGen/CGBuiltin.cpp | 43 +--
clang/lib/CodeGen/CodeGenFunction.h | 4 +-
.../fp8-intrinsics/acle_neon_fp8_fmla.c | 121 ++
.../acle_neon_fp8_fmla.c | 22
llvm/include/llvm/IR/IntrinsicsAArch64.td | 17 +++
.../lib/Target/AArch64/AArch64InstrFormats.td | 9 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 +-
llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll| 56
9 files changed, 274 insertions(+), 22 deletions(-)
create mode 100644
clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
create mode 100644 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
diff --git a/clang/include/clang/Basic/arm_neon.td
b/clang/include/clang/Basic/arm_neon.td
index c6609f312969ee..7e7faa68c55692 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2161,6 +2161,16 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard =
"fp8dot4,neon" in {
def VDOTQ_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV",
"Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
}
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8fma,neon" in {
+ def VMLALB_F16_F8 : VInst<"vmlalb_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+ def VMLALT_F16_F8 : VInst<"vmlalt_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+
+ def VMLALLBB_F32_F8 : VInst<"vmlallbb_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+ def VMLALLBT_F32_F8 : VInst<"vmlallbt_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+ def VMLALLTB_F32_F8 : VInst<"vmlalltb_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+ def VMLALLTT_F32_F8 : VInst<"vmlalltt_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b4b26eb84d5f92..8dbc8bfff95a4b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6759,11 +6759,14 @@ Value *CodeGenFunction::EmitNeonCall(Function *F,
SmallVectorImpl &Ops,
return Builder.CreateCall(F, Ops, name);
}
-Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
+ArrayRef Tys,
SmallVectorImpl &Ops,
-Value *FPM, const char *name) {
+const CallExpr *E, const char *name) {
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1,
E);
Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
- return EmitNeonCall(F, Ops, name);
+ return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
}
llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
@@ -6779,9 +6782,7 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
Builder.getInt64(0));
}
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1,
E);
- return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+ return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
@@ -6802,9 +6803,7 @@ Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID,
llvm::Type *Ty0,
Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
}
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1,
E);
- return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+ return EmitFP8NeonCall(IID, Tys, Ops, E, name)
[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)
https://github.com/jthackray approved this pull request. Phew, massive diff, LGTM. https://github.com/llvm/llvm-project/pull/123615 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)
llvmbot wrote:
@llvm/pr-subscribers-clang
@llvm/pr-subscribers-llvm-ir
Author: Momchil Velikov (momchil-velikov)
Changes
This patch adds the following intrinsics:
* Floating-point multiply-add long to half-precision (vector, by element)
float16x8_t vmlalbq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlalbq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
* Floating-point multiply-add long-long to single-precision (vector, by element)
float32x4_t vmlallbbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
Supersedes https://github.com/llvm/llvm-project/pull/120403
---
Patch is 189.62 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/123615.diff
37 Files Affected:
- (modified) clang/include/clang/AST/Type.h (+5)
- (modified) clang/include/clang/Basic/AArch64SVEACLETypes.def (+17-20)
- (modified) clang/include/clang/Basic/TargetBuiltins.h (+3-1)
- (modified) clang/include/clang/Basic/arm_neon.td (+66)
- (modified) clang/include/clang/Basic/arm_neon_incl.td (+2)
- (modified) clang/lib/AST/ASTContext.cpp (+18-12)
- (modified) clang/lib/AST/ItaniumMangle.cpp (+6-1)
- (modified) clang/lib/AST/Type.cpp (+1-3)
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+198-1)
- (modified) clang/lib/CodeGen/CGExpr.cpp (+9-2)
- (modified) clang/lib/CodeGen/CodeGenFunction.h (+15)
- (modified) clang/lib/CodeGen/CodeGenTypes.cpp (+13-5)
- (modified) clang/lib/CodeGen/Targets/AArch64.cpp (+6-8)
- (modified) clang/lib/Sema/SemaARM.cpp (+2)
- (modified) clang/lib/Sema/SemaExpr.cpp (+6-1)
- (modified) clang/lib/Sema/SemaType.cpp (+2-1)
- (added) clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c (+123)
- (added) clang/test/CodeGen/AArch64/fp8-cast.c (+193)
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c (+316)
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c (+254)
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c (+365)
- (modified) clang/test/CodeGen/arm-mfp8.c (+53-35)
- (modified) clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp (+7)
- (modified) clang/test/CodeGenCXX/mangle-neon-vectors.cpp (+11)
- (added) clang/test/Sema/aarch64-fp8-cast.c (+104)
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c (+43)
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c (+54)
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c (+49)
- (modified) clang/test/Sema/arm-mfp8.cpp (+22-12)
- (modified) clang/utils/TableGen/NeonEmitter.cpp (+23-9)
- (modified) clang/utils/TableGen/SveEmitter.cpp (+2-2)
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+76)
- (modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+105-56)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+30-28)
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll (+74)
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll (+110)
- (added) llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll (+112)
```diff
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3457d524c63aaa..1d9743520654eb 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public
ExtQualsTypeCommonBase {
bool isFloat32Type() const;
bool isDoubleType() const;
bool isBFloat16Type() const;
+ bool isMFloat8Type() const;
bool isFloat128Type() const;
bool isIbm128Type() const;
bool isRealType() const; // C99 6.2.5p17 (r
[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)
llvmbot wrote:
@llvm/pr-subscribers-backend-arm
Author: Momchil Velikov (momchil-velikov)
Changes
This patch adds the following intrinsics:
* Floating-point multiply-add long to half-precision (vector, by element)
float16x8_t vmlalbq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlalbq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
* Floating-point multiply-add long-long to single-precision (vector, by element)
float32x4_t vmlallbbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
Supersedes https://github.com/llvm/llvm-project/pull/120403
---
Patch is 189.62 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/123615.diff
37 Files Affected:
- (modified) clang/include/clang/AST/Type.h (+5)
- (modified) clang/include/clang/Basic/AArch64SVEACLETypes.def (+17-20)
- (modified) clang/include/clang/Basic/TargetBuiltins.h (+3-1)
- (modified) clang/include/clang/Basic/arm_neon.td (+66)
- (modified) clang/include/clang/Basic/arm_neon_incl.td (+2)
- (modified) clang/lib/AST/ASTContext.cpp (+18-12)
- (modified) clang/lib/AST/ItaniumMangle.cpp (+6-1)
- (modified) clang/lib/AST/Type.cpp (+1-3)
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+198-1)
- (modified) clang/lib/CodeGen/CGExpr.cpp (+9-2)
- (modified) clang/lib/CodeGen/CodeGenFunction.h (+15)
- (modified) clang/lib/CodeGen/CodeGenTypes.cpp (+13-5)
- (modified) clang/lib/CodeGen/Targets/AArch64.cpp (+6-8)
- (modified) clang/lib/Sema/SemaARM.cpp (+2)
- (modified) clang/lib/Sema/SemaExpr.cpp (+6-1)
- (modified) clang/lib/Sema/SemaType.cpp (+2-1)
- (added) clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c (+123)
- (added) clang/test/CodeGen/AArch64/fp8-cast.c (+193)
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c (+316)
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c (+254)
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c (+365)
- (modified) clang/test/CodeGen/arm-mfp8.c (+53-35)
- (modified) clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp (+7)
- (modified) clang/test/CodeGenCXX/mangle-neon-vectors.cpp (+11)
- (added) clang/test/Sema/aarch64-fp8-cast.c (+104)
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c (+43)
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c (+54)
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c (+49)
- (modified) clang/test/Sema/arm-mfp8.cpp (+22-12)
- (modified) clang/utils/TableGen/NeonEmitter.cpp (+23-9)
- (modified) clang/utils/TableGen/SveEmitter.cpp (+2-2)
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+76)
- (modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+105-56)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+30-28)
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll (+74)
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll (+110)
- (added) llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll (+112)
```diff
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3457d524c63aaa..1d9743520654eb 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public
ExtQualsTypeCommonBase {
bool isFloat32Type() const;
bool isDoubleType() const;
bool isBFloat16Type() const;
+ bool isMFloat8Type() const;
bool isFloat128Type() const;
bool isIbm128Type() const;
bool isRealType() const; // C99 6.2.5p17 (real floating + integer)
