[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)

2025-01-27 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov edited 
https://github.com/llvm/llvm-project/pull/123615
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)

2025-01-27 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov updated 
https://github.com/llvm/llvm-project/pull/123615

>From ffb554ea1ff638237ffc9cb9a491e5a6ad66d8f6 Mon Sep 17 00:00:00 2001
From: Momchil Velikov 
Date: Tue, 17 Dec 2024 17:10:38 +0000
Subject: [PATCH 1/2] [AArch64] Implement NEON FP8 fused multiply-add
 intrinsics (non-indexed)

This patch adds the following intrinsics:

float16x8_t vmlalbq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, 
fpm_t)
float16x8_t vmlaltq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, 
fpm_t)

float32x4_t vmlallbbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, 
fpm_t)
float32x4_t vmlallbtq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, 
fpm_t)
float32x4_t vmlalltbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, 
fpm_t)
float32x4_t vmlallttq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, 
fpm_t)

[fixup] Update intrinsics definitions

[fixup] Remove some opt passes from RUN lines
---
 clang/include/clang/Basic/arm_neon.td |  10 ++
 clang/lib/CodeGen/CGBuiltin.cpp   |  43 +--
 clang/lib/CodeGen/CodeGenFunction.h   |   4 +-
 .../fp8-intrinsics/acle_neon_fp8_fmla.c   | 121 ++
 .../acle_neon_fp8_fmla.c  |  22 
 llvm/include/llvm/IR/IntrinsicsAArch64.td |  17 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td |   9 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +-
 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll|  56 
 9 files changed, 274 insertions(+), 22 deletions(-)
 create mode 100644 
clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll

diff --git a/clang/include/clang/Basic/arm_neon.td 
b/clang/include/clang/Basic/arm_neon.td
index c6609f312969ee..7e7faa68c55692 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2161,6 +2161,16 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = 
"fp8dot4,neon" in {
   def VDOTQ_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", 
"Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8fma,neon" in {
+  def VMLALB_F16_F8 : VInst<"vmlalb_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+  def VMLALT_F16_F8 : VInst<"vmlalt_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+
+  def VMLALLBB_F32_F8 : VInst<"vmlallbb_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLBT_F32_F8 : VInst<"vmlallbt_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTB_F32_F8 : VInst<"vmlalltb_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTT_F32_F8 : VInst<"vmlalltt_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b4b26eb84d5f92..8dbc8bfff95a4b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6759,11 +6759,14 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, 
SmallVectorImpl<Value*> &Ops,
 return Builder.CreateCall(F, Ops, name);
 }
 
-Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
+ArrayRef<llvm::Type *> Tys,
 SmallVectorImpl<Value *> &Ops,
-Value *FPM, const char *name) {
+const CallExpr *E, const char *name) {
+  llvm::Value *FPM =
+  EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, 
E);
   Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
-  return EmitNeonCall(F, Ops, name);
+  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
 }
 
 llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
@@ -6779,9 +6782,7 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
 Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
 Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-  EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, 
E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
 }
 
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
@@ -6802,9 +6803,7 @@ Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, 
llvm::Type *Ty0,
 Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
 Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-  EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, 
E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name)

[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)

2025-01-22 Thread Jonathan Thackray via cfe-commits

https://github.com/jthackray approved this pull request.

Phew, massive diff, LGTM.

https://github.com/llvm/llvm-project/pull/123615
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)

2025-01-20 Thread via cfe-commits

llvmbot wrote:



@llvm/pr-subscribers-clang

@llvm/pr-subscribers-llvm-ir

Author: Momchil Velikov (momchil-velikov)


Changes

This patch adds the following intrinsics:

* Floating-point multiply-add long to half-precision (vector, by element)

float16x8_t vmlalbq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlalbq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

* Floating-point multiply-add long-long to single-precision (vector, by element)

float32x4_t vmlallbbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

Supersedes https://github.com/llvm/llvm-project/pull/120403

---

Patch is 189.62 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/123615.diff


37 Files Affected:

- (modified) clang/include/clang/AST/Type.h (+5) 
- (modified) clang/include/clang/Basic/AArch64SVEACLETypes.def (+17-20) 
- (modified) clang/include/clang/Basic/TargetBuiltins.h (+3-1) 
- (modified) clang/include/clang/Basic/arm_neon.td (+66) 
- (modified) clang/include/clang/Basic/arm_neon_incl.td (+2) 
- (modified) clang/lib/AST/ASTContext.cpp (+18-12) 
- (modified) clang/lib/AST/ItaniumMangle.cpp (+6-1) 
- (modified) clang/lib/AST/Type.cpp (+1-3) 
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+198-1) 
- (modified) clang/lib/CodeGen/CGExpr.cpp (+9-2) 
- (modified) clang/lib/CodeGen/CodeGenFunction.h (+15) 
- (modified) clang/lib/CodeGen/CodeGenTypes.cpp (+13-5) 
- (modified) clang/lib/CodeGen/Targets/AArch64.cpp (+6-8) 
- (modified) clang/lib/Sema/SemaARM.cpp (+2) 
- (modified) clang/lib/Sema/SemaExpr.cpp (+6-1) 
- (modified) clang/lib/Sema/SemaType.cpp (+2-1) 
- (added) clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c (+123) 
- (added) clang/test/CodeGen/AArch64/fp8-cast.c (+193) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c (+316) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c (+254) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c (+365) 
- (modified) clang/test/CodeGen/arm-mfp8.c (+53-35) 
- (modified) clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp (+7) 
- (modified) clang/test/CodeGenCXX/mangle-neon-vectors.cpp (+11) 
- (added) clang/test/Sema/aarch64-fp8-cast.c (+104) 
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c (+43) 
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c (+54) 
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c (+49) 
- (modified) clang/test/Sema/arm-mfp8.cpp (+22-12) 
- (modified) clang/utils/TableGen/NeonEmitter.cpp (+23-9) 
- (modified) clang/utils/TableGen/SveEmitter.cpp (+2-2) 
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+76) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+105-56) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+30-28) 
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll (+74) 
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll (+110) 
- (added) llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll (+112) 


```diff
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3457d524c63aaa..1d9743520654eb 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public 
ExtQualsTypeCommonBase {
   bool isFloat32Type() const;
   bool isDoubleType() const;
   bool isBFloat16Type() const;
+  bool isMFloat8Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
   bool isRealType() const; // C99 6.2.5p17 (r

[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #123615)

2025-01-20 Thread via cfe-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-arm

Author: Momchil Velikov (momchil-velikov)


Changes

This patch adds the following intrinsics:

* Floating-point multiply-add long to half-precision (vector, by element)

float16x8_t vmlalbq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlalbq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vmlaltq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, 
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

* Floating-point multiply-add long-long to single-precision (vector, by element)

float32x4_t vmlallbbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallbtq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlalltbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, 
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t 
vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

Supersedes https://github.com/llvm/llvm-project/pull/120403

---

Patch is 189.62 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/123615.diff


37 Files Affected:

- (modified) clang/include/clang/AST/Type.h (+5) 
- (modified) clang/include/clang/Basic/AArch64SVEACLETypes.def (+17-20) 
- (modified) clang/include/clang/Basic/TargetBuiltins.h (+3-1) 
- (modified) clang/include/clang/Basic/arm_neon.td (+66) 
- (modified) clang/include/clang/Basic/arm_neon_incl.td (+2) 
- (modified) clang/lib/AST/ASTContext.cpp (+18-12) 
- (modified) clang/lib/AST/ItaniumMangle.cpp (+6-1) 
- (modified) clang/lib/AST/Type.cpp (+1-3) 
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+198-1) 
- (modified) clang/lib/CodeGen/CGExpr.cpp (+9-2) 
- (modified) clang/lib/CodeGen/CodeGenFunction.h (+15) 
- (modified) clang/lib/CodeGen/CodeGenTypes.cpp (+13-5) 
- (modified) clang/lib/CodeGen/Targets/AArch64.cpp (+6-8) 
- (modified) clang/lib/Sema/SemaARM.cpp (+2) 
- (modified) clang/lib/Sema/SemaExpr.cpp (+6-1) 
- (modified) clang/lib/Sema/SemaType.cpp (+2-1) 
- (added) clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c (+123) 
- (added) clang/test/CodeGen/AArch64/fp8-cast.c (+193) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c (+316) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c (+254) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c (+365) 
- (modified) clang/test/CodeGen/arm-mfp8.c (+53-35) 
- (modified) clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp (+7) 
- (modified) clang/test/CodeGenCXX/mangle-neon-vectors.cpp (+11) 
- (added) clang/test/Sema/aarch64-fp8-cast.c (+104) 
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c (+43) 
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c (+54) 
- (added) clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c (+49) 
- (modified) clang/test/Sema/arm-mfp8.cpp (+22-12) 
- (modified) clang/utils/TableGen/NeonEmitter.cpp (+23-9) 
- (modified) clang/utils/TableGen/SveEmitter.cpp (+2-2) 
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+76) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrFormats.td (+105-56) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+30-28) 
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll (+74) 
- (added) llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll (+110) 
- (added) llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll (+112) 


```diff
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3457d524c63aaa..1d9743520654eb 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public 
ExtQualsTypeCommonBase {
   bool isFloat32Type() const;
   bool isDoubleType() const;
   bool isBFloat16Type() const;
+  bool isMFloat8Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
   bool isRealType() const; // C99 6.2.5p17 (real floating + integer)