[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-13 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov closed 
https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-13 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov updated 
https://github.com/llvm/llvm-project/pull/118126

>From 15478424a10e531d42f197dae5e2083e8d183c23 Mon Sep 17 00:00:00 2001
From: Momchil Velikov 
Date: Tue, 26 Nov 2024 18:01:03 +0000
Subject: [PATCH 1/3] [AArch64] Implement FP8 SVE intrinsics for fused
 multiply-add

This patch adds the following intrinsics:

* 8-bit floating-point multiply-add long to half-precision (bottom).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat16_t svmlalb[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
svmfloat8_t zm, fpm_t fpm);
  svfloat16_t svmlalb[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
mfloat8_t zm, fpm_t fpm);

* 8-bit floating-point multiply-add long to half-precision (bottom, indexed).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat16_t svmlalb_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
 uint64_t imm0_15, fpm_t fpm);

* 8-bit floating-point multiply-add long to half-precision (top).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat16_t svmlalt[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
svmfloat8_t zm, fpm_t fpm);
  svfloat16_t svmlalt[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
mfloat8_t zm, fpm_t fpm);

* 8-bit floating-point multiply-add long to half-precision (top, indexed).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat16_t svmlalt_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
 uint64_t imm0_15, fpm_t fpm);

* 8-bit floating-point multiply-add long long to single-precision (bottom 
bottom).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlallbb[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm, fpm_t fpm);
  svfloat32_t svmlallbb[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
mfloat8_t zm, fpm_t fpm);

* 8-bit floating-point multiply-add long long to single-precision (bottom 
bottom, indexed).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlallbb_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_15, fpm_t fpm);

* 8-bit floating-point multiply-add long long to single-precision (bottom top).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlallbt[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm, fpm_t fpm);
  svfloat32_t svmlallbt[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
mfloat8_t zm, fpm_t fpm);

* 8-bit floating-point multiply-add long long to single-precision (bottom top, 
indexed).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlallbt_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_15, fpm_t fpm);
* 8-bit floating-point multiply-add long long to single-precision (top bottom).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlalltb[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm, fpm_t fpm);
  svfloat32_t svmlalltb[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
mfloat8_t zm, fpm_t fpm);

* 8-bit floating-point multiply-add long long to single-precision (top bottom, 
indexed).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlalltb_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_15, fpm_t fpm);
* 8-bit floating-point multiply-add long long to single-precision (top top).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlalltt[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm, fpm_t fpm);
  svfloat32_t svmlalltt[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
mfloat8_t zm, fpm_t fpm);

* 8-bit floating-point multiply-add long long to single-precision (top top, 
indexed).

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8FMA) || 
__ARM_FEATURE_SSVE_FP8FMA
  svfloat32_t svmlalltt_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_15, fpm_t fpm);
---
 clang/include/clang/Basic/arm_sve.td  |  31 ++
 .../fp8-intrinsics/acle_sve2_fp8_fmla.c   | 389 ++
 .../aarch64-sve2-intrinsics/acle_sve2_fp8.c   |  53 ++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td |  19 +
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  24 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td|  22 +-
 llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll | 

[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-13 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov updated 
https://github.com/llvm/llvm-project/pull/118126

>From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001
From: Momchil Velikov 
Date: Mon, 25 Nov 2024 17:21:55 +0000
Subject: [PATCH 1/5] [AArch64] Implements FP8 SVE intrinsics for dot-product

This patch adds the following intrinsics:

* 8-bit floating-point dot product to single-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT4) || 
__ARM_FEATURE_SSVE_FP8DOT4
  svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t 
zm, fpm_t fpm);
  svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t 
zm, fpm_t fpm);

* 8-bit floating-point indexed dot product to single-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT4) || 
__ARM_FEATURE_SSVE_FP8DOT4
  svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_3, fpm_t fpm);

* 8-bit floating-point dot product to half-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT2) || 
__ARM_FEATURE_SSVE_FP8DOT2
  svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t 
zm, fpm_t fpm);
  svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t 
zm, fpm_t fpm);

* 8-bit floating-point indexed dot product to half-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT2) || 
__ARM_FEATURE_SSVE_FP8DOT2
  svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_7, fpm_t fpm);
---
 clang/include/clang/Basic/arm_sve.td  |  19 +++
 clang/include/clang/Basic/arm_sve_sme_incl.td |   1 +
 clang/lib/CodeGen/CGBuiltin.cpp   |  11 +-
 .../fp8-intrinsics/acle_sve2_fp8_fdot.c   | 149 ++
 .../aarch64-sve2-intrinsics/acle_sve2_fp8.c   |  23 ++-
 clang/utils/TableGen/SveEmitter.cpp   |   9 +-
 llvm/include/llvm/IR/IntrinsicsAArch64.td |  16 ++
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |   9 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td|  29 +++-
 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll |  41 +
 10 files changed, 293 insertions(+), 14 deletions(-)
 create mode 100644 
clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll

diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td
index b9f40faf0b18e6..2c8ca8014387d3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = 
"sme2,fp8" in {
   def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>",  "f", MergeNone, 
"aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
   def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, 
"aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
 }
+
+let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
+  // 8-bit floating-point dot product to half-precision (vectors)
+  def SVFDOT_2WAY   :  SInst<"svdot[_f16_mf8]_fpm",   "dd~~>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_2WAY :  SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to half-precision (indexed)
+  def SVFDOT_LANE_2WAY :  SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
+  // 8-bit floating-point dot product to single-precision (vectors)
+  def SVFDOT_4WAY   : SInst<"svdot[_f32_mf8]_fpm",   "dd~~>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to single-precision (indexed)
+  def SVFDOT_LANE_4WAY :  SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_3>]>;
+}
+
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td 
b/clang/include/clang/Basic/arm_sve_sme_incl.td
index de10be7bdce0db..44201b15505599 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -105,6 +105,7 @@ include "arm_immcheck_incl.td"
 // N: svfloat64_t
 // $: svbfloat16_t
 // ~: svmfloat8_t
+// !: mfloat8_t (splat to svmfloat8_t)
 
 // J: Prefetch type (sv_prfop)
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 49a4c1ecc825e7..84048a4beac2c5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBui

[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-13 Thread Momchil Velikov via cfe-commits


@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2,+fp8,+fp8fma  < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define <vscale x 8 x half> @fmla_2way_bot(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_2way_bot:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fmla_2way_top(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_2way_top:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fdot_2way_bot_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_2way_bot_lane:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b[3]
+; CHECK-NEXT:ret
+%r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ret <vscale x 8 x half> %r
+}
+
+define  @fdot_2way_top_lane( %a,  %s1,  %s2) {

momchil-velikov wrote:

Done

https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-13 Thread Momchil Velikov via cfe-commits


@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2,+fp8,+fp8fma  < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define  @fmla_2way_bot( %a,  %s1,  %s2) {
+; CHECK-LABEL: fmla_2way_bot:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call  @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( %a,  %s1,  %s2)
+ret  %r
+}
+
+define  @fmla_2way_top( %a,  %s1,  %s2) {
+; CHECK-LABEL: fmla_2way_top:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call  @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( %a,  %s1,  %s2)
+ret  %r
+}
+
+define  @fdot_2way_bot_lane( %a,  %s1,  %s2) {

momchil-velikov wrote:

Done

https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-13 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov updated 
https://github.com/llvm/llvm-project/pull/118126

>From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001
From: Momchil Velikov 
Date: Mon, 25 Nov 2024 17:21:55 +
Subject: [PATCH 1/4] [AArch64] Implements FP8 SVE intrinsics for dot-product

This patch adds the following intrinsics:

* 8-bit floating-point dot product to single-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT4) || 
__ARM_FEATURE_SSVE_FP8DOT4
  svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t 
zm, fpm_t fpm);
  svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t 
zm, fpm_t fpm);

* 8-bit floating-point indexed dot product to single-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT4) || 
__ARM_FEATURE_SSVE_FP8DOT4
  svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_3, fpm_t fpm);

* 8-bit floating-point dot product to half-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT2) || 
__ARM_FEATURE_SSVE_FP8DOT2
  svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t 
zm, fpm_t fpm);
  svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t 
zm, fpm_t fpm);

* 8-bit floating-point indexed dot product to half-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT2) || 
__ARM_FEATURE_SSVE_FP8DOT2
  svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_7, fpm_t fpm);
---
 clang/include/clang/Basic/arm_sve.td  |  19 +++
 clang/include/clang/Basic/arm_sve_sme_incl.td |   1 +
 clang/lib/CodeGen/CGBuiltin.cpp   |  11 +-
 .../fp8-intrinsics/acle_sve2_fp8_fdot.c   | 149 ++
 .../aarch64-sve2-intrinsics/acle_sve2_fp8.c   |  23 ++-
 clang/utils/TableGen/SveEmitter.cpp   |   9 +-
 llvm/include/llvm/IR/IntrinsicsAArch64.td |  16 ++
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |   9 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td|  29 +++-
 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll |  41 +
 10 files changed, 293 insertions(+), 14 deletions(-)
 create mode 100644 
clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll

diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td
index b9f40faf0b18e6..2c8ca8014387d3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = 
"sme2,fp8" in {
   def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>",  "f", MergeNone, 
"aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
   def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, 
"aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
 }
+
+let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
+  // 8-bit floating-point dot product to half-precision (vectors)
+  def SVFDOT_2WAY   :  SInst<"svdot[_f16_mf8]_fpm",   "dd~~>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_2WAY :  SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to half-precision (indexed)
+  def SVFDOT_LANE_2WAY :  SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
+  // 8-bit floating-point dot product to single-precision (vectors)
+  def SVFDOT_4WAY   : SInst<"svdot[_f32_mf8]_fpm",   "dd~~>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to single-precision (indexed)
+  def SVFDOT_LANE_4WAY :  SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_3>]>;
+}
+
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td 
b/clang/include/clang/Basic/arm_sve_sme_incl.td
index de10be7bdce0db..44201b15505599 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -105,6 +105,7 @@ include "arm_immcheck_incl.td"
 // N: svfloat64_t
 // $: svbfloat16_t
 // ~: svmfloat8_t
+// !: mfloat8_t (splat to svmfloat8_t)
 
 // J: Prefetch type (sv_prfop)
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 49a4c1ecc825e7..84048a4beac2c5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBui

[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-11 Thread via cfe-commits


@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2,+fp8,+fp8fma  < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define  @fmla_2way_bot( %a,  %s1,  %s2) {
+; CHECK-LABEL: fmla_2way_bot:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call  @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( %a,  %s1,  %s2)
+ret  %r
+}
+
+define  @fmla_2way_top( %a,  %s1,  %s2) {
+; CHECK-LABEL: fmla_2way_top:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call  @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( %a,  %s1,  %s2)
+ret  %r
+}
+
+define  @fdot_2way_bot_lane( %a,  %s1,  %s2) {
+; CHECK-LABEL: fdot_2way_bot_lane:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b[3]
+; CHECK-NEXT:ret
+%r = call  
@llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16( %a,  %s1,  %s2, i32 3)
+ret  %r
+}
+
+define  @fdot_2way_top_lane( %a,  %s1,  %s2) {

CarolineConcatto wrote:

s/dot/fmla

https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-11 Thread via cfe-commits


@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2,+fp8,+fp8fma  < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define  @fmla_2way_bot( %a,  %s1,  %s2) {
+; CHECK-LABEL: fmla_2way_bot:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalb z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call  @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( %a,  %s1,  %s2)
+ret  %r
+}
+
+define  @fmla_2way_top( %a,  %s1,  %s2) {
+; CHECK-LABEL: fmla_2way_top:
+; CHECK:   // %bb.0:
+; CHECK-NEXT:fmlalt z0.h, z1.b, z2.b
+; CHECK-NEXT:ret
+%r = call  @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( %a,  %s1,  %s2)
+ret  %r
+}
+
+define  @fdot_2way_bot_lane( %a,  %s1,  %s2) {

CarolineConcatto wrote:

s/dot/fmla

https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-11 Thread via cfe-commits

https://github.com/CarolineConcatto approved this pull request.

LGTM, just fix the test name in the fmla

https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-11 Thread via cfe-commits

https://github.com/CarolineConcatto edited 
https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-11 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov updated 
https://github.com/llvm/llvm-project/pull/118126

>From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001
From: Momchil Velikov 
Date: Mon, 25 Nov 2024 17:21:55 +
Subject: [PATCH 1/3] [AArch64] Implements FP8 SVE intrinsics for dot-product

This patch adds the following intrinsics:

* 8-bit floating-point dot product to single-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT4) || 
__ARM_FEATURE_SSVE_FP8DOT4
  svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t 
zm, fpm_t fpm);
  svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t 
zm, fpm_t fpm);

* 8-bit floating-point indexed dot product to single-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT4) || 
__ARM_FEATURE_SSVE_FP8DOT4
  svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_3, fpm_t fpm);

* 8-bit floating-point dot product to half-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT2) || 
__ARM_FEATURE_SSVE_FP8DOT2
  svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t 
zm, fpm_t fpm);
  svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t 
zm, fpm_t fpm);

* 8-bit floating-point indexed dot product to half-precision.

  // Only if (__ARM_FEATURE_SVE2  && __ARM_FEATURE_FP8DOT2) || 
__ARM_FEATURE_SSVE_FP8DOT2
  svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, 
svmfloat8_t zm,
   uint64_t imm0_7, fpm_t fpm);
---
 clang/include/clang/Basic/arm_sve.td  |  19 +++
 clang/include/clang/Basic/arm_sve_sme_incl.td |   1 +
 clang/lib/CodeGen/CGBuiltin.cpp   |  11 +-
 .../fp8-intrinsics/acle_sve2_fp8_fdot.c   | 149 ++
 .../aarch64-sve2-intrinsics/acle_sve2_fp8.c   |  23 ++-
 clang/utils/TableGen/SveEmitter.cpp   |   9 +-
 llvm/include/llvm/IR/IntrinsicsAArch64.td |  16 ++
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |   9 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td|  29 +++-
 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll |  41 +
 10 files changed, 293 insertions(+), 14 deletions(-)
 create mode 100644 
clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll

diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td
index b9f40faf0b18e6..2c8ca8014387d3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = 
"sme2,fp8" in {
   def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>",  "f", MergeNone, 
"aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
   def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, 
"aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
 }
+
+let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
+  // 8-bit floating-point dot product to half-precision (vectors)
+  def SVFDOT_2WAY   :  SInst<"svdot[_f16_mf8]_fpm",   "dd~~>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_2WAY :  SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to half-precision (indexed)
+  def SVFDOT_LANE_2WAY :  SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
+  // 8-bit floating-point dot product to single-precision (vectors)
+  def SVFDOT_4WAY   : SInst<"svdot[_f32_mf8]_fpm",   "dd~~>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to single-precision (indexed)
+  def SVFDOT_LANE_4WAY :  SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_3>]>;
+}
+
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td 
b/clang/include/clang/Basic/arm_sve_sme_incl.td
index de10be7bdce0db..44201b15505599 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -105,6 +105,7 @@ include "arm_immcheck_incl.td"
 // N: svfloat64_t
 // $: svbfloat16_t
 // ~: svmfloat8_t
+// !: mfloat8_t (splat to svmfloat8_t)
 
 // J: Prefetch type (sv_prfop)
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 49a4c1ecc825e7..84048a4beac2c5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBui

[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-11 Thread via cfe-commits

https://github.com/SpencerAbson unassigned 
https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-09 Thread Momchil Velikov via cfe-commits

https://github.com/momchil-velikov edited 
https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-12-04 Thread Jonathan Thackray via cfe-commits

https://github.com/jthackray approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/118126
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-11-29 Thread via cfe-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Momchil Velikov (momchil-velikov)


Changes



---

Patch is 105.50 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/118126.diff


16 Files Affected:

- (modified) clang/include/clang/Basic/arm_sve.td (+67) 
- (modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1) 
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+10-1) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c (+173) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c (+101) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c (+149) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c (+389) 
- (added) clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c (+103) 
- (modified) clang/utils/TableGen/SveEmitter.cpp (+9-2) 
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+65) 
- (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+29-29) 
- (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+86-9) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll (+78) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+49) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll (+41) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll (+114) 


```diff
diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td
index b36e592042da0b..1a9089c5466747 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2447,3 +2447,70 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = 
"sme2,faminmax" in {
   defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", 
"aarch64_sve_famin_u">;
   defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", 
"aarch64_sve_famax_u">;
 }
+
+let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
+  // 8-bit floating-point convert to BFloat16/Float16
+  def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point convert to BFloat16/Float16 (top)
+  def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point
+  def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, 
"aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // Single-precision convert, narrow and interleave to 8-bit floating-point 
(top and bottom)
+  def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>",  "f", MergeNone, 
"aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, 
"aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
+  // 8-bit floating-point dot product to half-precision (vectors)
+  def SVFDOT_2WAY   :  SInst<"svdot[_f16_mf8]_fpm",   "dd~~>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_2WAY :  SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to half-precision (indexed)
+  def SVFDOT_LANE_2WAY :  SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
+  // 8-bit floating-point dot product to single-precision (vectors)
+  def SVFDOT_4WAY   : SInst<"svdot[_f32_mf8]_fpm",   "dd~~>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to single-precision (indexed)
+  def SVFDOT_LANE_4WAY :  SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_3>]>;
+}
+
+let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in {
+  // 8-bit floating-point multiply-add long to half-precision (bottom)
+  def SVFMLALB   : SInst<"svmlalb[_f16_mf8]_fpm",   "dd~~>", "h", MergeNone, 
"aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, 
"aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point multiply-add long to half-precision (bottom, 
indexed)
+  def SVFML

[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

2024-11-29 Thread via cfe-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-ir

Author: Momchil Velikov (momchil-velikov)


Changes



---

Patch is 105.50 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/118126.diff


16 Files Affected:

- (modified) clang/include/clang/Basic/arm_sve.td (+67) 
- (modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1) 
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+10-1) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c (+173) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c (+101) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c (+149) 
- (added) clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c (+389) 
- (added) clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c (+103) 
- (modified) clang/utils/TableGen/SveEmitter.cpp (+9-2) 
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+65) 
- (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+29-29) 
- (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+86-9) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll (+78) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+49) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll (+41) 
- (added) llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll (+114) 


```diff
diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td
index b36e592042da0b..1a9089c5466747 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2447,3 +2447,70 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = 
"sme2,faminmax" in {
   defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", 
"aarch64_sve_famin_u">;
   defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", 
"aarch64_sve_famax_u">;
 }
+
+let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
+  // 8-bit floating-point convert to BFloat16/Float16
+  def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point convert to BFloat16/Float16 (top)
+  def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, 
"aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point
+  def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, 
"aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // Single-precision convert, narrow and interleave to 8-bit floating-point 
(top and bottom)
+  def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>",  "f", MergeNone, 
"aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, 
"aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
+  // 8-bit floating-point dot product to half-precision (vectors)
+  def SVFDOT_2WAY   :  SInst<"svdot[_f16_mf8]_fpm",   "dd~~>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_2WAY :  SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to half-precision (indexed)
+  def SVFDOT_LANE_2WAY :  SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
+  // 8-bit floating-point dot product to single-precision (vectors)
+  def SVFDOT_4WAY   : SInst<"svdot[_f32_mf8]_fpm",   "dd~~>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, 
"aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point dot product to single-precision (indexed)
+  def SVFDOT_LANE_4WAY :  SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", 
MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], 
[ImmCheck<3, ImmCheck0_3>]>;
+}
+
+let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in {
+  // 8-bit floating-point multiply-add long to half-precision (bottom)
+  def SVFMLALB   : SInst<"svmlalb[_f16_mf8]_fpm",   "dd~~>", "h", MergeNone, 
"aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>;
+  def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, 
"aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>;
+
+  // 8-bit floating-point multiply-add long to half-precision (bottom, 
indexed)
+  def SVF