kpyzhov created this revision.
kpyzhov added a reviewer: yaxunl.
kpyzhov added a project: AMDGPU.
Herald added subscribers: llvm-commits, cfe-commits, hiraditya, t-tye, tpr, 
dstuttard, nhaehnle, wdng, jvesely, kzhuravl, arsenm.
Herald added projects: clang, LLVM.

Added declarations of MFMA built-ins to clang. Modified declarations of 
corresponding LLVM intrinsics. Added tests for new built-ins.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D72723

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/Basic/Targets/AMDGPU.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -353,6 +353,7 @@
   bool HasMAIInsts;
   bool HasPkFmacF16Inst;
   bool HasAtomicFaddInsts;
+  bool HasMfma1Insts;
   bool EnableSRAMECC;
   bool DoesNotSupportSRAMECC;
   bool HasNoSdstCMPX;
Index: llvm/lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPU.td
+++ llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -484,6 +484,12 @@
   "Does not need SW waitstates"
 >;
 
+def FeatureMfma1Insts : SubtargetFeature<"mfma1-insts",
+  "HasMfma1Insts",
+  "true",
+  "Has MFMA1 instructions"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -818,6 +824,7 @@
    FeatureAtomicFaddInsts,
    FeatureSRAMECC,
    FeatureMFMAInlineLiteralBug,
+   FeatureMfma1Insts,
    FeatureCodeObjectV3]>;
 
 def FeatureISAVersion9_0_9 : FeatureSet<
@@ -1161,6 +1168,9 @@
 def EnableLateCFGStructurize : Predicate<
   "EnableLateStructurizeCFG">;
 
+def HasMfma1Insts : Predicate<"Subtarget->hasMfma1Insts()">,
+  AssemblerPredicate<"FeatureMfma1Insts">;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1725,105 +1725,125 @@
 def int_amdgcn_global_atomic_fadd    : AMDGPUGlobalAtomicNoRtn;
 
 // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
-def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_4x4x1f32 : Intrinsic<[llvm_v4f32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_32x32x2f32 : Intrinsic<[llvm_v16f32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_16x16x4f32 : Intrinsic<[llvm_v4f32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_4x4x4f16 : Intrinsic<[llvm_v4f32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_32x32x8f16 : Intrinsic<[llvm_v16f32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_16x16x16f16 : Intrinsic<[llvm_v4f32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_i32_32x32x4i8 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_i32_ty, llvm_i32_ty, llvm_v32i32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_i32_16x16x4i8 : Intrinsic<[llvm_v16i32_ty],
-  [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_i32_4x4x4i8 : Intrinsic<[llvm_v4i32_ty],
-  [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_i32_32x32x8i8 : Intrinsic<[llvm_v16i32_ty],
-  [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_i32_16x16x16i8 : Intrinsic<[llvm_v4i32_ty],
-  [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_4x4x2bf16 : Intrinsic<[llvm_v4f32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_32x32x4bf16 : Intrinsic<[llvm_v16f32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
-
-def int_amdgcn_mfma_f32_16x16x8bf16 : Intrinsic<[llvm_v4f32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
-   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">,
+  Intrinsic<[llvm_v32f32_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_16x16x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x1f32">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_4x4x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x1f32">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_32x32x2f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x2f32">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_16x16x4f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4f32">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_32x32x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4f16">,
+  Intrinsic<[llvm_v32f32_ty],
+            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_16x16x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4f16">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_4x4x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x4f16">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_32x32x8f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x8f16">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_16x16x16f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x16f16">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_i32_32x32x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_32x32x4i8">,
+  Intrinsic<[llvm_v32i32_ty],
+            [llvm_i32_ty, llvm_i32_ty, llvm_v32i32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_i32_16x16x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_16x16x4i8">,
+  Intrinsic<[llvm_v16i32_ty],
+            [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_i32_4x4x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_4x4x4i8">,
+  Intrinsic<[llvm_v4i32_ty],
+            [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_i32_32x32x8i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_32x32x8i8">,
+  Intrinsic<[llvm_v16i32_ty],
+            [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_i32_16x16x16i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_16x16x16i8">,
+  Intrinsic<[llvm_v4i32_ty],
+            [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_32x32x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x2bf16">,
+  Intrinsic<[llvm_v32f32_ty],
+            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_16x16x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x2bf16">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_4x4x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x2bf16">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_32x32x4bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4bf16">,
+  Intrinsic<[llvm_v16f32_ty],
+            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+
+def int_amdgcn_mfma_f32_16x16x8bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x8bf16">,
+  Intrinsic<[llvm_v4f32_ty],
+            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
+            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
 
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
Index: clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl
===================================================================
--- clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl
@@ -0,0 +1,160 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx908 -verify -S -o - %s
+
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+typedef int    v16i  __attribute__((ext_vector_type(16)));
+typedef int    v32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+
+void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x1f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x1f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x1f32' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x1f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x1f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x1f32' must be a constant integer}}
+}
+
+void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x1f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x1f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x1f32' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x2f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x2f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x2f32' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4f32' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4f32' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4f16' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4f16' must be a constant integer}}
+}
+
+void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4f16' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8f16' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16f16' must be a constant integer}}
+}
+
+void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c, int d)
+{
+  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x4i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x4i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x4i8' must be a constant integer}}
+}
+
+void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c, int d)
+{
+  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x4i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x4i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x4i8' must be a constant integer}}
+}
+
+void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c, int d)
+{
+  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_4x4x4i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_4x4x4i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_4x4x4i8' must be a constant integer}}
+}
+
+void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c, int d)
+{
+  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x8i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x8i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x8i8' must be a constant integer}}
+}
+
+void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c, int d)
+{
+  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x16i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x16i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x16i8' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x2bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x2bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x2bf16' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x2bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x2bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x2bf16' must be a constant integer}}
+}
+
+void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x2bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x2bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x2bf16' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16' must be a constant integer}}
+}
+
+void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c, int d)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8bf16' must be a constant integer}}
+}
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -0,0 +1,161 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck %s
+
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+typedef int    v16i  __attribute__((ext_vector_type(16)));
+typedef int    v32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+
+// CHECK-LABEL: @test_mfma_f32_32x32x1f32
+// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_16x16x1f32
+// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_4x4x1f32
+// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_32x32x2f32
+// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_16x16x4f32
+// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_32x32x4f16
+// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_16x16x4f16
+// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_4x4x4f16
+// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_32x32x8f16
+// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_16x16x16f16
+// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_i32_32x32x4i8
+// CHECK: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
+{
+  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_i32_16x16x4i8
+// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
+{
+  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_i32_4x4x4i8
+// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
+{
+  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_i32_32x32x8i8
+// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
+{
+  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_i32_16x16x16i8
+// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
+void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
+{
+  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_32x32x2bf16
+// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_16x16x2bf16
+// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_4x4x2bf16
+// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_32x32x4bf16
+// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
+}
+
+// CHECK-LABEL: @test_mfma_f32_16x16x8bf16
+// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
+{
+  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
+}
+
Index: clang/lib/Basic/Targets/AMDGPU.cpp
===================================================================
--- clang/lib/Basic/Targets/AMDGPU.cpp
+++ clang/lib/Basic/Targets/AMDGPU.cpp
@@ -163,6 +163,7 @@
       Features["dot4-insts"] = true;
       Features["dot5-insts"] = true;
       Features["dot6-insts"] = true;
+      Features["mfma1-insts"] = true;
       LLVM_FALLTHROUGH;
     case GK_GFX906:
       Features["dl-insts"] = true;
Index: clang/include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -212,5 +212,30 @@
 BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
 BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
 
+//===----------------------------------------------------------------------===//
+// MFMA builtins (need "mfma1-insts"; trailing Ii operands are constants).
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x1f32, "V32fffV32fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x1f32, "V16fffV16fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x1f32, "V4fffV4fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x2f32, "V16fffV16fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4f32, "V4fffV4fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4f16, "V32fV4hV4hV32fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4f16, "V16fV4hV4hV16fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x4f16, "V4fV4hV4hV4fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x8f16, "V16fV4hV4hV16fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16f16, "V4fV4hV4hV4fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x4i8, "V32iiiV32iIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x4i8, "V16iiiV16iIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_4x4x4i8, "V4iiiV4iIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x8i8, "V16iiiV16iIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x16i8, "V4iiiV4iIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x2bf16, "V32fV2sV2sV32fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x2bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x2bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mfma1-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mfma1-insts")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to