[PATCH] D149976: adding bf16 support to NVPTX

2023-05-17 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam abandoned this revision.
kushanam added a comment.

Duplicate of D144911 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D149976: adding bf16 support to NVPTX

2023-05-17 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 523106.
kushanam added a comment.

adding cland directives and removing bf16 registers

Depends on D144911 

Differential Revision: https://reviews.llvm.org/D144911


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp

Index: llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -29,14 +29,13 @@
 std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return ".f32";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 // Ideally fp16 registers should be .f16, but this syntax is only
 // supported on sm_53+. On the other hand, .b16 registers are
 // accepted for all supported fp16 instructions on all GPU
 // variants, so we can use them instead.
 return ".b16";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return ".b32";
   if (RC == &NVPTX::Float64RegsRegClass)
 return ".f64";
@@ -74,10 +73,9 @@
 std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return "%f";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 return "%h";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return "%hh";
   if (RC == &NVPTX::Float64RegsRegClass)
 return "%fd";
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,9 +998,6 @@
 FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
   Float16x2Regs, [hasPTX70, hasSM80]>,
 
-// FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, BFloat16Regs,
-//   [hasPTX70, hasSM80]>,
-
 FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, BFloat16x2Regs,
   [hasPTX70, hasSM80]>,
 FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, BFloat16x2Regs,
@@ -1254,24 +1251,6 @@
 def : Pat<(int_nvvm_ff2bf16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
   (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
 
-// def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
-// def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
-
-// def : Pat<(int_nvvm_f2bf16_rn Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
-// def : Pat<(int_nvvm_f2bf16_rn_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_f2bf16_rz Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
-// def : Pat<(int_nvvm_f2bf16_rz_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
-
 def CVT_tf32_f32 :
NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
"cvt.rna.tf32.f32 \t$dest, $a;",
@@ -1387,11 +1366,6 @@
 def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
   (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN))>;
 
-// def : Pat<(int_nvvm_bf2h_rn_ftz Float32Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 Float32Regs:$a, CvtRN_FTZ))>;
-// def : Pat<(int_nvvm_f2h_rn BFloat16Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 BFloat16Regs:$a, CvtRN))>;
-
 //
 // Bitcast
 //
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -656,15 +656,6 @@
   def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
 "cvt.s64.s32 \t$dst, $src;", []>;
 
-multiclass CVT_FROM_FLOAT_SM80 {
-def _f32 :
-  NVPTXInst<(outs RC:$dst),
-

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 522290.
kushanam added a comment.

Adressing review changes and removing bf16 registers


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 522284.
kushanam added a comment.

Addressing review changed and removing bf16 registers

Depends on D144911 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrins

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam abandoned this revision.
kushanam added a comment.

Redundant to D144911 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 522283.
kushanam added a comment.

Addressing the review changed adn removing bf16 specific registers


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case I

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-07 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 520215.
kushanam added a comment.

removing bf16 specific registers classes


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-07 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 520210.
kushanam added a comment.

Addressing the review comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-05 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

This patch appears to *include* the changes in D144911 
 (e.g. 
llvm/test/CodeGen/NVPTX/bf16-instructions.ll is added by both patches). Can you 
update it as an incremental patch that actually excludes it?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D149976: adding bf16 support to NVPTX

2023-05-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam created this revision.
Herald added subscribers: mattd, gchakrabarti, asavonic, hiraditya.
Herald added a project: All.
kushanam requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, jholewinski.
Herald added projects: clang, LLVM.

Currently, bf16 has been scatteredly added to the PTX codegen. This patch aims 
to complete the set of instructions and code path required to support bf16 data 
type.

Depends on D144911 


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat>