[PATCH] D144911: adding bf16 support to NVPTX

2023-06-23 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 534050.
kushanam added a comment.

fixing the f16x2-instructions test


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll
  llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

Index: llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
===
--- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -31,7 +31,9 @@
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; CHECK-LABEL: test_ret_const(
-; CHECK: mov.b32 [[R:%r[0-9+]]], 1073757184;
+; CHECK: mov.b16 [[A:%rs[0-9+]]], 0x4000;
+; CHECK: mov.b16 [[B:%rs[0-9+]]], 0x3C00;
+; CHECK: mov.b32 [[R:%r[0-9+]]], {[[B]], [[A]]};
 ; CHECK: st.param.b32[func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_ret_const() #0 {
@@ -100,8 +102,10 @@
 ; CHECK-LABEL: test_fadd_imm_0(
 ; CHECK-DAG:  ld.param.b32[[A:%r[0-9]+]], [test_fadd_imm_0_param_0];
 ;
-; CHECK-F16:mov.b32[[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[I]];
+; CHECK-F16:mov.b16[[B:%rs[0-9+]]], 0x4000;
+; CHECK-F16:mov.b16[[C:%rs[0-9+]]], 0x3C00;
+; CHECK-F16:mov.b32[[D:%r[0-9+]]], {[[C]], [[B]]};
+; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[D]];
 ;
 ; CHECK-NOF16-DAG:  mov.b32{[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16[[FA0:%f[0-9]+]], [[A0]]
@@ -122,8 +126,10 @@
 ; CHECK-LABEL: test_fadd_imm_1(
 ; CHECK-DAG:  ld.param.b32[[B:%r[0-9]+]], [test_fadd_imm_1_param_0];
 ;
-; CHECK-F16:mov.b32[[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[B]], [[I]];
+; CHECK-F16:mov.b16[[B:%rs[0-9+]]], 0x4000;
+; CHECK-F16:mov.b16[[C:%rs[0-9+]]], 0x3C00;
+; CHECK-F16:mov.b32[[D:%r[0-9+]]], {[[C]], [[B]]};
+; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[D]];
 ;
 ; CHECK-NOF16-DAG:  mov.b32{[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16[[FA0:%f[0-9]+]], [[A0]]
@@ -169,8 +175,10 @@
 ; CHECK-LABEL: test_fneg(
 ; CHECK-DAG:  ld.param.b32[[A:%r[0-9]+]], [test_fneg_param_0];
 ;
-; CHECK-F16:mov.b32[[I:%r[0-9+]]], 0;
-; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[I]], [[A]];
+; CHECK-F16:mov.b16[[B:%rs[0-9+]]], 0x;
+; CHECK-F16:mov.b32[[C:%r[0-9+]]], {[[B]], [[B]]};
+;
+; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[C]], [[A]];
 ;
 ; CHECK-NOF16-DAG:  mov.b32{[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16[[FA0:%f[0-9]+]], [[A0]]
Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-22 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 533694.
kushanam added a comment.

Updating the bf16 conversion arc requirements


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_ext

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-20 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 533051.
kushanam added a comment.

fixing bf16 test cases


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK:  

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-13 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 531089.
kushanam added a comment.

renaming useBF16Math to hasBF16Math


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_extract_1(
+;

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-13 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 531068.
kushanam added a comment.

add constantFP for bf16


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK: 

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-09 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 530123.
kushanam added a comment.

Adding new bf16 tests and refactoring the code with new conversion


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,131 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extra

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-08 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 529515.
kushanam added a comment.

updating the bf16 resigter types


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_f16x2:
   return {Intrinsic::fma, FTZ_MustBeOn, true};
+case Intrinsic::nvvm_fma_rn_bf16:
+  return {Intrinsic::fma, FTZ_

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-07 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 529494.
kushanam added a comment.

rebasing and readding the first commit changes


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_f16x2:
   return {Intrinsic::fma, FTZ_MustBeOn, true};
+case Intri

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-06 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:615
 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
-  }
-
-  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);

tra wrote:
> There's still something odd with this patch. The `setBF16OperationAction` is 
> not in the upstream, but it does not show up in the diff on phabricator. 
> 
> Please do rebase on top of the LLVM and make sure that all your changes are 
> on the git branch you use to send the patch to phabricator. If in doubt how 
> to get `arc` to do it correctly, you can always create and upload the diff 
> manually as described here: 
> https://llvm.org/docs/Phabricator.html#requesting-a-review-via-the-web-interface
It is in the first commit, isn't it?https://reviews.llvm.org/D144911?id=500896


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam added a comment.

In D144911#4389187 , @manishucsd 
wrote:

> I tried this patch to resolve lowering `nvvm.mma.sync` on bf16 operands for 
> the PR 13858 
>
>   nvvm.mma.sync A[%293, %295, %297, %299]  B[%392, %394]  C[%484, %485, %487, 
> %488]  {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, 
> multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = 
> #nvvm.mma_type, shape = #nvvm.shape} : 
> (vector<2xbf16>, vector<2xbf16>, f32) -> !llvm.struct<(f32, f32, f32, f32)>
>
> I fail to compile this patch. Please find the compilation error below:
>
>   [build] ./llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:1117:40: 
> error: Variable not defined: 'hasPTX70'
>   [build] Requires<[useFP16Math, hasPTX70, hasSM80, Pred]>;
>   [build]^
>
>   

Could you try again, I rebased and refactored the code


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 528526.
kushanam added a comment.

adding min and max for bf16 and refactoring the code


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp


Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -150,23 +150,11 @@
 }
 
 static bool Isv2f16Orv2bf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::v2f16:
-  case MVT::v2bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::v2f16 || VT.SimpleTy == MVT::v2bf16);
 }
 
 static bool Isf16Orbf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::f16:
-  case MVT::bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16);
 }
 
 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
@@ -624,9 +612,6 @@
   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
-  }
-
-  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
   }
@@ -693,20 +678,18 @@
   };
   for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
+setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Promote), Promote);
 setOperationAction(Op, MVT::f32, Legal);
 setOperationAction(Op, MVT::f64, Legal);
 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
+setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
   }
   for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
+setFP16OperationAction(Op, MVT::bf16, GetMinMaxAction(Expand), Expand);
 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
-  }
-  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
-setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Promote), Promote);
-setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
-setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Expand), Expand);
-setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
+setFP16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
   }
 
   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
Index: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1294,14 +1294,11 @@
 NumElts = EltVT.getVectorNumElements();
 EltVT = EltVT.getVectorElementType();
 // vectors of f16 are loaded/stored as multiples of v2f16 elements.
-if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
-  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
-  EltVT = MVT::v2f16;
-  NumElts /= 2;
-} else if (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) {
-  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
-  EltVT = MVT::v2bf16;
-  NumElts /= 2;
+if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) || 
+(EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16)) {
+  assert(NumElts % 2 == 0 && "Vector must have even number of 
elements");
+  EltVT = N->getValueType(0);
+  NumElts /= 2;
 }
   }
 


Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -150,23 +150,11 @@
 }
 
 static bool Isv2f16Orv2bf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::v2f16:
-  case MVT::v2bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::v2f16 || VT.SimpleTy == MVT::v2bf16);
 }
 
 static bool Isf16Orbf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::f16:
-  case MVT::bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16);
 }
 
 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
@@ -624,9 +612,6 @@
   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setFP16OperationAction(Op, MVT::f16, Legal, Promote)

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 528475.
kushanam added a comment.

Rebasing the D144911  patch


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td


Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,17 +998,17 @@
 FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Float16Regs,
   [hasPTX<70>, hasSM<80>]>,
 
-FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX70, 
hasSM80]>,
+FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX<70>, 
hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_bf16", int_nvvm_fma_rn_ftz_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_sat_bf16", int_nvvm_fma_rn_sat_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_sat_bf16", int_nvvm_fma_rn_ftz_sat_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_relu_bf16", int_nvvm_fma_rn_ftz_relu_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 
 FMA_TUPLE<"_rn_f16x2", int_nvvm_fma_rn_f16x2, Float16x2Regs,
   [hasPTX<42>, hasSM<53>]>,
@@ -1022,10 +1022,10 @@
   [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
   Float16x2Regs, [hasPTX<70>, hasSM<80>]>,
-FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, Int32Regs,
-  [hasPTX70, hasSM80]>,
-FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, Int32Regs,
-  [hasPTX70, hasSM80]>
+FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, BFloat16x2Regs,
+  [hasPTX<70>, hasSM<80>]>,
+FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, BFloat16x2Regs,
+  [hasPTX<70>, hasSM<80>]>
   ] in {
 def P.Variant :
   F_MATH_3,
-Requires<[useFP16Math, hasPTX70, hasSM80, Pred]>;
+Requires<[useFP16Math, hasPTX<70>, hasSM<80>, Pred]>;
 def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, BFloat16Regs, 
doF32FTZ>;
 def BFNEG16   : FNEG_BF16_F16X2<"neg.bf16", bf16, BFloat16Regs, True>;
 def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, BFloat16x2Regs, 
doF32FTZ>;
@@ -3337,30 +3337,6 @@
"  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
[(set BFloat16Regs:$dst,
  (extractelt (v2bf16 BFloat16x2Regs:$src), 
1))]>;
-
-  // // Coalesce two bf16 registers into bf16x2
-  // def BuildBF16x2 : NVPTXInst<(outs BFloat16x2Regs:$dst),
-  //(ins BFloat16Regs:$a, BFloat16Regs:$b),
-  //"mov.b32 \t$dst, {{$a, $b}};",
-  //[(set (v2bf16 BFloat16x2Regs:$dst),
-  //  (build_vector (bf16 BFloat16Regs:$a), (bf16 
BFloat16Regs:$b)))]>;
-
-  // // Directly initializing underlying the b32 register is one less SASS
-  // // instruction than than vector-packing move.
-  // def BuildBF16x2i : NVPTXInst<(outs BFloat16x2Regs:$dst), (ins 
i32imm:$src),
-  // "mov.b32 \t$dst, $src;",
-  // []>;
-
-  // // Split f16x2 into two f16 registers.
-  // def SplitBF16x2  : NVPTXInst<(outs BFloat16Regs:$lo, BFloat16Regs:$hi),
-  // (ins BFloat16x2Regs:$src),
-  // "mov.b32 \t{{$lo, $hi}}, $src;",
-  // []>;
-  // // Split an i32 into two f16
-  // def SplitI32toBF16x2  : NVPTXInst<(outs BFloat16Regs:$lo, 
BFloat16Regs:$hi),
-  //  (ins Int32Regs:$src),
-  //  "mov.b32 \t{{$lo, $hi}}, $src;",
-  //  []>;
 }
 
 // Count leading zeros


Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,17 +998,17 @@
 FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Float16Regs,
   [hasPTX<70>, hasSM<80>]>,
 
-FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX70, hasSM80]>,
+FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_bf16", int_nvvm_fma_rn_ftz_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_sat_bf16", int_nvvm_fma_rn_sat_bf16, BFloat16Reg

[PATCH] D144911: adding bf16 support to NVPTX

2023-05-25 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 525896.
kushanam added a comment.

removing commented td entry


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_f16x2:
  

[PATCH] D144911: adding bf16 support to NVPTX

2023-05-25 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 525883.
kushanam added a comment.

rebasing the diff based on the branch HEAD


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ft

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-17 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam abandoned this revision.
kushanam added a comment.

Duplicate of D144911 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-05-17 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 523111.
kushanam added a comment.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Adding clang directives and removing bf16 registers


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp

Index: llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -29,14 +29,13 @@
 std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return ".f32";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 // Ideally fp16 registers should be .f16, but this syntax is only
 // supported on sm_53+. On the other hand, .b16 registers are
 // accepted for all supported fp16 instructions on all GPU
 // variants, so we can use them instead.
 return ".b16";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return ".b32";
   if (RC == &NVPTX::Float64RegsRegClass)
 return ".f64";
@@ -74,10 +73,9 @@
 std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return "%f";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 return "%h";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return "%hh";
   if (RC == &NVPTX::Float64RegsRegClass)
 return "%fd";
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,9 +998,6 @@
 FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
   Float16x2Regs, [hasPTX70, hasSM80]>,
 
-// FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, BFloat16Regs,
-//   [hasPTX70, hasSM80]>,
-
 FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, BFloat16x2Regs,
   [hasPTX70, hasSM80]>,
 FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, BFloat16x2Regs,
@@ -1254,24 +1251,6 @@
 def : Pat<(int_nvvm_ff2bf16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
   (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
 
-// def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
-// def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
-
-// def : Pat<(int_nvvm_f2bf16_rn Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
-// def : Pat<(int_nvvm_f2bf16_rn_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_f2bf16_rz Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
-// def : Pat<(int_nvvm_f2bf16_rz_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
-
 def CVT_tf32_f32 :
NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
"cvt.rna.tf32.f32 \t$dest, $a;",
@@ -1387,11 +1366,6 @@
 def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
   (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN))>;
 
-// def : Pat<(int_nvvm_bf2h_rn_ftz Float32Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 Float32Regs:$a, CvtRN_FTZ))>;
-// def : Pat<(int_nvvm_f2h_rn BFloat16Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 BFloat16Regs:$a, CvtRN))>;
-
 //
 // Bitcast
 //
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -656,15 +656,6 @@
   def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
 "cvt.s64.s32 \t$dst, $src;", []>;
 
-multiclass CVT_FROM_FLOAT_SM80 {
-def _f32 :
-  NVPTXInst<(outs RC:$dst),
-(ins Float32Regs:$src, CvtMod

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-17 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 523106.
kushanam added a comment.

adding cland directives and removing bf16 registers

Depends on D144911 

Differential Revision: https://reviews.llvm.org/D144911


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp

Index: llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -29,14 +29,13 @@
 std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return ".f32";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 // Ideally fp16 registers should be .f16, but this syntax is only
 // supported on sm_53+. On the other hand, .b16 registers are
 // accepted for all supported fp16 instructions on all GPU
 // variants, so we can use them instead.
 return ".b16";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return ".b32";
   if (RC == &NVPTX::Float64RegsRegClass)
 return ".f64";
@@ -74,10 +73,9 @@
 std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return "%f";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 return "%h";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return "%hh";
   if (RC == &NVPTX::Float64RegsRegClass)
 return "%fd";
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,9 +998,6 @@
 FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
   Float16x2Regs, [hasPTX70, hasSM80]>,
 
-// FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, BFloat16Regs,
-//   [hasPTX70, hasSM80]>,
-
 FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, BFloat16x2Regs,
   [hasPTX70, hasSM80]>,
 FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, BFloat16x2Regs,
@@ -1254,24 +1251,6 @@
 def : Pat<(int_nvvm_ff2bf16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
   (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
 
-// def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
-// def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
-
-// def : Pat<(int_nvvm_f2bf16_rn Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
-// def : Pat<(int_nvvm_f2bf16_rn_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_f2bf16_rz Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
-// def : Pat<(int_nvvm_f2bf16_rz_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
-
 def CVT_tf32_f32 :
NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
"cvt.rna.tf32.f32 \t$dest, $a;",
@@ -1387,11 +1366,6 @@
 def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
   (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN))>;
 
-// def : Pat<(int_nvvm_bf2h_rn_ftz Float32Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 Float32Regs:$a, CvtRN_FTZ))>;
-// def : Pat<(int_nvvm_f2h_rn BFloat16Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 BFloat16Regs:$a, CvtRN))>;
-
 //
 // Bitcast
 //
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -656,15 +656,6 @@
   def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
 "cvt.s64.s32 \t$dst, $src;", []>;
 
-multiclass CVT_FROM_FLOAT_SM80 {
-def _f32 :
-  NVPTXInst<(outs RC:$dst),
-

[PATCH] D150603: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam abandoned this revision.
kushanam added a comment.

redundant revision again on D144911 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150603/new/

https://reviews.llvm.org/D150603

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D150603: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam created this revision.
Herald added subscribers: mattd, gchakrabarti, asavonic, hiraditya.
Herald added a project: All.
kushanam requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, jholewinski.
Herald added projects: clang, LLVM.

Currently, bf16 has been scatteredly added to the PTX codegen. This patch aims 
to complete the set of instructions and code path required to support bf16 data 
type.

Depends on D144911 


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D150603

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Targ

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 522290.
kushanam added a comment.

Adressing review changes and removing bf16 registers


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 522284.
kushanam added a comment.

Addressing review changed and removing bf16 registers

Depends on D144911 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrins

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam abandoned this revision.
kushanam added a comment.

Redundant to D144911 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D149976: adding bf16 support to NVPTX

2023-05-15 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 522283.
kushanam added a comment.

Addressing the review changed adn removing bf16 specific registers


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case I

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-07 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 520215.
kushanam added a comment.

removing bf16 specific registers classes


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-07 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 520210.
kushanam added a comment.

Addressing the review comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D149976/new/

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 

[PATCH] D149976: adding bf16 support to NVPTX

2023-05-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam created this revision.
Herald added subscribers: mattd, gchakrabarti, asavonic, hiraditya.
Herald added a project: All.
kushanam requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, jholewinski.
Herald added projects: clang, LLVM.

Currently, bf16 has been scatteredly added to the PTX codegen. This patch aims 
to complete the set of instructions and code path required to support bf16 data 
type.

Depends on D144911 


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D149976

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat>