[PATCH] D151876: [NVPTX] Signed char and (unsigned)long overloads of ldg and ldu

2023-06-02 Thread Jakub Chlanda via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG3e37c98bdb51: [cuda, NVPTX] Signed char and (unsigned)long 
builtins of ldg and ldu (authored by jchlanda).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D151876/new/

https://reviews.llvm.org/D151876

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx.c
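
For a quick illustration (not part of the patch), a device-side sketch using
the new builtin names exercised in the test changes below; the `ulong2`
typedef mirrors the one added to the test:

```
typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));

__device__ long demo(const signed char *sc, const ulong2 *ul) {
  signed char c = __nvvm_ldg_sc(sc); // read-only global load of a signed char
  ulong2 v = __nvvm_ldg_ul2(ul);     // 2 x (unsigned) long; width follows the target
  return (long)c + (long)v.x;
}
```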

Index: clang/test/CodeGen/builtins-nvptx.c
===
--- clang/test/CodeGen/builtins-nvptx.c
+++ clang/test/CodeGen/builtins-nvptx.c
@@ -554,10 +554,12 @@
 
 // CHECK-LABEL: nvvm_ldg
 __device__ void nvvm_ldg(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   __nvvm_ldg_c((const char *)p);
   __nvvm_ldg_uc((const unsigned char *)p);
+  __nvvm_ldg_sc((const signed char *)p);
 
   // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
@@ -590,19 +592,25 @@
   // elements, its alignment is set to number of elements times the alignment of
   // its member: n*alignof(t)."
 
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+  typedef signed char schar2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_c2((const char2 *)p);
   __nvvm_ldg_uc2((const uchar2 *)p);
+  __nvvm_ldg_sc2((const schar2 *)p);
 
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
   // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
   // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
   typedef char char4 __attribute__((ext_vector_type(4)));
   typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+  typedef signed char schar4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_c4((const char4 *)p);
   __nvvm_ldg_uc4((const uchar4 *)p);
+  __nvvm_ldg_sc4((const schar4 *)p);
 
   // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
   // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
@@ -632,6 +640,15 @@
   __nvvm_ldg_i4((const int4 *)p);
   __nvvm_ldg_ui4((const uint4 *)p);
 
+  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  typedef long long2 __attribute__((ext_vector_type(2)));
+  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_l2((const long2 *)p);
+  __nvvm_ldg_ul2((const ulong2 *)p);
+
   // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
   // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
   typedef long long longlong2 __attribute__((ext_vector_type(2)));
@@ -654,10 +671,12 @@
 
 // CHECK-LABEL: nvvm_ldu
 __device__ void nvvm_ldu(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   __nvvm_ldu_c((const char *)p);
   __nvvm_ldu_uc((const unsigned char *)p);
+  __nvvm_ldu_sc((const signed char *)p);
 
   // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
@@ -681,19 +700,25 @@
   // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
   __nvvm_ldu_d((const double *)p);
 
+  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+  typedef signed char schar2 __attribute__((ext_vector_type(2)));
   __nvvm_ldu_c2((const char2 *)p);
   __nvvm_ldu_uc2((const uchar2 *)p);
+  __nvvm_ldu_sc2((const schar2 *)p);
 
+  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
   // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr 

[PATCH] D151876: [NVPTX] Signed char and (unsigned)long overloads of ldg and ldu

2023-06-02 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

In D151876#433, @tra wrote:

> I'd change the patch title:
>
> - `[NVPTX]` -> `[cuda, NVPTX]` as these are clang changes, not NVPTX back-end.
> - `overloads ` -> `builtins`

Sure, will do. Thank you.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D151876/new/

https://reviews.llvm.org/D151876



[PATCH] D151876: [NVPTX] Signed char and (unsigned)long overloads of ldg and ldu

2023-06-01 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
jchlanda added a reviewer: tra.
Herald added subscribers: mattd, gchakrabarti, asavonic.
Herald added a project: All.
jchlanda requested review of this revision.
Herald added subscribers: cfe-commits, jholewinski.
Herald added a project: clang.

This is a follow up to:

- https://reviews.llvm.org/rG7258317bade0fd82e257e47b31eee3ad0c6c5305
- https://reviews.llvm.org/rG71b06585857a77691761a7bfd16b5b91454a6894


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D151876

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx.c

Index: clang/test/CodeGen/builtins-nvptx.c
===
--- clang/test/CodeGen/builtins-nvptx.c
+++ clang/test/CodeGen/builtins-nvptx.c
@@ -554,10 +554,12 @@
 
 // CHECK-LABEL: nvvm_ldg
 __device__ void nvvm_ldg(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   __nvvm_ldg_c((const char *)p);
   __nvvm_ldg_uc((const unsigned char *)p);
+  __nvvm_ldg_sc((const signed char *)p);
 
   // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
@@ -590,19 +592,25 @@
   // elements, its alignment is set to number of elements times the alignment of
   // its member: n*alignof(t)."
 
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+  typedef signed char schar2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_c2((const char2 *)p);
   __nvvm_ldg_uc2((const uchar2 *)p);
+  __nvvm_ldg_sc2((const schar2 *)p);
 
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
   // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
   // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
   typedef char char4 __attribute__((ext_vector_type(4)));
   typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+  typedef signed char schar4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_c4((const char4 *)p);
   __nvvm_ldg_uc4((const uchar4 *)p);
+  __nvvm_ldg_sc4((const schar4 *)p);
 
   // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
   // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
@@ -632,6 +640,15 @@
   __nvvm_ldg_i4((const int4 *)p);
   __nvvm_ldg_ui4((const uint4 *)p);
 
+  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  typedef long long2 __attribute__((ext_vector_type(2)));
+  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_l2((const long2 *)p);
+  __nvvm_ldg_ul2((const ulong2 *)p);
+
   // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
   // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
   typedef long long longlong2 __attribute__((ext_vector_type(2)));
@@ -654,10 +671,12 @@
 
 // CHECK-LABEL: nvvm_ldu
 __device__ void nvvm_ldu(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
   __nvvm_ldu_c((const char *)p);
   __nvvm_ldu_uc((const unsigned char *)p);
+  __nvvm_ldu_sc((const signed char *)p);
 
   // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
@@ -681,19 +700,25 @@
   // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
   __nvvm_ldu_d((const double *)p);
 
+  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+  typedef signed char schar2 __attribute__((ext_vector_type(2)));
   __nvvm_ldu_c2((const char2 *)p);
   __nvvm_ldu_uc2((const uchar2 *)p);
+  

[PATCH] D146715: [NVPTX] Enforce half type support is present for builtins

2023-03-28 Thread Jakub Chlanda via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGae3c981aa4b8: [NVPTX] Enforce half type support is present 
for builtins (authored by jchlanda).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D146715/new/

https://reviews.llvm.org/D146715

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td

Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -583,7 +583,6 @@
   "_xorsign_abs_f16", "_ftz_xorsign_abs_f16", "_nan_xorsign_abs_f16",
   "_ftz_nan_xorsign_abs_f16"] in {
   def int_nvvm_f # operation # variant :
-ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty],
   [IntrNoMem, IntrSpeculatable, Commutative]>;
 }
@@ -592,7 +591,6 @@
   "_ftz_nan_f16x2", "_xorsign_abs_f16x2", "_ftz_xorsign_abs_f16x2",
   "_nan_xorsign_abs_f16x2", "_ftz_nan_xorsign_abs_f16x2"] in {
   def int_nvvm_f # operation # variant :
-ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty],
   [IntrNoMem, IntrSpeculatable, Commutative]>;
 }
@@ -828,9 +826,9 @@
   DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
   def int_nvvm_ex2_approx_d : ClangBuiltin<"__nvvm_ex2_approx_d">,
   DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16 : ClangBuiltin<"__nvvm_ex2_approx_f16">,
+  def int_nvvm_ex2_approx_f16 :
   DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16x2 : ClangBuiltin<"__nvvm_ex2_approx_f16x2">,
+  def int_nvvm_ex2_approx_f16x2 :
   DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>;
 
   def int_nvvm_lg2_approx_ftz_f : ClangBuiltin<"__nvvm_lg2_approx_ftz_f">,
@@ -860,18 +858,16 @@
 
   foreach variant = ["_rn_f16", "_rn_ftz_f16", "_rn_sat_f16",
 "_rn_ftz_sat_f16", "_rn_relu_f16", "_rn_ftz_relu_f16"] in {
-def int_nvvm_fma # variant : ClangBuiltin,
-DefaultAttrsIntrinsic<[llvm_half_ty],
-  [llvm_half_ty, llvm_half_ty, llvm_half_ty],
-  [IntrNoMem, IntrSpeculatable]>;
+def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_half_ty],
+  [llvm_half_ty, llvm_half_ty, llvm_half_ty],
+  [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_f16x2", "_rn_ftz_f16x2", "_rn_sat_f16x2",
 "_rn_ftz_sat_f16x2", "_rn_relu_f16x2", "_rn_ftz_relu_f16x2"] in {
-def int_nvvm_fma # variant : ClangBuiltin,
-  DefaultAttrsIntrinsic<[llvm_v2f16_ty],
-[llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
-[IntrNoMem, IntrSpeculatable]>;
+def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_v2f16_ty],
+  [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
+  [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_bf16", "_rn_relu_bf16"] in {
Index: clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
===
--- clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
+++ clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
@@ -1,21 +1,119 @@
 // REQUIRES: nvptx-registered-target
 //
 // RUN: not %clang_cc1 -fsyntax-only -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_75 -target-feature +ptx70 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ERROR %s
+// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK_ERROR %s
 
 #define __device__ __attribute__((device))
 typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
-__device__ void nvvm_ldg_ldu_native_half_types(const void *p) {
-  __nvvm_ldg_h((const __fp16 *)p);
-  __nvvm_ldg_h2((const __fp16v2 *)p);
+__device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
+  __fp16v2 resv2 = {0, 0};
+  *out += __nvvm_ex2_approx_f16(*(__fp16 *)a);
+  resv2 = __nvvm_ex2_approx_f16x2(*(__fp16v2*)a);
 
-  __nvvm_ldu_h((const __fp16 *)p);
-  __nvvm_ldu_h2((const __fp16v2 *)p);
+  *out += __nvvm_fma_rn_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_ftz_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16 *)c);
+  resv2 += __nvvm_fma_rn_relu_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += __nvvm_fma_rn_ftz_relu_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  *out += __nvvm_fma_rn_ftz_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_sat_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_ftz_sat_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  

[PATCH] D146715: [NVPTX] Enforce half type support is present for builtins

2023-03-27 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda marked an inline comment as done.
jchlanda added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:18313
 // of its member: n*alignof(t)."
-return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i);
-  case NVPTX::BI__nvvm_ldg_h:
-  case NVPTX::BI__nvvm_ldg_h2:
-if (!HasHalfSupport(BuiltinID)) {
-  CGM.Error(E->getExprLoc(),
-getContext().BuiltinInfo.getName(BuiltinID).str() +
-" requires native half type support.");
-  return nullptr;
-}
-[[fallthrough]];
+return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
   case NVPTX::BI__nvvm_ldg_f:

tra wrote:
> This is where lambda would have some advantage as we could capture what it 
> needs without having to pass 'this' or 'E' explicitly.
> In this case it's not too bad, so I'm fine either way, with a very slight 
> bias towards lambdas. The static functions just don't seem to buy us anything 
> here, IMO.
My reasoning was that it keeps `CodeGenFunction::EmitNVPTXBuiltinExpr` clean:
all it does now is dispatch to the correct handler based on the builtin ID
(with the exception of the `mma`s). You are right, though, that it does so at
the price of making the signatures more verbose.
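
For illustration, a minimal sketch of the two shapes being weighed (the helper
signature follows the quoted call site; the lambda variant is hypothetical):

```
// Static helper: CGF state is passed explicitly at every call site.
static llvm::Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                               const CallExpr *E);
// ... return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);

// Lambda: captures `this` and `E`, so the dispatch switch stays terse.
// auto MakeLdgLdu = [&](unsigned IntrinsicID) { /* uses E, Builder via [&] */ };
// ... return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i);
```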


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D146715/new/

https://reviews.llvm.org/D146715



[PATCH] D146715: [NVPTX] Enforce half type support is present for builtins

2023-03-24 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:18912
+  case NVPTX::BI__nvvm_ldu_h2: {
+return MakeHalfType(BuiltinID, E, *this);
+  }

tra wrote:
> We seem to be checking builtin IDs twice. Once here and then in MakeHalfType 
> where we need to map them to intrinsics.
> I think the approach used for atomics above would work better here, too -- 
> just pass appropriate Intrinsic as a parameter to MakeHalfType and remove the 
> switch there.
> 
> Another option you may consider is that if the availability is only dependent 
> on PTX or SM version, then we may get by with declaring the builtins as 
> TARGET_BUILTIN with appropriate constraints and let the standard enforcement 
> machinery to handle diags.
I've refactored those switches and passed the intrinsic ID to the `MakeHalfType` helper.
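
A rough sketch of the resulting shape (hypothetical signature; the point is
that the intrinsic is picked once in the dispatch switch, so `MakeHalfType`
needs no second switch over builtin IDs):

```
case NVPTX::BI__nvvm_ldg_h:
case NVPTX::BI__nvvm_ldg_h2:
  return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
case NVPTX::BI__nvvm_ldu_h:
case NVPTX::BI__nvvm_ldu_h2:
  return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
```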


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D146715/new/

https://reviews.llvm.org/D146715



[PATCH] D146715: [NVPTX] Enforce half type support is present for builtins

2023-03-24 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 508046.
jchlanda added a comment.

Remove duplicated switch over builtins.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D146715/new/

https://reviews.llvm.org/D146715

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td

Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -583,7 +583,6 @@
   "_xorsign_abs_f16", "_ftz_xorsign_abs_f16", "_nan_xorsign_abs_f16",
   "_ftz_nan_xorsign_abs_f16"] in {
   def int_nvvm_f # operation # variant :
-ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty],
   [IntrNoMem, IntrSpeculatable, Commutative]>;
 }
@@ -592,7 +591,6 @@
   "_ftz_nan_f16x2", "_xorsign_abs_f16x2", "_ftz_xorsign_abs_f16x2",
   "_nan_xorsign_abs_f16x2", "_ftz_nan_xorsign_abs_f16x2"] in {
   def int_nvvm_f # operation # variant :
-ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty],
   [IntrNoMem, IntrSpeculatable, Commutative]>;
 }
@@ -828,9 +826,9 @@
   DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
   def int_nvvm_ex2_approx_d : ClangBuiltin<"__nvvm_ex2_approx_d">,
   DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16 : ClangBuiltin<"__nvvm_ex2_approx_f16">,
+  def int_nvvm_ex2_approx_f16 :
   DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16x2 : ClangBuiltin<"__nvvm_ex2_approx_f16x2">,
+  def int_nvvm_ex2_approx_f16x2 :
   DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>;
 
   def int_nvvm_lg2_approx_ftz_f : ClangBuiltin<"__nvvm_lg2_approx_ftz_f">,
@@ -860,18 +858,16 @@
 
   foreach variant = ["_rn_f16", "_rn_ftz_f16", "_rn_sat_f16",
 "_rn_ftz_sat_f16", "_rn_relu_f16", "_rn_ftz_relu_f16"] in {
-def int_nvvm_fma # variant : ClangBuiltin,
-DefaultAttrsIntrinsic<[llvm_half_ty],
-  [llvm_half_ty, llvm_half_ty, llvm_half_ty],
-  [IntrNoMem, IntrSpeculatable]>;
+def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_half_ty],
+  [llvm_half_ty, llvm_half_ty, llvm_half_ty],
+  [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_f16x2", "_rn_ftz_f16x2", "_rn_sat_f16x2",
 "_rn_ftz_sat_f16x2", "_rn_relu_f16x2", "_rn_ftz_relu_f16x2"] in {
-def int_nvvm_fma # variant : ClangBuiltin,
-  DefaultAttrsIntrinsic<[llvm_v2f16_ty],
-[llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
-[IntrNoMem, IntrSpeculatable]>;
+def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_v2f16_ty],
+  [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
+  [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_bf16", "_rn_relu_bf16"] in {
Index: clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
===
--- clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
+++ clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
@@ -1,21 +1,119 @@
 // REQUIRES: nvptx-registered-target
 //
 // RUN: not %clang_cc1 -fsyntax-only -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_75 -target-feature +ptx70 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ERROR %s
+// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK_ERROR %s
 
 #define __device__ __attribute__((device))
 typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
-__device__ void nvvm_ldg_ldu_native_half_types(const void *p) {
-  __nvvm_ldg_h((const __fp16 *)p);
-  __nvvm_ldg_h2((const __fp16v2 *)p);
+__device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
+  __fp16v2 resv2 = {0, 0};
+  *out += __nvvm_ex2_approx_f16(*(__fp16 *)a);
+  resv2 = __nvvm_ex2_approx_f16x2(*(__fp16v2*)a);
 
-  __nvvm_ldu_h((const __fp16 *)p);
-  __nvvm_ldu_h2((const __fp16v2 *)p);
+  *out += __nvvm_fma_rn_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_ftz_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16 *)c);
+  resv2 += __nvvm_fma_rn_relu_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += __nvvm_fma_rn_ftz_relu_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  *out += __nvvm_fma_rn_ftz_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_sat_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_ftz_sat_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  resv2 += __nvvm_fma_rn_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += __nvvm_fma_rn_ftz_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += 

[PATCH] D146715: [NVPTX] Enforce half type support is present for builtins

2023-03-23 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
jchlanda added a reviewer: tra.
Herald added subscribers: mattd, gchakrabarti, asavonic.
Herald added a project: All.
jchlanda requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jholewinski.
Herald added projects: clang, LLVM.

Follow-up to https://reviews.llvm.org/D145238 that enforces that native half
type support is present when using the builtins, and gives a more descriptive
error if not. Also tidies up `CodeGenFunction::EmitNVPTXBuiltinExpr` by moving
helper lambdas to dedicated functions (most of the helpers already reside in
dedicated functions).
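
For illustration (not part of the patch), the enforcement turns a confusing
intrinsic-return-type error into a clear diagnostic; the builtin and the error
text are the ones used in this patch:

```
#define __device__ __attribute__((device))

// Without native half type support, this now fails with:
//   error: __nvvm_ex2_approx_f16 requires native half type support.
__device__ void demo(__fp16 *a, __fp16 *out) {
  *out = __nvvm_ex2_approx_f16(*a);
}
```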


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D146715

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td

Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -583,7 +583,6 @@
   "_xorsign_abs_f16", "_ftz_xorsign_abs_f16", "_nan_xorsign_abs_f16",
   "_ftz_nan_xorsign_abs_f16"] in {
   def int_nvvm_f # operation # variant :
-ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty],
   [IntrNoMem, IntrSpeculatable, Commutative]>;
 }
@@ -592,7 +591,6 @@
   "_ftz_nan_f16x2", "_xorsign_abs_f16x2", "_ftz_xorsign_abs_f16x2",
   "_nan_xorsign_abs_f16x2", "_ftz_nan_xorsign_abs_f16x2"] in {
   def int_nvvm_f # operation # variant :
-ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty],
   [IntrNoMem, IntrSpeculatable, Commutative]>;
 }
@@ -828,9 +826,9 @@
   DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
   def int_nvvm_ex2_approx_d : ClangBuiltin<"__nvvm_ex2_approx_d">,
   DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16 : ClangBuiltin<"__nvvm_ex2_approx_f16">,
+  def int_nvvm_ex2_approx_f16 :
   DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16x2 : ClangBuiltin<"__nvvm_ex2_approx_f16x2">,
+  def int_nvvm_ex2_approx_f16x2 :
   DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>;
 
   def int_nvvm_lg2_approx_ftz_f : ClangBuiltin<"__nvvm_lg2_approx_ftz_f">,
@@ -860,18 +858,16 @@
 
   foreach variant = ["_rn_f16", "_rn_ftz_f16", "_rn_sat_f16",
 "_rn_ftz_sat_f16", "_rn_relu_f16", "_rn_ftz_relu_f16"] in {
-def int_nvvm_fma # variant : ClangBuiltin,
-DefaultAttrsIntrinsic<[llvm_half_ty],
-  [llvm_half_ty, llvm_half_ty, llvm_half_ty],
-  [IntrNoMem, IntrSpeculatable]>;
+def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_half_ty],
+  [llvm_half_ty, llvm_half_ty, llvm_half_ty],
+  [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_f16x2", "_rn_ftz_f16x2", "_rn_sat_f16x2",
 "_rn_ftz_sat_f16x2", "_rn_relu_f16x2", "_rn_ftz_relu_f16x2"] in {
-def int_nvvm_fma # variant : ClangBuiltin,
-  DefaultAttrsIntrinsic<[llvm_v2f16_ty],
-[llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
-[IntrNoMem, IntrSpeculatable]>;
+def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_v2f16_ty],
+  [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
+  [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_bf16", "_rn_relu_bf16"] in {
Index: clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
===
--- clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
+++ clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
@@ -1,21 +1,119 @@
 // REQUIRES: nvptx-registered-target
 //
 // RUN: not %clang_cc1 -fsyntax-only -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_75 -target-feature +ptx70 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ERROR %s
+// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK_ERROR %s
 
 #define __device__ __attribute__((device))
 typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
-__device__ void nvvm_ldg_ldu_native_half_types(const void *p) {
-  __nvvm_ldg_h((const __fp16 *)p);
-  __nvvm_ldg_h2((const __fp16v2 *)p);
+__device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
+  __fp16v2 resv2 = {0, 0};
+  *out += __nvvm_ex2_approx_f16(*(__fp16 *)a);
+  resv2 = __nvvm_ex2_approx_f16x2(*(__fp16v2*)a);
 
-  __nvvm_ldu_h((const __fp16 *)p);
-  __nvvm_ldu_h2((const __fp16v2 *)p);
+  *out += __nvvm_fma_rn_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_ftz_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16 *)c);
+  resv2 += __nvvm_fma_rn_relu_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += 

[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-15 Thread Jakub Chlanda via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG7258317bade0: [NVPTX] Expose LDU builtins (authored by 
jchlanda).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -3,7 +3,13 @@
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -14,72 +20,114 @@
 declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
-; CHECK: test_ldu_i8
+; CHECK-LABEL: test_ldu_i8
 define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
-; ldu.global.u8
+  ; CHECK: ldu.global.u8
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: test_ldu_i32
+; CHECK-LABEL: test_ldu_i16
+define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u16
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret i16 %val
+}
+
+; CHECK-LABEL: test_ldu_i32
 define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
-; ldu.global.u32
+  ; CHECK: ldu.global.u32
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
-; CHECK: test_ldg_i8
+; CHECK-LABEL: test_ldu_i64
+define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u64
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK-LABEL: test_ldu_f32
+define float @test_ldu_f32(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f32
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK-LABEL: test_ldu_f64
+define double @test_ldu_f64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f64
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK-LABEL: test_ldu_f16
+define half @test_ldu_f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b16
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret half %val
+}
+
+; CHECK-LABEL: test_ldu_v2f16
+define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
+
+; CHECK-LABEL: test_ldg_i8
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
-; ld.global.nc.u8
+  ; CHECK: ld.global.nc.u8
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: test_ldg_i16
+; CHECK-LABEL: test_ldg_i16
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
-; ld.global.nc.u16
-  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
   ret i16 %val
 }
 
-; CHECK: test_ldg_i32
+; CHECK-LABEL: test_ldg_i32
 define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u32
+  ; CHECK: ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
-; CHECK: test_ldg_i64
+; CHECK-LABEL: test_ldg_i64
 define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.u64
   %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
   ret i64 %val
 }
 
-; CHECK: test_ldg_f32
+; CHECK-LABEL: test_ldg_f32
 define float @test_ldg_f32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f32
   %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) 

[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-15 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 505394.
jchlanda added a comment.

Use `CHECK-LABEL`.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -3,7 +3,13 @@
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -14,72 +20,114 @@
 declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
-; CHECK: test_ldu_i8
+; CHECK-LABEL: test_ldu_i8
 define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
-; ldu.global.u8
+  ; CHECK: ldu.global.u8
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: test_ldu_i32
+; CHECK-LABEL: test_ldu_i16
+define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u16
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret i16 %val
+}
+
+; CHECK-LABEL: test_ldu_i32
 define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
-; ldu.global.u32
+  ; CHECK: ldu.global.u32
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
-; CHECK: test_ldg_i8
+; CHECK-LABEL: test_ldu_i64
+define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u64
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK-LABEL: test_ldu_f32
+define float @test_ldu_f32(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f32
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK-LABEL: test_ldu_f64
+define double @test_ldu_f64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f64
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK-LABEL: test_ldu_f16
+define half @test_ldu_f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b16
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret half %val
+}
+
+; CHECK-LABEL: test_ldu_v2f16
+define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
+
+; CHECK-LABEL: test_ldg_i8
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
-; ld.global.nc.u8
+  ; CHECK: ld.global.nc.u8
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: test_ldg_i16
+; CHECK-LABEL: test_ldg_i16
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
-; ld.global.nc.u16
-  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
   ret i16 %val
 }
 
-; CHECK: test_ldg_i32
+; CHECK-LABEL: test_ldg_i32
 define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u32
+  ; CHECK: ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
-; CHECK: test_ldg_i64
+; CHECK-LABEL: test_ldg_i64
 define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.u64
   %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
   ret i64 %val
 }
 
-; CHECK: test_ldg_f32
+; CHECK-LABEL: test_ldg_f32
 define float @test_ldg_f32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f32
   %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
 }
 
-; CHECK: test_ldg_f64
+; CHECK-LABEL: test_ldg_f64
 define double @test_ldg_f64(ptr 

[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-14 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda marked an inline comment as done.
jchlanda added a comment.

@tra is there anything else I should do for this patch? Thank you.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238



[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-09 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda marked 5 inline comments as done.
jchlanda added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:18267-18271
+auto HalfSupport = HasHalfSupport(BuiltinID);
+if (!HalfSupport.first) {
+  CGM.Error(E->getExprLoc(),
+HalfSupport.second.append(" requires native half type support.")
+.c_str());

tra wrote:
> jchlanda wrote:
> > tra wrote:
> > > I think we can simplify it all further.
> > > 
> > > ```
> > > auto HasHalfSupport = [&](unsigned BuiltinID) {
> > > auto  = getContext();
> > > return Context.getLangOpts().NativeHalfType || 
> > > !Context.getTargetInfo().useFP16ConversionIntrinsics();
> > > }
> > > ...
> > > 
> > > if (!HasHalfSupport(BuiltinID)) {
> > >   CGM.Error(E->getExprLoc(),
> > >             getContext().BuiltinInfo.getName(BuiltinID) + " requires native half type support.");
> > > 
> > > ```
> > Done, although we need a string append there, StringRef and Twine would not 
> > work.
> We don't really need `append()`.
> `CGM.Error(E->getExprLoc(), getContext().BuiltinInfo.getName(BuiltinID).str() + " requires native half type support.");` works.
> 
Done.
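
Putting the quoted pieces together, the final check reads roughly as follows
(a consolidation of the snippets above, not new code):

```
auto HasHalfSupport = [&](unsigned BuiltinID) {
  auto &Context = getContext();
  return Context.getLangOpts().NativeHalfType ||
         !Context.getTargetInfo().useFP16ConversionIntrinsics();
};
if (!HasHalfSupport(BuiltinID)) {
  CGM.Error(E->getExprLoc(),
            getContext().BuiltinInfo.getName(BuiltinID).str() +
                " requires native half type support.");
  return nullptr;
}
```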


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238



[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-09 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 503644.
jchlanda added a comment.

`append` -> `+`


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -3,7 +3,13 @@
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -16,70 +22,112 @@
 
 ; CHECK: test_ldu_i8
 define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
-; ldu.global.u8
+  ; CHECK: ldu.global.u8
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
+; CHECK: test_ldu_i16
+define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u16
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret i16 %val
+}
+
 ; CHECK: test_ldu_i32
 define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
-; ldu.global.u32
+  ; CHECK: ldu.global.u32
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
+; CHECK: test_ldu_i64
+define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u64
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: test_ldu_f32
+define float @test_ldu_f32(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f32
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: test_ldu_f64
+define double @test_ldu_f64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f64
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: test_ldu_f16
+define half @test_ldu_f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b16
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret half %val
+}
+
+; CHECK: test_ldu_v2f16
+define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
+
 ; CHECK: test_ldg_i8
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
-; ld.global.nc.u8
+  ; CHECK: ld.global.nc.u8
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: test_ldg_i16
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
-; ld.global.nc.u16
-  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
   ret i16 %val
 }
 
 ; CHECK: test_ldg_i32
 define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u32
+  ; CHECK: ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
 ; CHECK: test_ldg_i64
 define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.u64
   %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
   ret i64 %val
 }
 
 ; CHECK: test_ldg_f32
 define float @test_ldg_f32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f32
   %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
 }
 
 ; CHECK: test_ldg_f64
 define double @test_ldg_f64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f64
   %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
   ret double %val
 }
 
 ; CHECK: test_ldg_f16
 define half @test_ldg_f16(ptr addrspace(1) %ptr) {
-; ld.global.nc.b16
-  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.b16
+  %val = tail call half 

[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-07 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:18267-18271
+auto HalfSupport = HasHalfSupport(BuiltinID);
+if (!HalfSupport.first) {
+  CGM.Error(E->getExprLoc(),
+HalfSupport.second.append(" requires native half type support.")
+.c_str());

tra wrote:
> I think we can simplify it all further.
> 
> ```
> auto HasHalfSupport = [&](unsigned BuiltinID) {
> auto &Context = getContext();
> return Context.getLangOpts().NativeHalfType || 
> !Context.getTargetInfo().useFP16ConversionIntrinsics();
> }
> ...
> 
> if (!HasHalfSupport(BuiltinID)) {
>   CGM.Error(E->getExprLoc(),
>             getContext().BuiltinInfo.getName(BuiltinID) + " requires native half type support.");
> 
> ```
Done, although we need a string append there, StringRef and Twine would not 
work.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238



[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-07 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 503242.
jchlanda added a comment.

Simplify the check for half tys support.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -3,7 +3,13 @@
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -16,70 +22,112 @@
 
 ; CHECK: test_ldu_i8
 define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
-; ldu.global.u8
+  ; CHECK: ldu.global.u8
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
+; CHECK: test_ldu_i16
+define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u16
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret i16 %val
+}
+
 ; CHECK: test_ldu_i32
 define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
-; ldu.global.u32
+  ; CHECK: ldu.global.u32
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
+; CHECK: test_ldu_i64
+define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u64
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: test_ldu_f32
+define float @test_ldu_f32(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f32
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: test_ldu_f64
+define double @test_ldu_f64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f64
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: test_ldu_f16
+define half @test_ldu_f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b16
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret half %val
+}
+
+; CHECK: test_ldu_v2f16
+define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
+
 ; CHECK: test_ldg_i8
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
-; ld.global.nc.u8
+  ; CHECK: ld.global.nc.u8
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: test_ldg_i16
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
-; ld.global.nc.u16
-  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
   ret i16 %val
 }
 
 ; CHECK: test_ldg_i32
 define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u32
+  ; CHECK: ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
 ; CHECK: test_ldg_i64
 define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.u64
   %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
   ret i64 %val
 }
 
 ; CHECK: test_ldg_f32
 define float @test_ldg_f32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f32
   %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
 }
 
 ; CHECK: test_ldg_f64
 define double @test_ldg_f64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f64
   %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
   ret double %val
 }
 
 ; CHECK: test_ldg_f16
 define half @test_ldg_f16(ptr addrspace(1) %ptr) {
-; ld.global.nc.b16
-  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.b16
+  %val = tail call half 

[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-07 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:18116
+case NVPTX::BI__nvvm_ldu_h:
+  BuiltinName = "__nvvm_ldu_h";
+  break;

tra wrote:
> Can we use the standard `StringRef Name = getContext().BuiltinInfo.getName(BuiltinID);` to figure out the builtin name?
Ha, had a feeling it would exist, couldn't find it. Thanks.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:18261-18263
+  std::string ErrMsg{HalfSupport.second};
+  CGM.Error(E->getExprLoc(),
+ErrMsg.append(" requires native half type support.").c_str());

tra wrote:
> Nit: this would be a bit more readable:
> ```
> std::string BuiltinName{HalfSupport.second};
> CGM.Error(E->getExprLoc(), BuiltinName + " requires native half type support.");
> ```
> You may also consider changing returned `BuiltinName` to be `std::string`, so 
> you would not need an explicit temp var. 
Done the `std::string` return, thanks.



Comment at: llvm/test/CodeGen/NVPTX/ldu-ldg.ll:60
+define double @test_ldu_f64(ptr addrspace(1) %ptr) {
+; ldu.global.u64
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) 
%ptr, i32 8)

tra wrote:
> Hmm. I wonder why we end up with `u64` here and not `b64`. Not that it 
> matters in this case, but it is a discrepancy vs. `f16`.
That is copy/paste sloppiness on my part, sorry. I've updated the test to check 
generated PTX, not just the labels, and fixed the values.

It generates the correct kind of load based on the type; the only discrepancy
is that it doesn't distinguish between signed and unsigned loads, always
choosing the unsigned variant. I think that's by design: at
[ISel](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp#L1683)
there is a check whether the load needs to be extended, and a correct `CVT`
instruction will be added.
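
To make that concrete, a hypothetical sketch (it borrows the signed-char
builtin added later in D151876 above; the PTX in the comments is indicative,
not verified output):

```
__device__ int widen(const signed char *p) {
  signed char c = __nvvm_ldg_sc(p); // the load itself is unsigned: ld.global.nc.u8
  return c;                         // sign-extension is a separate cvt, e.g. cvt.s32.s8
}
```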


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238



[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-07 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 502948.
jchlanda marked 3 inline comments as done.
jchlanda added a comment.

Address PR comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145238/new/

https://reviews.llvm.org/D145238

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -3,7 +3,13 @@
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -16,70 +22,112 @@
 
 ; CHECK: test_ldu_i8
 define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
-; ldu.global.u8
+  ; CHECK: ldu.global.u8
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
+; CHECK: test_ldu_i16
+define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u16
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret i16 %val
+}
+
 ; CHECK: test_ldu_i32
 define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
-; ldu.global.u32
+  ; CHECK: ldu.global.u32
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
+; CHECK: test_ldu_i64
+define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.u64
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: test_ldu_f32
+define float @test_ldu_f32(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f32
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: test_ldu_f64
+define double @test_ldu_f64(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.f64
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: test_ldu_f16
+define half @test_ldu_f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b16
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret half %val
+}
+
+; CHECK: test_ldu_v2f16
+define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
+  ; CHECK: ldu.global.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
+
 ; CHECK: test_ldg_i8
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
-; ld.global.nc.u8
+  ; CHECK: ld.global.nc.u8
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: test_ldg_i16
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
-; ld.global.nc.u16
-  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
   ret i16 %val
 }
 
 ; CHECK: test_ldg_i32
 define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u32
+  ; CHECK: ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
 ; CHECK: test_ldg_i64
 define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.u64
   %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
   ret i64 %val
 }
 
 ; CHECK: test_ldg_f32
 define float @test_ldg_f32(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f32
   %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
 }
 
 ; CHECK: test_ldg_f64
 define double @test_ldg_f64(ptr addrspace(1) %ptr) {
-; ld.global.nc.u64
+  ; CHECK: ld.global.nc.f64
   %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
   ret double %val
 }
 
 ; CHECK: test_ldg_f16
 define half @test_ldg_f16(ptr addrspace(1) %ptr) {
-; ld.global.nc.b16
-  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ; CHECK: ld.global.nc.b16
+  

[PATCH] D145238: [NVPTX] Expose LDU builtins

2023-03-03 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
jchlanda added a reviewer: tra.
Herald added subscribers: mattd, gchakrabarti, asavonic.
Herald added a project: All.
jchlanda requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jholewinski.
Herald added projects: clang, LLVM.

Also check whether native half types are supported, to give a more descriptive
error message; without this check, clang only reports an incorrect intrinsic
return type.
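
A sketch of the intended use in CUDA device code (`__nvvm_ldu_i` is taken from
the test diff below; `__nvvm_ldu_h` is an assumption, mirroring the existing
`__nvvm_ldg_h`):

  // Illustrative only: loads through the read-only, warp-uniform ldu path.
  __device__ float ldu_example(const void *p) {
    int i = __nvvm_ldu_i((const int *)p); // plain int variant
    // Half variant: requires native half-type support; without it, the new
    // check reports a descriptive error instead of a bad return-type error.
    __fp16 h = __nvvm_ldu_h((const __fp16 *)p);
    return (float)h + (float)i;
  }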


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D145238

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -3,7 +3,13 @@
 
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -21,6 +27,13 @@
   ret i8 %val
 }
 
+; CHECK: test_ldu_i16
+define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
+; ldu.global.u16
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret i16 %val
+}
+
 ; CHECK: test_ldu_i32
 define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
 ; ldu.global.u32
@@ -28,6 +41,41 @@
   ret i32 %val
 }
 
+; CHECK: test_ldu_i64
+define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
+; ldu.global.u64
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: test_ldu_f32
+define float @test_ldu_f32(ptr addrspace(1) %ptr) {
+; ldu.global.u64
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: test_ldu_f64
+define double @test_ldu_f64(ptr addrspace(1) %ptr) {
+; ldu.global.u64
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: test_ldu_f16
+define half @test_ldu_f16(ptr addrspace(1) %ptr) {
+; ldu.global.b16
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret half %val
+}
+
+; CHECK: test_ldu_v2f16
+define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
+; ldu.global.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
+
 ; CHECK: test_ldg_i8
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 ; ld.global.nc.u8
Index: clang/test/CodeGen/builtins-nvptx.c
===
--- clang/test/CodeGen/builtins-nvptx.c
+++ clang/test/CodeGen/builtins-nvptx.c
@@ -652,6 +652,97 @@
   __nvvm_ldg_d2((const double2 *)p);
 }
 
+// CHECK-LABEL: nvvm_ldu
+__device__ void nvvm_ldu(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
+  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
+  __nvvm_ldu_c((const char *)p);
+  __nvvm_ldu_uc((const unsigned char *)p);
+
+  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
+  __nvvm_ldu_s((const short *)p);
+  __nvvm_ldu_us((const unsigned short *)p);
+
+  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  __nvvm_ldu_i((const int *)p);
+  __nvvm_ldu_ui((const unsigned int *)p);
+
+  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
+  __nvvm_ldu_l((const long *)p);
+  __nvvm_ldu_ul((const unsigned long *)p);
+
+  // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
+  __nvvm_ldu_f((const float *)p);
+  // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
+  __nvvm_ldu_d((const double *)p);
+
+  // CHECK: call <2 x i8> 

[PATCH] D144961: [NVPTX] Add f16 and v2f16 ldg builtins

2023-03-03 Thread Jakub Chlanda via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG71b06585857a: [NVPTX] Add f16 and v2f16 ldg builtins 
(authored by jchlanda).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144961/new/

https://reviews.llvm.org/D144961

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -4,34 +4,82 @@
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
-
-; CHECK: func0
-define i8 @func0(ptr addrspace(1) %ptr) {
+; CHECK: test_ldu_i8
+define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
 ; ldu.global.u8
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: func1
-define i32 @func1(ptr addrspace(1) %ptr) {
+; CHECK: test_ldu_i32
+define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
 ; ldu.global.u32
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
-; CHECK: func2
-define i8 @func2(ptr addrspace(1) %ptr) {
+; CHECK: test_ldg_i8
+define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 ; ld.global.nc.u8
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: func3
-define i32 @func3(ptr addrspace(1) %ptr) {
+; CHECK: test_ldg_i16
+define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
+; ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret i16 %val
+}
+
+; CHECK: test_ldg_i32
+define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
 ; ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
+
+; CHECK: test_ldg_i64
+define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: test_ldg_f32
+define float @test_ldg_f32(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: test_ldg_f64
+define double @test_ldg_f64(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: test_ldg_f16
+define half @test_ldg_f16(ptr addrspace(1) %ptr) {
+; ld.global.nc.b16
+  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret half %val
+}
+
+; CHECK: test_ldg_v2f16
+define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) {
+; ld.global.nc.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
Index: clang/test/CodeGen/builtins-nvptx-native-half-type.c
===
--- clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -172,3 +172,12 @@
 #endif
   // CHECK: ret void
 }
+
+// CHECK-LABEL: nvvm_ldg_native_half_types
+__device__ void nvvm_ldg_native_half_types(const void *p) {
+  // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
+  __nvvm_ldg_h((const __fp16 *)p);
+  typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
+  // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
+  __nvvm_ldg_h2((const __fp16v2 *)p);
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -18228,7 +18228,9 @@
 // elements, its alignment is set to number of elements times the alignment
 // of its member: n*alignof(t)."
 return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+  case NVPTX::BI__nvvm_ldg_h:
   case NVPTX::BI__nvvm_ldg_f:
+  case NVPTX::BI__nvvm_ldg_h2:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:

[PATCH] D144961: [NVPTX] Add f16 and v2f16 ldg builtins

2023-03-01 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: llvm/test/CodeGen/NVPTX/ldu-ldg.ll:5-6
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+

tra wrote:
> Do we want to add missing `ldu` variants, too?
Looks like none of the `ldu` builtins have been exposed yet. I'll do it in a
follow-up patch.



Comment at: llvm/test/CodeGen/NVPTX/ldu-ldg.ll:39
 ; CHECK: func3
-define i32 @func3(ptr addrspace(1) %ptr) {
+define i16 @func3(ptr addrspace(1) %ptr) {
+; ld.global.nc.u16

tra wrote:
> Nit: I'd give test functions more descriptive names. E.g. `test_ldg_u16`.
Sure, done now.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144961/new/

https://reviews.llvm.org/D144961



[PATCH] D144961: [NVPTX] Add f16 and v2f16 ldg builtins

2023-03-01 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 501392.
jchlanda added a comment.

Add more verbose test names.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144961/new/

https://reviews.llvm.org/D144961

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -4,34 +4,82 @@
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
-
-; CHECK: func0
-define i8 @func0(ptr addrspace(1) %ptr) {
+; CHECK: test_ldu_i8
+define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
 ; ldu.global.u8
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: func1
-define i32 @func1(ptr addrspace(1) %ptr) {
+; CHECK: test_ldu_i32
+define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
 ; ldu.global.u32
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
 
-; CHECK: func2
-define i8 @func2(ptr addrspace(1) %ptr) {
+; CHECK: test_ldg_i8
+define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 ; ld.global.nc.u8
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
   ret i8 %val
 }
 
-; CHECK: func3
-define i32 @func3(ptr addrspace(1) %ptr) {
+; CHECK: test_ldg_i16
+define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
+; ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret i16 %val
+}
+
+; CHECK: test_ldg_i32
+define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
 ; ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
+
+; CHECK: test_ldg_i64
+define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: test_ldg_f32
+define float @test_ldg_f32(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: test_ldg_f64
+define double @test_ldg_f64(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: test_ldg_f16
+define half @test_ldg_f16(ptr addrspace(1) %ptr) {
+; ld.global.nc.b16
+  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret half %val
+}
+
+; CHECK: test_ldg_v2f16
+define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) {
+; ld.global.nc.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
Index: clang/test/CodeGen/builtins-nvptx-native-half-type.c
===
--- clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -172,3 +172,12 @@
 #endif
   // CHECK: ret void
 }
+
+// CHECK-LABEL: nvvm_ldg_native_half_types
+__device__ void nvvm_ldg_native_half_types(const void *p) {
+  // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
+  __nvvm_ldg_h((const __fp16 *)p);
+  typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
+  // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
+  __nvvm_ldg_h2((const __fp16v2 *)p);
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -18228,7 +18228,9 @@
 // elements, its alignment is set to number of elements times the alignment
 // of its member: n*alignof(t)."
 return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+  case NVPTX::BI__nvvm_ldg_h:
   case NVPTX::BI__nvvm_ldg_f:
+  case NVPTX::BI__nvvm_ldg_h2:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:
Index: clang/include/clang/Basic/BuiltinsNVPTX.def

[PATCH] D144961: [NVPTX] Add f16 and v2f16 ldg builtins

2023-02-28 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
jchlanda added a reviewer: tra.
Herald added subscribers: mattd, gchakrabarti, asavonic.
Herald added a project: All.
jchlanda requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jholewinski.
Herald added projects: clang, LLVM.

Adds `f16` and `v2f16` `ldg` builtins and relevant tests.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D144961

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Index: llvm/test/CodeGen/NVPTX/ldu-ldg.ll
===
--- llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -4,9 +4,15 @@
 
 declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+
 declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
 declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
-
+declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
 
 ; CHECK: func0
 define i8 @func0(ptr addrspace(1) %ptr) {
@@ -30,8 +36,50 @@
 }
 
 ; CHECK: func3
-define i32 @func3(ptr addrspace(1) %ptr) {
+define i16 @func3(ptr addrspace(1) %ptr) {
+; ld.global.nc.u16
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret i16 %val
+}
+
+; CHECK: func4
+define i32 @func4(ptr addrspace(1) %ptr) {
 ; ld.global.nc.u32
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
   ret i32 %val
 }
+
+; CHECK: func5
+define i64 @func5(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret i64 %val
+}
+
+; CHECK: func6
+define float @func6(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret float %val
+}
+
+; CHECK: func7
+define double @func7(ptr addrspace(1) %ptr) {
+; ld.global.nc.u64
+  %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret double %val
+}
+
+; CHECK: func8
+define half @func8(ptr addrspace(1) %ptr) {
+; ld.global.nc.b16
+  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret half %val
+}
+
+; CHECK: func9
+define <2 x half> @func9(ptr addrspace(1) %ptr) {
+; ld.global.nc.b32
+  %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret <2 x half> %val
+}
Index: clang/test/CodeGen/builtins-nvptx-native-half-type.c
===
--- clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -172,3 +172,12 @@
 #endif
   // CHECK: ret void
 }
+
+// CHECK-LABEL: nvvm_ldg_native_half_types
+__device__ void nvvm_ldg_native_half_types(const void *p) {
+  // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
+  __nvvm_ldg_h((const __fp16 *)p);
+  typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
+  // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
+  __nvvm_ldg_h2((const __fp16v2 *)p);
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -18228,7 +18228,9 @@
 // elements, its alignment is set to number of elements times the alignment
 // of its member: n*alignof(t)."
 return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+  case NVPTX::BI__nvvm_ldg_h:
   case NVPTX::BI__nvvm_ldg_f:
+  case NVPTX::BI__nvvm_ldg_h2:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:
Index: clang/include/clang/Basic/BuiltinsNVPTX.def
===
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -795,6 +795,7 @@
 BUILTIN(__nvvm_ldg_ul, "ULiULiC*", "")
 BUILTIN(__nvvm_ldg_ull, "ULLiULLiC*", "")
 
+BUILTIN(__nvvm_ldg_h, "hhC*", "")
 BUILTIN(__nvvm_ldg_f, "ffC*", "")
 BUILTIN(__nvvm_ldg_d, "ddC*", "")
 
@@ -814,6 +815,7 @@
 BUILTIN(__nvvm_ldg_ui4, "E4UiE4UiC*", "")
 BUILTIN(__nvvm_ldg_ull2, "E2ULLiE2ULLiC*", "")
 
+BUILTIN(__nvvm_ldg_h2, "E2hE2hC*", "")
 BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "")
 BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "")
 BUILTIN(__nvvm_ldg_d2, 

[PATCH] D136311: [CUDA,NVPTX] Implement __bf16 support for NVPTX.

2022-10-25 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda accepted this revision.
jchlanda added a comment.
This revision is now accepted and ready to land.

Looks good.




Comment at: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp:838
   }
 }
+static int getLdStRegType(EVT VT) {

New line here.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp:844
+case MVT::bf16:
+case MVT::v2f16:
+case MVT::v2bf16:

Nice, this fixes the reg type for `v2f16` from `Float` to `Untyped`; it looks
like no test picked up on that before.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136311/new/

https://reviews.llvm.org/D136311



[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-05-17 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: clang/include/clang/AST/Type.h:486
+   bool IsSYCLOrOpenCL = false) {
+if (ASMap) {
+  bool IsATargetAS = false;

jchlanda wrote:
> tra wrote:
> > If A and B are both target AS, we fall through to the code which is dealing 
> > with language AS, which would not do us any good. If that's not expected to 
> > happen, we should have an assert to ensure it.
> > 
> > Next, I'm not particularly fond of `IsSYCLOrOpenCL`. Do we need it at all. 
> > If we do know that AS maps to OpenCL `Constant` or `Generic`, I would 
> > assume that those AS would follow the same semantics. Besides, will we ever 
> > see OpenCL language AS in non-OpenCL code?
> > 
> > Next, the function *is* OpenCL specific and would not work for CUDA or HIP. 
> > I think it needs to be generalized to provide language-specific AS mapping 
> > rules.
> I would only like to handle the mixed AS case, it feels like trying to walk 
> back from both HW AS and potentially do the logic of global and constant 
> would be against the intention of users. Asserting on only one HW AS could 
> backfire, as I think it should be allowed to assign between different HW AS.
> 
> The reason I added `IsSYCLOrOpenCL` is because this code is also exercised by 
> `checkPointerTypesForAssignment` which is not OpenCL specific, so I had to 
> have a way of conditionally enabling the conversion to generic AS.
> 
> I agree, it is too restrictive now, especially since the AS map provides
> values for SYCL, OpenCL and CUDA, so perhaps I should extend `IsSYCLOrOpenCL`
> to be an enum specifying which language the function deals with and act
> accordingly?
> 
> Next, I'm not particularly fond of `IsSYCLOrOpenCL`. Do we need it at all. If 
> we do know that AS maps to OpenCL `Constant` or `Generic`, I would assume 
> that those AS would follow the same semantics. Besides, will we ever see 
> OpenCL language AS in non-OpenCL code?
> 
> Next, the function *is* OpenCL specific and would not work for CUDA or HIP. I 
> think it needs to be generalized to provide language-specific AS mapping 
> rules.

I've changed that bool flag to be an enum specifying OpenCL/SYCL/None. The
rationale here is that the handling of AS values differs slightly (SYCL
introduces `global device` and `global host`). It would appear that CUDA
follows a completely different code path, and by the time
`isAddressSpaceSuperset` is called all of the language AS values are stripped
and set to `0`, which is why (along with the fact that we don't have a valid
use case for CUDA) I left it out and only return true for an exact match.
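
For reference, a minimal sketch of the shape this takes (the enum name and
enumerators here are assumptions, not the exact patch code):

  // Which language's conversion rules to apply when comparing address spaces.
  enum class ASRulesLang { None, OpenCL, SYCL };

  static bool isAddressSpaceSupersetOf(LangAS A, LangAS B,
                                       const LangASMap *ASMap,
                                       ASRulesLang Lang); // was: bool IsSYCLOrOpenCL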


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124382/new/

https://reviews.llvm.org/D124382



[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-05-17 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 429956.
jchlanda edited the summary of this revision.
jchlanda added a reviewer: Anastasia.
jchlanda added a comment.
Herald added a subscriber: kosarev.

Use helper functions when handling address space values.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124382/new/

https://reviews.llvm.org/D124382

Files:
  clang/include/clang/AST/Type.h
  clang/lib/Sema/SemaCast.cpp
  clang/lib/Sema/SemaExpr.cpp
  clang/test/Sema/address_space_type_casts_amdgpu.cl
  clang/test/Sema/address_space_type_casts_default.cl
  clang/test/SemaOpenCL/atomic-ops.cl
  clang/test/SemaOpenCL/numbered-address-space.cl
  clang/test/SemaOpenCL/predefined-expr.cl
  clang/test/SemaOpenCL/vector-conv.cl

Index: clang/test/SemaOpenCL/vector-conv.cl
===
--- clang/test/SemaOpenCL/vector-conv.cl
+++ clang/test/SemaOpenCL/vector-conv.cl
@@ -16,7 +16,8 @@
   e = (constant int4)i;
   e = (private int4)i;
 
-  private int4 *private_ptr = (const private int4 *)const_global_ptr; // expected-error{{casting 'const __global int4 *' to type 'const __private int4 *' changes address space of pointer}}
-  global int4 *global_ptr = const_global_ptr; // expected-warning {{initializing '__global int4 *__private' with an expression of type 'const __global int4 *__private' discards qualifiers}}
+private
+  int4 *private_ptr = (const private int4 *)const_global_ptr; // expected-error{{casting 'const __global int4 *' to type 'const __private int4 *' changes address space of pointer}}
+  global int4 *global_ptr = const_global_ptr;
   global_ptr = (global int4 *)const_global_ptr;
 }
Index: clang/test/SemaOpenCL/predefined-expr.cl
===
--- clang/test/SemaOpenCL/predefined-expr.cl
+++ clang/test/SemaOpenCL/predefined-expr.cl
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 %s -verify -cl-std=CL2.0
 
 void f() {
-  char *f1 = __func__;  //expected-error-re{{initializing '{{__generic|__private}} char *__private' with an expression of type 'const __constant char *' changes address space of pointer}}
-  constant char *f2 = __func__; //expected-warning{{initializing '__constant char *__private' with an expression of type 'const __constant char[2]' discards qualifiers}}
+  char *f1 = __func__; // expected-error-re{{initializing '{{__generic|__private}} char *__private' with an expression of type 'const __constant char *' changes address space of pointer}}
+  constant char *f2 = __func__;
   constant const char *f3 = __func__;
 }
Index: clang/test/SemaOpenCL/numbered-address-space.cl
===
--- clang/test/SemaOpenCL/numbered-address-space.cl
+++ clang/test/SemaOpenCL/numbered-address-space.cl
@@ -2,11 +2,16 @@
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -verify -pedantic -fsyntax-only %s
 
 void test_numeric_as_to_generic_implicit_cast(__attribute__((address_space(3))) int *as3_ptr, float src) {
-  generic int* generic_ptr = as3_ptr; // FIXME: This should error
+  generic int *generic_ptr = as3_ptr;
+}
+
+// AS 4 is constant on AMDGPU, casting it to generic is illegal.
+void test_numeric_as_const_to_generic_implicit_cast(__attribute__((address_space(4))) int *as4_ptr, float src) {
+  generic int *generic_ptr = as4_ptr; // expected-error{{initializing '__generic int *__private' with an expression of type '__attribute__((address_space(4))) int *__private' changes address space of pointer}}
 }
 
 void test_numeric_as_to_generic_explicit_cast(__attribute__((address_space(3))) int *as3_ptr, float src) {
-  generic int* generic_ptr = (generic int*) as3_ptr; // Should maybe be valid?
+  generic int *generic_ptr = (generic int *)as3_ptr;
 }
 
 void test_generic_to_numeric_as_implicit_cast(void) {
@@ -20,12 +25,12 @@
 }
 
 void test_generic_as_to_builtin_parameter_explicit_cast_numeric(__attribute__((address_space(3))) int *as3_ptr, float src) {
-  generic int* generic_ptr = as3_ptr; // FIXME: This should error
-  volatile float result = __builtin_amdgcn_ds_fmaxf((__attribute__((address_space(3))) float*) generic_ptr, src, 0, 0, false); // expected-error {{passing '__attribute__((address_space(3))) float *' to parameter of type '__local float *' changes address space of pointer}}
+  generic int *generic_ptr = as3_ptr;
+  // This is legal, as address_space(3) corresponds to local on amdgpu.
+  volatile float result = __builtin_amdgcn_ds_fmaxf((__attribute__((address_space(3))) float *)generic_ptr, src, 0, 0, false);
 }
 
 void test_generic_as_to_builtin_parameterimplicit_cast_numeric(__attribute__((address_space(3))) int *as3_ptr, float src) {
-  generic int* generic_ptr = as3_ptr;
+  generic int *generic_ptr = as3_ptr;
   volatile float result = __builtin_amdgcn_ds_fmaxf(generic_ptr, src, 0, 0, false); // expected-error {{passing '__generic int *__private' to parameter of type 

[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-05-06 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: clang/include/clang/AST/Type.h:486
+   bool IsSYCLOrOpenCL = false) {
+if (ASMap) {
+  bool IsATargetAS = false;

tra wrote:
> If A and B are both target AS, we fall through to the code which is dealing 
> with language AS, which would not do us any good. If that's not expected to 
> happen, we should have an assert to ensure it.
> 
> Next, I'm not particularly fond of `IsSYCLOrOpenCL`. Do we need it at all. If 
> we do know that AS maps to OpenCL `Constant` or `Generic`, I would assume 
> that those AS would follow the same semantics. Besides, will we ever see 
> OpenCL language AS in non-OpenCL code?
> 
> Next, the function *is* OpenCL specific and would not work for CUDA or HIP. I 
> think it needs to be generalized to provide language-specific AS mapping 
> rules.
I would only like to handle the mixed AS case, it feels like trying to walk 
back from both HW AS and potentially do the logic of global and constant would 
be against the intention of users. Asserting on only one HW AS could backfire, 
as I think it should be allowed to assign between different HW AS.

The reason I added `IsSYCLOrOpenCL` is because this code is also exercised by 
`checkPointerTypesForAssignment` which is not OpenCL specific, so I had to have 
a way of conditionally enabling the conversion to generic AS.

I agree, it is too restrictive now, especially since the AS map provides values
for SYCL, OpenCL and CUDA, so perhaps I should extend `IsSYCLOrOpenCL` to be an
enum specifying which language the function deals with and act accordingly?




Comment at: clang/include/clang/AST/Type.h:489
+  bool IsBTargetAS = false;
+  if (A > LangAS::FirstTargetAddressSpace)
+    IsATargetAS = true;

tra wrote:
> Is the check intended to tell if A is a target AS? If so, we do have 
> `isTargetAddressSpace()` for that (and it uses '>= 
> LangAS::FirstTargetAddressSpace', which suggests that `>` may be incorrect, 
> too).
Yeap, will update to `isTargetAddressSpace`.




Comment at: clang/include/clang/AST/Type.h:498-499
+    LangAS Constant = static_cast<LangAS>(
+        (*ASMap)[static_cast<unsigned>(LangAS::opencl_constant)] +
+        static_cast<unsigned>(LangAS::FirstTargetAddressSpace));
+if (IsATargetAS)

tra wrote:
> `getLangASFromTargetAS((*ASMap)[static_cast<unsigned>(LangAS::opencl_constant)])`
>  
OK.



Comment at: clang/include/clang/AST/Type.h:500
+        static_cast<unsigned>(LangAS::FirstTargetAddressSpace));
+    if (IsATargetAS)
+      B = static_cast<LangAS>(

tra wrote:
> `if (!IsBTargetAS)` would be more directly related to what we're doing here.
You are right, will update.



Comment at: clang/include/clang/AST/Type.h:508
+  static_cast(LangAS::FirstTargetAddressSpace));
+// When dealing with target AS return true if:
+// * A is equal to B, or

tra wrote:
> Is the code above intended to ensure that both A and B are target AS at this 
> point?
> If so, then it could be simplified to something like this:
> ```
> if (ASMap) {
>   if (!isTargetAddressSpace(A))
>     A = getLangASFromTargetAS((*ASMap)[static_cast<unsigned>(A)]);
>   if (!isTargetAddressSpace(B))
>     B = getLangASFromTargetAS((*ASMap)[static_cast<unsigned>(B)]);
>
>   Generic = getLangASFromTargetAS((*ASMap)[static_cast<unsigned>(LangAS::opencl_generic)]);
>   Constant = getLangASFromTargetAS((*ASMap)[static_cast<unsigned>(LangAS::opencl_constant)]);
>
>   // ... proceed inferring whether A is superset of B in target AS.
>   return;
> }
> assert(isTargetAddressSpace(A) && isTargetAddressSpace(B));
> ```
Yes, at the end of the AS map accesses all address spaces have to be expressed
in terms of HW values, but I want that to happen only in the case of mixed AS
(language and HW). I will add an assert and use the helpers, like you suggested
in the snippet, but would like to keep the `^` condition.



Comment at: clang/lib/Sema/SemaExpr.cpp:9204
+  const LangASMap  = S.Context.getTargetInfo().getAddressSpaceMap();
+  if (!lhq.compatiblyIncludes(rhq, )) {
+const bool AddressSpaceSuperset = Qualifiers::isAddressSpaceSupersetOf(

tra wrote:
> Should you pass `IsSYCLOrOpenCL` to it too? The way 
> `isAddressSpaceSupersetOf` is implemented now it may give you a different 
> result without it. 
> 
> Also, it may make sense to plumb ASMap into `lhq.isAddressSpaceSupersetOf`, 
> too, and just use the old code + couple of new arguments.
Yes, will add.

I thought it would introduce a much bigger diff, and there was already a handy
`static` version of it; I don't mind modifying the member if you'd prefer.



Comment at: clang/lib/Sema/SemaExpr.cpp:9219
 // and from void*.
-else if (lhq.withoutObjCGCAttr().withoutObjCLifetime()
-.compatiblyIncludes(
-   

[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-05-06 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

In D124382#3480600, @Anastasia wrote:

> 



> And I think we could add this feature in a very light way for example by 
> reserving the numbers from the clang `LangAS` enum to be used with the 
> language address spaces in the prototypes of builtins.

I'm not sure I understand how that would look; could you please elaborate?

> Just to understand further - do you need the builtins to have a specific 
> address space prototype? And do you need this to improve error handling in 
> libclc code base?

With the suggested wrappers we could declare all the pointers to be in the
generic AS, and that would get us around the target vs language AS problem. I
don't think that would improve the situation: from the LLVM perspective all
those builtins would be incorrect, and there would be no way for users to tell
that there is a specific AS requirement on them, nor would the compiler be
able to warn or error. The only thing making it work would be those wrappers,
embedded deeply in the source of libclc, which at the moment is not even
shipped with upstream LLVM.
The builtins in question are not exclusively OpenCL/SYCL related; say,
`TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "",
AND(SM_80,PTX70))` would need to take both pointers in the generic address
space here. It feels like explicitly providing the AS in the prototype is
needed.
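
For illustration, this is roughly what a well-typed call looks like in CUDA
device code; the wrapper function is made up, while the builtin and its
"vv*3vC*1" prototype are quoted above:

  // dst must point to AS(3) (shared) and src to AS(1) (global),
  // exactly as the prototype string encodes.
  __device__ void copy16(__attribute__((address_space(3))) void *dst,
                         const __attribute__((address_space(1))) void *src) {
    __nvvm_cp_async_ca_shared_global_16(dst, src);
  }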

> this for example won’t work for builtins that are shared between targets.

While I agree in principle, I'm not sure there are any target-agnostic yet
AS-specific builtins; that sounds like a dangerous thing to introduce. And in
any case, it's the target that provides the AS map.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124382/new/

https://reviews.llvm.org/D124382



[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-05-06 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

@Anastasia @tra apologies for a late reply, I'm catching up with the thread 
after holidays.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124382/new/

https://reviews.llvm.org/D124382



[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-04-28 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

In D124382#3472888, @Anastasia wrote:

> 



> Can you provide an example of where it could be useful? Note that I feel that
> such functionality could be implemented on top of full implementation of
> target specific address space proposed in https://reviews.llvm.org/D62574.

The use case we had was calling a target builtin (one that specifies an
address space) from within OpenCL C. Currently this errors out; similarly, an
explicit cast to the address space yields an error
(`(__attribute__((address_space(3))) int *)&l_woof` in the example below). This
is important for libclc, which is implemented in OpenCL C and deals directly
with target builtins.

  __kernel void woof() {
    __local int l_woof;
    __nvvm_cp_async_mbarrier_arrive_shared(&l_woof);
  }

I wasn't aware of that patch, sorry, I've not had a close look yet, but it
seems worryingly dated.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124382/new/

https://reviews.llvm.org/D124382



[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-04-25 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

@tra @Naghasan @t4c1 you might find it interesting, a follow-up from the
discussion here: https://reviews.llvm.org/D112718


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124382/new/

https://reviews.llvm.org/D124382



[PATCH] D124382: [Clang] Recognize target address space in superset calculation

2022-04-25 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
Herald added subscribers: kerbowa, Anastasia, jvesely.
Herald added a project: All.
jchlanda requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Use the target's address space map to handle cases when both language and
target address spaces are provided. In such cases, attempt language-to-target
translation and only then perform the calculation.
The main motivation is to be able to use language address spaces as
inputs for builtins, which are defined in terms of target address spaces
(as discussed here: https://reviews.llvm.org/D112718), and hence to enable the
definition of builtins with generic address space pointers that would
accept pointers in any other address space (bar constant).

This patch attempts to find a happy medium between not recognising target
address spaces at all (the current state) and allowing all uses of them, based
on the assumption that users must know better. What it does not do is provide
a bidirectional translation mechanism, which I'm not sure could ever be done
with the current address space implementation (the use of 0, the value of
default, etc.).

Based on OpenCL rules, this patch follows the conversion guidelines for
`generic` and `constant` address space pointers as described here:
https://www.khronos.org/registry/OpenCL/specs/2.2/html/OpenCL_API.html#_memory_model
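
A minimal sketch of the intended check, reusing the existing clang helpers
(`isTargetAddressSpace`, `getLangASFromTargetAS`, `LangASMap`); the function
name and exact control flow are illustrative, not the patch itself:

  static bool isSupersetWithMap(LangAS A, LangAS B, const LangASMap &Map) {
    // Translate only when language and target address spaces are mixed.
    if (isTargetAddressSpace(A) != isTargetAddressSpace(B)) {
      if (!isTargetAddressSpace(A))
        A = getLangASFromTargetAS(Map[static_cast<unsigned>(A)]);
      if (!isTargetAddressSpace(B))
        B = getLangASFromTargetAS(Map[static_cast<unsigned>(B)]);
      LangAS Generic = getLangASFromTargetAS(
          Map[static_cast<unsigned>(LangAS::opencl_generic)]);
      LangAS Constant = getLangASFromTargetAS(
          Map[static_cast<unsigned>(LangAS::opencl_constant)]);
      // OpenCL rules: generic is a superset of every named AS but constant.
      return A == B || (A == Generic && B != Constant);
    }
    // Both sides in the same kind of AS: keep the exact-match behaviour.
    return A == B;
  }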


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D124382

Files:
  clang/include/clang/AST/Type.h
  clang/lib/Sema/SemaCast.cpp
  clang/lib/Sema/SemaExpr.cpp
  clang/test/Sema/address_space_type_casts_amdgpu.cl
  clang/test/Sema/address_space_type_casts_default.cl
  clang/test/SemaOpenCL/atomic-ops.cl
  clang/test/SemaOpenCL/numbered-address-space.cl
  clang/test/SemaOpenCL/predefined-expr.cl
  clang/test/SemaOpenCL/vector-conv.cl

Index: clang/test/SemaOpenCL/vector-conv.cl
===
--- clang/test/SemaOpenCL/vector-conv.cl
+++ clang/test/SemaOpenCL/vector-conv.cl
@@ -16,7 +16,8 @@
   e = (constant int4)i;
   e = (private int4)i;
 
-  private int4 *private_ptr = (const private int4 *)const_global_ptr; // expected-error{{casting 'const __global int4 *' to type 'const __private int4 *' changes address space of pointer}}
-  global int4 *global_ptr = const_global_ptr; // expected-warning {{initializing '__global int4 *__private' with an expression of type 'const __global int4 *__private' discards qualifiers}}
+private
+  int4 *private_ptr = (const private int4 *)const_global_ptr; // expected-error{{casting 'const __global int4 *' to type 'const __private int4 *' changes address space of pointer}}
+  global int4 *global_ptr = const_global_ptr;
   global_ptr = (global int4 *)const_global_ptr;
 }
Index: clang/test/SemaOpenCL/predefined-expr.cl
===
--- clang/test/SemaOpenCL/predefined-expr.cl
+++ clang/test/SemaOpenCL/predefined-expr.cl
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 %s -verify -cl-std=CL2.0
 
 void f() {
-  char *f1 = __func__;  //expected-error-re{{initializing '{{__generic|__private}} char *__private' with an expression of type 'const __constant char *' changes address space of pointer}}
-  constant char *f2 = __func__; //expected-warning{{initializing '__constant char *__private' with an expression of type 'const __constant char[2]' discards qualifiers}}
+  char *f1 = __func__; // expected-error-re{{initializing '{{__generic|__private}} char *__private' with an expression of type 'const __constant char *' changes address space of pointer}}
+  constant char *f2 = __func__;
   constant const char *f3 = __func__;
 }
Index: clang/test/SemaOpenCL/numbered-address-space.cl
===
--- clang/test/SemaOpenCL/numbered-address-space.cl
+++ clang/test/SemaOpenCL/numbered-address-space.cl
@@ -2,11 +2,16 @@
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -verify -pedantic -fsyntax-only %s
 
 void test_numeric_as_to_generic_implicit_cast(__attribute__((address_space(3))) int *as3_ptr, float src) {
-  generic int* generic_ptr = as3_ptr; // FIXME: This should error
+  generic int *generic_ptr = as3_ptr;
+}
+
+// AS 4 is constant on AMDGPU, casting it to generic is illegal.
+void test_numeric_as_const_to_generic_implicit_cast(__attribute__((address_space(4))) int *as4_ptr, float src) {
+  generic int *generic_ptr = as4_ptr; // expected-error{{initializing '__generic int *__private' with an expression of type '__attribute__((address_space(4))) int *__private' changes address space of pointer}}
 }
 
 void test_numeric_as_to_generic_explicit_cast(__attribute__((address_space(3))) int *as3_ptr, float src) {
-  generic int* generic_ptr = (generic int*) as3_ptr; // Should maybe be valid?
+  generic int *generic_ptr = (generic int *)as3_ptr;
 }
 
 void test_generic_to_numeric_as_implicit_cast(void) {
@@ -20,12 +25,12 @@
 }
 
 void 

[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-03-02 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.
Herald added a project: All.

In D117887#3352079, @tra wrote:

> In D117887#3351257, @jchlanda wrote:
>
>> @tra thank you for landing the patches; it seems that the clang part
>> (builtin declarations and tests) has been dropped, and only `llvm` dir
>> changes made it through. Is there any way I could fix it (the same goes for
>> the other two patches in this stack)?
>
> Somehow `arc export | git apply` didn't pick up the clang changes when I was
> transferring the patch from Phabricator. I'll re-fetch the patches and
> will land the missing pieces shortly.
>
> I'm not sure how you've submitted the patch to Phabricator. In general,
> it works best when the patch is supplied as a git commit diff, with the
> author metadata, etc.
> Or via `arc diff`. See
> https://llvm.org/docs/Phabricator.html#phabricator-reviews for the details.

I went with the web interface, as described here:
https://llvm.org/docs/Phabricator.html#requesting-a-review-via-the-web-interface
with `git diff -U99 ...`; I didn't want to bite the bullet of `arc`, hoping
that GitHub PRs will soon be a thing.

All working now, thank you for resolving that so quickly.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887



[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-03-01 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda marked an inline comment as done.
jchlanda added a comment.

@tra thank you for landing the patches; it seems that the clang part (builtin
declarations and tests) has been dropped, and only `llvm` dir changes made it
through. Is there any way I could fix it (the same goes for the other two
patches in this stack)?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887



[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-17 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

@tra I've fixed the test failure (`math-intrins.ll`); the rest seems to be
unrelated timeouts. Would you be able to merge those patches in, as I don't
have commit access, please? The same goes for
https://reviews.llvm.org/D117887 and https://reviews.llvm.org/D119157 from
Nicolas. Thanks.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977



[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-11 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 407904.
jchlanda added a comment.

PTX/sm version tidy up.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -36,6 +36,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f16
 define half @fmin_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -43,6 +44,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f16
 define half @fmin_ftz_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -50,6 +52,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_f16
 define half @fmin_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -57,6 +60,7 @@
 
 ; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f16
 define half @fmin_ftz_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.NaN.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -64,6 +68,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f16x2
 define <2 x half> @fmin_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -71,6 +76,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f16x2
 define <2 x half> @fmin_ftz_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -78,6 +84,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_f16x2
 define <2 x half> @fmin_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -85,6 +92,7 @@
 
 ; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f16x2
 define <2 x half> @fmin_ftz_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.NaN.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -92,6 +100,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_bf16
 define i16 @fmin_xorsign_abs_bf16(i16 %0, i16 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.bf16
   %res = call i16 @llvm.nvvm.fmin.xorsign.abs.bf16(i16 %0, i16 %1)
   ret i16 %res
@@ -99,6 +108,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_bf16
 define i16 @fmin_nan_xorsign_abs_bf16(i16 %0, i16 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.bf16
   %res = call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16(i16 %0, i16 %1)
   ret i16 %res
@@ -106,6 +116,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_bf16x2
 define i32 @fmin_xorsign_abs_bf16x2(i32 %0, i32 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.bf16x2
   %res = call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2(i32 %0, i32 %1)
   ret i32 %res
@@ -113,6 +124,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_bf16x2
 define i32 @fmin_nan_xorsign_abs_bf16x2(i32 %0, i32 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.bf16x2
   %res = call i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2(i32 %0, i32 %1)
   ret i32 %res
@@ -120,6 +132,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f
 define float @fmin_xorsign_abs_f(float %0, float %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f
   %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float %0, float %1)
   ret float %res
@@ -127,6 +140,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f
 define float @fmin_ftz_xorsign_abs_f(float %0, float %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f
   %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float %0, float %1)
   ret float %res
@@ -134,6 +148,7 @@
 
 ; CHECK-LABEL: 

[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-02-09 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda marked 2 inline comments as done.
jchlanda added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp:162
 
 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

tra wrote:
> jchlanda wrote:
> > tra wrote:
> > > The new 3-argument constructor above obviates the need for this one.
> > I'm not sure it does; the 3-way takes `Intrinsic`, while this one takes
> > `Instruction`.
> You're right. Sorry, my mistake.
np


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887



[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-02-09 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp:162
 
 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

tra wrote:
> The new 3-argument constructor above obviates the need for this one.
I'm not sure it does; the 3-way takes `Intrinsic`, while this one takes
`Instruction`.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887



[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-09 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td:937
+class FMA_TUPLE<string V, Intrinsic I, NVPTXRegClass RC, list<Predicate> Preds = [hasPTX70, hasSM80]> {
+  string Variant = V;

tra wrote:
>  I think the default should be the most useful/common and the least
> surprising value.
> I'd argue that in this case it would be `[]`. This would give the reader a
> reasonable idea about what's going on even without looking at the FMA_TUPLE
> implementation.
Agreed, I wrote that class before folding in f32 and f64; `hasPTX70, hasSM80`
made more sense then. Changed now.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977



[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-09 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 407078.
jchlanda added a comment.

Tidy up FMA_TUPLE class.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
  llvm/test/CodeGen/NVPTX/math-intrins.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -30,6 +30,15 @@
 declare float @llvm.fma.f32(float, float, float) #0
 declare double @llvm.fma.f64(double, double, double) #0
 
+declare half @llvm.nvvm.fma.rn.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.sat.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.ftz.sat.f16(half, half, half)
+declare <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)
+
 ;  ceil 
 
 ; CHECK-LABEL: ceil_float
@@ -328,5 +337,69 @@
   ret double %x
 }
 
+; CHECK-LABEL: fma_rn_f16
+define half @fma_rn_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.f16
+  %res = call half @llvm.nvvm.fma.rn.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_f16
+define half @fma_rn_ftz_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.f16
+  %res = call half @llvm.nvvm.fma.rn.ftz.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_sat_f16
+define half @fma_rn_sat_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.sat.f16
+  %res = call half @llvm.nvvm.fma.rn.sat.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_sat_f16
+define half @fma_rn_ftz_sat_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.sat.f16
+  %res = call half @llvm.nvvm.fma.rn.ftz.sat.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_f16x2
+define <2 x half> @fma_rn_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_f16x2
+define <2 x half> @fma_rn_ftz_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_sat_f16x2
+define <2 x half> @fma_rn_sat_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.sat.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_sat_f16x2
+define <2 x half> @fma_rn_ftz_sat_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.sat.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
Index: llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -36,6 +36,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f16
 define half @fmin_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -43,6 +44,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f16
 define half @fmin_ftz_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -50,6 +52,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_f16
 define half @fmin_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half %0, half %1)
   ret half 

[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-08 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

In D118977#3302158, @tra wrote:

> In D118977#3299974, @jchlanda wrote:
>
>>> Target ISA Notes
>>> Requires sm_53 or higher.
>
> I think we do need this constraint applied to the new builtins, too. Right 
> now nothing stops them from being used on a GPU where they do not exist, and 
> that will likely crash the compiler when we fail to find a matching intrinsic.

You are right, it would lead to unexpected crashes. I wrongly assumed that 
since the lowest version guarded against is PTX 6.0/SM_60, anything lower than 
that would be OK. Fixed now.
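
To illustrate the failure mode in device code, here is a sketch of what the
builtin-level guard buys; the builtin name follows this patch, and the
`__CUDA_ARCH__` threshold of 530 for sm_53 is the usual convention, but treat
the details as assumptions:

  __device__ __fp16 fma_f16_guarded(__fp16 a, __fp16 b, __fp16 c) {
  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
    return __nvvm_fma_rn_f16(a, b, c);  // guarded: sm_53 or newer only
  #else
    // Plain expression the backend can lower on any GPU.
    return a * b + c;
  #endif
  }

Without the guard on the builtin itself, nothing stops the sm_53-only call
from reaching instruction selection on, say, sm_35, where there is no
matching instruction to select.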


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977



[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-08 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 406851.
jchlanda added a comment.

Add sm/ptx version guard to f16{x2} builtins.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
  llvm/test/CodeGen/NVPTX/math-intrins.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -30,6 +30,15 @@
 declare float @llvm.fma.f32(float, float, float) #0
 declare double @llvm.fma.f64(double, double, double) #0
 
+declare half @llvm.nvvm.fma.rn.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.sat.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.ftz.sat.f16(half, half, half)
+declare <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)
+
 ;  ceil 
 
 ; CHECK-LABEL: ceil_float
@@ -328,5 +337,69 @@
   ret double %x
 }
 
+; CHECK-LABEL: fma_rn_f16
+define half @fma_rn_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.f16
+  %res = call half @llvm.nvvm.fma.rn.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_f16
+define half @fma_rn_ftz_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.f16
+  %res = call half @llvm.nvvm.fma.rn.ftz.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_sat_f16
+define half @fma_rn_sat_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.sat.f16
+  %res = call half @llvm.nvvm.fma.rn.sat.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_sat_f16
+define half @fma_rn_ftz_sat_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.sat.f16
+  %res = call half @llvm.nvvm.fma.rn.ftz.sat.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_f16x2
+define <2 x half> @fma_rn_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_f16x2
+define <2 x half> @fma_rn_ftz_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_sat_f16x2
+define <2 x half> @fma_rn_sat_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.sat.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_sat_f16x2
+define <2 x half> @fma_rn_ftz_sat_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.sat.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
Index: llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -36,6 +36,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f16
 define half @fmin_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -43,6 +44,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f16
 define half @fmin_ftz_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -50,6 +52,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_f16
 define half @fmin_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half %0, 

[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-06 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

In D118977#3297465, @tra wrote:

>> They all require PTX 7.0, SM_80.
>
> According to 
> https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma
>  only `fma.relu` and `bf16*` variants require ptx70/sm80:
>
>   PTX ISA Notes
>   Introduced in PTX ISA version 4.2.
>   
>   fma.relu.{f16, f16x2} and fma{.relu}.{bf16, bf16x2} introduced in PTX ISA 
> version 7.0.
>   
>   Target ISA Notes
>   Requires sm_53 or higher.
>   
>   fma.relu.{f16, f16x2} and fma{.relu}.{bf16, bf16x2} require sm_80 or higher.

My bad, sorry. Fixed now.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977



[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-06 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 406322.
jchlanda added a comment.

Set correct SM and PTX version.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118977/new/

https://reviews.llvm.org/D118977

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
  llvm/test/CodeGen/NVPTX/math-intrins.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -30,6 +30,15 @@
 declare float @llvm.fma.f32(float, float, float) #0
 declare double @llvm.fma.f64(double, double, double) #0
 
+declare half @llvm.nvvm.fma.rn.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.sat.f16(half, half, half)
+declare half @llvm.nvvm.fma.rn.ftz.sat.f16(half, half, half)
+declare <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half>, <2 x half>, <2 x half>)
+
 ;  ceil 
 
 ; CHECK-LABEL: ceil_float
@@ -328,5 +337,69 @@
   ret double %x
 }
 
+; CHECK-LABEL: fma_rn_f16
+define half @fma_rn_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.f16
+  %res = call half @llvm.nvvm.fma.rn.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_f16
+define half @fma_rn_ftz_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.f16
+  %res = call half @llvm.nvvm.fma.rn.ftz.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_sat_f16
+define half @fma_rn_sat_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.sat.f16
+  %res = call half @llvm.nvvm.fma.rn.sat.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_sat_f16
+define half @fma_rn_ftz_sat_f16(half %0, half %1, half %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.sat.f16
+  %res = call half @llvm.nvvm.fma.rn.ftz.sat.f16(half %0, half %1, half %2)
+  ret half %res
+}
+
+; CHECK-LABEL: fma_rn_f16x2
+define <2 x half> @fma_rn_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_f16x2
+define <2 x half> @fma_rn_ftz_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_sat_f16x2
+define <2 x half> @fma_rn_sat_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.sat.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fma_rn_ftz_sat_f16x2
+define <2 x half> @fma_rn_ftz_sat_f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2) {
+  ; CHECK-NOT: call
+  ; CHECK: fma.rn.ftz.sat.f16x2
+  %res = call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+  ret <2 x half> %res
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
Index: llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -36,6 +36,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f16
 define half @fmin_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -43,6 +44,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f16
 define half @fmin_ftz_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -50,6 +52,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_f16
 define half @fmin_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half %0, half %1)
   

[PATCH] D118977: [NVPTX] Add more FMA intrinsics/builtins

2022-02-04 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
jchlanda added reviewers: tra, csigg, bkramer.
Herald added subscribers: asavonic, hiraditya, jholewinski.
jchlanda requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert.
Herald added projects: clang, LLVM.

This patch adds builtins/intrinsics for the following variants of FMA:

- f16
  - rn
  - rn_ftz
  - rn_sat
  - rn_ftz_sat
  - rn_relu
  - rn_ftz_relu
- f16x2
  - rn
  - rn_ftz
  - rn_sat
  - rn_ftz_sat
  - rn_relu
  - rn_ftz_relu
- bf16
  - rn
  - rn_relu
- bf16x2
  - rn
  - rn_relu

They all require PTX 7.0, SM_80.

`ptxas` (Cuda compilation tools, release 11.0, V11.0.194) is happy with the 
generated assembly.
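
As a usage sketch, here are a few of the variants above as they surface in
device code; the names follow the `__nvvm_fma_*` scheme of the existing
builtins, with bf16 values carried as integers, but treat them as
illustrative:

  __device__ void fma_variants(__fp16 a, __fp16 b, __fp16 c,
                               unsigned short ba, unsigned short bb,
                               unsigned short bc) {
    __nvvm_fma_rn_f16(a, b, c);       // round-to-nearest
    __nvvm_fma_rn_sat_f16(a, b, c);   // result saturated to [0.0, 1.0]
    __nvvm_fma_rn_relu_f16(a, b, c);  // negative results clamped to zero
    __nvvm_fma_rn_bf16(ba, bb, bc);   // bf16 operands, i16-typed
  }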

Depends on D117887


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D118977

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -36,6 +36,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f16
 define half @fmin_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -43,6 +44,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f16
 define half @fmin_ftz_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -50,6 +52,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_f16
 define half @fmin_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -57,6 +60,7 @@
 
 ; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f16
 define half @fmin_ftz_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.NaN.xorsign.abs.f16
   %res = call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half %0, half %1)
   ret half %res
@@ -64,6 +68,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f16x2
 define <2 x half> @fmin_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -71,6 +76,7 @@
 
 ; CHECK-LABEL: fmin_ftz_xorsign_abs_f16x2
 define <2 x half> @fmin_ftz_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -78,6 +84,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_f16x2
 define <2 x half> @fmin_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -85,6 +92,7 @@
 
 ; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f16x2
 define <2 x half> @fmin_ftz_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.ftz.NaN.xorsign.abs.f16x2
   %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
   ret <2 x half> %res
@@ -92,6 +100,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_bf16
 define i16 @fmin_xorsign_abs_bf16(i16 %0, i16 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.bf16
   %res = call i16 @llvm.nvvm.fmin.xorsign.abs.bf16(i16 %0, i16 %1)
   ret i16 %res
@@ -99,6 +108,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_bf16
 define i16 @fmin_nan_xorsign_abs_bf16(i16 %0, i16 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.bf16
   %res = call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16(i16 %0, i16 %1)
   ret i16 %res
@@ -106,6 +116,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_bf16x2
 define i32 @fmin_xorsign_abs_bf16x2(i32 %0, i32 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.bf16x2
   %res = call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2(i32 %0, i32 %1)
   ret i32 %res
@@ -113,6 +124,7 @@
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_bf16x2
 define i32 @fmin_nan_xorsign_abs_bf16x2(i32 %0, i32 %1) {
+  ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.bf16x2
   %res = call i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2(i32 %0, i32 %1)
   ret i32 %res
@@ -120,6 +132,7 @@
 
 ; CHECK-LABEL: fmin_xorsign_abs_f
 define float @fmin_xorsign_abs_f(float %0, float %1) {
+  

[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-02-02 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda marked 2 inline comments as done.
jchlanda added inline comments.



Comment at: clang/test/CodeGen/builtins-nvptx.c:822
+  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.bf16
+  __nvvm_fmin_bf16(0x1234, 0x7FBF);
+  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.nan.bf16

tra wrote:
> I'd `#define` the magic values to give them sensible names.
I've added #defines for those values. They are not strictly needed (as in, the 
values don't really matter, since this code is never executed), but I agree, it 
makes for a better read of the test.
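
A sketch of the approach; the bit patterns are the ones from the test, and
the macro names are illustrative:

  #define BF16_SMALL 0x1234  // an arbitrary finite bf16 bit pattern
  #define BF16_NAN   0x7FBF  // exponent all ones, mantissa non-zero: a NaN

  __device__ void test_fmin_bf16_named() {
    __nvvm_fmin_bf16(BF16_SMALL, BF16_NAN);      // NaN-oblivious variant
    __nvvm_fmin_nan_bf16(BF16_SMALL, BF16_NAN);  // NaN-propagating variant
  }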


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887



[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-02-02 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

In D117887#3262024, @tra wrote:

> Please do check that the generated PTX does get assembled by ptxas.

`ptxas` is happy with the asm generated from both `math-intrins-sm86-ptx72.ll` 
and `math-intrins-sm80-ptx70.ll`.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887



[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-02-02 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda updated this revision to Diff 405161.
jchlanda edited the summary of this revision.
jchlanda added a comment.

Added xorsign.abs variant and test.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -0,0 +1,259 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s
+
+declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half)
+declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half)
+declare half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half, half)
+declare half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half, half)
+declare <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare i16 @llvm.nvvm.fmin.xorsign.abs.bf16(i16, i16)
+declare i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16(i16, i16)
+declare i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2(i32, i32)
+declare i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2(i32, i32)
+declare float @llvm.nvvm.fmin.xorsign.abs.f(float, float)
+declare float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float, float)
+declare float @llvm.nvvm.fmin.nan.xorsign.abs.f(float, float)
+declare float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float, float)
+
+declare half @llvm.nvvm.fmax.xorsign.abs.f16(half, half)
+declare half @llvm.nvvm.fmax.ftz.xorsign.abs.f16(half, half)
+declare half @llvm.nvvm.fmax.nan.xorsign.abs.f16(half, half)
+declare half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16(half, half)
+declare <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
+declare i16 @llvm.nvvm.fmax.xorsign.abs.bf16(i16, i16)
+declare i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16(i16, i16)
+declare i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2(i32, i32)
+declare i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2(i32, i32)
+declare float @llvm.nvvm.fmax.xorsign.abs.f(float, float)
+declare float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float, float)
+declare float @llvm.nvvm.fmax.nan.xorsign.abs.f(float, float)
+declare float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float, float)
+
+; CHECK-LABEL: fmin_xorsign_abs_f16
+define half @fmin_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK: min.xorsign.abs.f16
+  %res = call half @llvm.nvvm.fmin.xorsign.abs.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_ftz_xorsign_abs_f16
+define half @fmin_ftz_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK: min.ftz.xorsign.abs.f16
+  %res = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_nan_xorsign_abs_f16
+define half @fmin_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK: min.NaN.xorsign.abs.f16
+  %res = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f16
+define half @fmin_ftz_nan_xorsign_abs_f16(half %0, half %1) {
+  ; CHECK: min.ftz.NaN.xorsign.abs.f16
+  %res = call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_xorsign_abs_f16x2
+define <2 x half> @fmin_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK: min.xorsign.abs.f16x2
+  %res = call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fmin_ftz_xorsign_abs_f16x2
+define <2 x half> @fmin_ftz_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK: min.ftz.xorsign.abs.f16x2
+  %res = call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fmin_nan_xorsign_abs_f16x2
+define <2 x half> @fmin_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK: min.NaN.xorsign.abs.f16x2
+  %res = call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: 

[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-02-01 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda added a comment.

In D117887#3262024, @tra wrote:

> If you only intended to add instructions available in PTX-7.0, which, based 
> on the constraints used in the patch, appears to be the case, I'd mention 
> that in the commit log.

Yep, we were only going to bump up to 7.0, but I don't mind adding 
`xorsign.abs` while I'm at it. Will update the diff shortly.
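
For context, `min.xorsign.abs` takes the minimum of the absolute values and
gives the result the XOR of the two input signs; a sketch, with the builtin
name assumed from the intrinsic naming:

  __device__ float fmin_xorsign_abs(float x, float y) {
    // E.g. (-3.0f, 2.0f) -> -2.0f: |2| < |3|, and the signs differ.
    return __nvvm_fmin_xorsign_abs_f(x, y);
  }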


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117887/new/

https://reviews.llvm.org/D117887



[PATCH] D117887: [NVPTX] Expose float tys min, max, abs, neg as builtins

2022-01-21 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
jchlanda added reviewers: csigg, tra, bkramer.
Herald added subscribers: asavonic, hiraditya, jholewinski.
jchlanda requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert.
Herald added projects: clang, LLVM.

Adds support for the following builtins:

- abs, neg:
  - bf16,
  - bf16x2
- min, max
  - f16, f16 ftz, f16 nan, f16 nan ftz
  - f16x2, f16x2 ftz, f16x2 nan, f16x2 nan ftz
  - bf16, bf16 nan
  - bf16x2, bf16x2 nan
  - f32 ftz, f32 nan, f32 nan ftz
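
A quick sketch of how a few of the f32 variants in the list above surface as
builtins; the names are assumed from the `__nvvm_fmin`/`__nvvm_fmax` pattern,
with the modifiers as infix tokens:

  __device__ void minmax_f32_variants(float x, float y) {
    __nvvm_fmin_ftz_f(x, y);      // flush subnormal inputs/outputs to zero
    __nvvm_fmax_nan_f(x, y);      // return canonical NaN if either is NaN
    __nvvm_fmin_ftz_nan_f(x, y);  // both modifiers combined
  }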


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D117887

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
@@ -0,0 +1,260 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+
+declare i16 @llvm.nvvm.abs.bf16(i16)
+declare i32 @llvm.nvvm.abs.bf16x2(i32)
+declare i16 @llvm.nvvm.neg.bf16(i16)
+declare i32 @llvm.nvvm.neg.bf16x2(i32)
+
+declare float @llvm.nvvm.fmin.nan.f(float, float)
+declare float @llvm.nvvm.fmin.ftz.nan.f(float, float)
+declare half @llvm.nvvm.fmin.f16(half, half)
+declare half @llvm.nvvm.fmin.ftz.f16(half, half)
+declare half @llvm.nvvm.fmin.nan.f16(half, half)
+declare half @llvm.nvvm.fmin.ftz.nan.f16(half, half)
+declare <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half>, <2 x half>)
+declare i16 @llvm.nvvm.fmin.bf16(i16, i16)
+declare i16 @llvm.nvvm.fmin.nan.bf16(i16, i16)
+declare i32 @llvm.nvvm.fmin.bf16x2(i32, i32)
+declare i32 @llvm.nvvm.fmin.nan.bf16x2(i32, i32)
+
+declare float @llvm.nvvm.fmax.nan.f(float, float)
+declare float @llvm.nvvm.fmax.ftz.nan.f(float, float)
+declare half @llvm.nvvm.fmax.f16(half, half)
+declare half @llvm.nvvm.fmax.ftz.f16(half, half)
+declare half @llvm.nvvm.fmax.nan.f16(half, half)
+declare half @llvm.nvvm.fmax.ftz.nan.f16(half, half)
+declare <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half>, <2 x half>)
+declare i16 @llvm.nvvm.fmax.bf16(i16, i16)
+declare i16 @llvm.nvvm.fmax.nan.bf16(i16, i16)
+declare i32 @llvm.nvvm.fmax.bf16x2(i32, i32)
+declare i32 @llvm.nvvm.fmax.nan.bf16x2(i32, i32)
+
+; CHECK-LABEL: abs_bf16
+define i16 @abs_bf16(i16 %0) {
+  ; CHECK: abs.bf16
+  %res = call i16 @llvm.nvvm.abs.bf16(i16 %0);
+  ret i16 %res
+}
+
+; CHECK-LABEL: abs_bf16x2
+define i32 @abs_bf16x2(i32 %0) {
+  ; CHECK: abs.bf16x2
+  %res = call i32 @llvm.nvvm.abs.bf16x2(i32 %0);
+  ret i32 %res
+}
+
+; CHECK-LABEL: neg_bf16
+define i16 @neg_bf16(i16 %0) {
+  ; CHECK: neg.bf16
+  %res = call i16 @llvm.nvvm.neg.bf16(i16 %0);
+  ret i16 %res
+}
+
+; CHECK-LABEL: neg_bf16x2
+define i32 @neg_bf16x2(i32 %0) {
+  ; CHECK: neg.bf16x2
+  %res = call i32 @llvm.nvvm.neg.bf16x2(i32 %0);
+  ret i32 %res
+}
+
+; CHECK-LABEL: fmin_nan_f
+define float @fmin_nan_f(float %0, float %1) {
+  ; CHECK: min.NaN.f32
+  %res = call float @llvm.nvvm.fmin.nan.f(float %0, float %1);
+  ret float %res
+}
+
+; CHECK-LABEL: fmin_ftz_nan_f
+define float @fmin_ftz_nan_f(float %0, float %1) {
+  ; CHECK: min.ftz.NaN.f32
+  %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1);
+  ret float %res
+}
+
+; CHECK-LABEL: fmin_f16
+define half @fmin_f16(half %0, half %1) {
+  ; CHECK: min.f16
+  %res = call half @llvm.nvvm.fmin.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_ftz_f16
+define half @fmin_ftz_f16(half %0, half %1) {
+  ; CHECK: min.ftz.f16
+  %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_nan_f16
+define half @fmin_nan_f16(half %0, half %1) {
+  ; CHECK: min.NaN.f16
+  %res = call half @llvm.nvvm.fmin.nan.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_ftz_nan_f16
+define half @fmin_ftz_nan_f16(half %0, half %1) {
+  ; CHECK: min.ftz.NaN.f16
+  %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_f16x2
+define <2 x half> @fmin_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK: min.f16x2
+  %res = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> %0, <2 x half> %1)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fmin_ftz_f16x2
+define <2 x 

[PATCH] D117787: [NVPTX] abs, neg, min, max intrinsics for half types

2022-01-20 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda abandoned this revision.
jchlanda added a comment.

I need to rebase this on upstream, as it was based on Intel's fork. 
Abandoning for now.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117787/new/

https://reviews.llvm.org/D117787



[PATCH] D117787: [NVPTX] abs, neg, min, max intrinsics for half types

2022-01-20 Thread Jakub Chlanda via Phabricator via cfe-commits
jchlanda created this revision.
jchlanda added reviewers: jholewinski, steffenlarsen, asavonic, sebastian-ne.
Herald added a subscriber: hiraditya.
jchlanda requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert.
Herald added projects: clang, LLVM.

This patch extends the `abs` and `neg` intrinsics to support the `bf16` and 
`bf16x2` data formats, and extends `min` and `max` to support the `.NaN` 
modifier and the `f16`, `f16x2`, `bf16`, and `bf16x2` data formats.
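
A sketch of the bf16 surface of `abs` and `neg`; since there is no bf16 type
at this point, values travel as integers, and the builtin names here are
assumptions:

  __device__ void absneg_bf16(unsigned short a, unsigned int a2) {
    __nvvm_abs_bf16(a);     // scalar bf16, i16-typed
    __nvvm_neg_bf16x2(a2);  // packed bf16x2 pair, i32-typed
  }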


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D117787

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/test/CodeGen/builtins-nvptx-native-half-type.c
  clang/test/CodeGen/builtins-nvptx.c
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
@@ -0,0 +1,260 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+
+declare i16 @llvm.nvvm.abs.bf16(i16)
+declare i32 @llvm.nvvm.abs.bf16x2(i32)
+declare i16 @llvm.nvvm.neg.bf16(i16)
+declare i32 @llvm.nvvm.neg.bf16x2(i32)
+
+declare float @llvm.nvvm.fmin.nan.f(float, float)
+declare float @llvm.nvvm.fmin.ftz.nan.f(float, float)
+declare half @llvm.nvvm.fmin.f16(half, half)
+declare half @llvm.nvvm.fmin.ftz.f16(half, half)
+declare half @llvm.nvvm.fmin.nan.f16(half, half)
+declare half @llvm.nvvm.fmin.ftz.nan.f16(half, half)
+declare <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half>, <2 x half>)
+declare i16 @llvm.nvvm.fmin.bf16(i16, i16)
+declare i16 @llvm.nvvm.fmin.nan.bf16(i16, i16)
+declare i32 @llvm.nvvm.fmin.bf16x2(i32, i32)
+declare i32 @llvm.nvvm.fmin.nan.bf16x2(i32, i32)
+
+declare float @llvm.nvvm.fmax.nan.f(float, float)
+declare float @llvm.nvvm.fmax.ftz.nan.f(float, float)
+declare half @llvm.nvvm.fmax.f16(half, half)
+declare half @llvm.nvvm.fmax.ftz.f16(half, half)
+declare half @llvm.nvvm.fmax.nan.f16(half, half)
+declare half @llvm.nvvm.fmax.ftz.nan.f16(half, half)
+declare <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half>, <2 x half>)
+declare <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half>, <2 x half>)
+declare i16 @llvm.nvvm.fmax.bf16(i16, i16)
+declare i16 @llvm.nvvm.fmax.nan.bf16(i16, i16)
+declare i32 @llvm.nvvm.fmax.bf16x2(i32, i32)
+declare i32 @llvm.nvvm.fmax.nan.bf16x2(i32, i32)
+
+; CHECK-LABEL: abs_bf16
+define i16 @abs_bf16(i16 %0) {
+  ; CHECK: abs.bf16
+  %res = call i16 @llvm.nvvm.abs.bf16(i16 %0);
+  ret i16 %res
+}
+
+; CHECK-LABEL: abs_bf16x2
+define i32 @abs_bf16x2(i32 %0) {
+  ; CHECK: abs.bf16x2
+  %res = call i32 @llvm.nvvm.abs.bf16x2(i32 %0);
+  ret i32 %res
+}
+
+; CHECK-LABEL: neg_bf16
+define i16 @neg_bf16(i16 %0) {
+  ; CHECK: neg.bf16
+  %res = call i16 @llvm.nvvm.neg.bf16(i16 %0);
+  ret i16 %res
+}
+
+; CHECK-LABEL: neg_bf16x2
+define i32 @neg_bf16x2(i32 %0) {
+  ; CHECK: neg.bf16x2
+  %res = call i32 @llvm.nvvm.neg.bf16x2(i32 %0);
+  ret i32 %res
+}
+
+; CHECK-LABEL: fmin_nan_f
+define float @fmin_nan_f(float %0, float %1) {
+  ; CHECK: min.NaN.f32
+  %res = call float @llvm.nvvm.fmin.nan.f(float %0, float %1);
+  ret float %res
+}
+
+; CHECK-LABEL: fmin_ftz_nan_f
+define float @fmin_ftz_nan_f(float %0, float %1) {
+  ; CHECK: min.ftz.NaN.f32
+  %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1);
+  ret float %res
+}
+
+; CHECK-LABEL: fmin_f16
+define half @fmin_f16(half %0, half %1) {
+  ; CHECK: min.f16
+  %res = call half @llvm.nvvm.fmin.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_ftz_f16
+define half @fmin_ftz_f16(half %0, half %1) {
+  ; CHECK: min.ftz.f16
+  %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_nan_f16
+define half @fmin_nan_f16(half %0, half %1) {
+  ; CHECK: min.NaN.f16
+  %res = call half @llvm.nvvm.fmin.nan.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_ftz_nan_f16
+define half @fmin_ftz_nan_f16(half %0, half %1) {
+  ; CHECK: min.ftz.NaN.f16
+  %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1)
+  ret half %res
+}
+
+; CHECK-LABEL: fmin_f16x2
+define <2 x half> @fmin_f16x2(<2 x half> %0, <2 x half> %1) {
+  ; CHECK: min.f16x2
+  %res = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> %0, <2 x half> %1)
+  ret <2 x half> %res
+}
+
+; CHECK-LABEL: fmin_ftz_f16x2
+define <2 x