[PATCH] D151243: [CUDA] Fix wrappers for sm_80 functions

2023-05-24 Thread Artem Belevich via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG29cb080c363d: [CUDA] Fix wrappers for sm_80 functions 
(authored by tra).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D151243/new/

https://reviews.llvm.org/D151243

Files:
  clang/lib/Headers/__clang_cuda_intrinsics.h

Index: clang/lib/Headers/__clang_cuda_intrinsics.h
===
--- clang/lib/Headers/__clang_cuda_intrinsics.h
+++ clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -512,70 +512,63 @@
 __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
   return __nv_cvta_generic_to_shared_impl(__ptr);
 }
+} // extern "C"
 
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
-__device__ inline unsigned __reduce_add_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
-  return __nvvm_redux_sync_add(__mask, __value);
-}
-__device__ inline int __reduce_add_sync_signed_impl(unsigned __mask,
-int __value) {
+__device__ inline unsigned __reduce_add_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_add(__mask, __value);
 }
-__device__ inline unsigned __reduce_min_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_min_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_umin(__mask, __value);
 }
-__device__ inline unsigned __reduce_max_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_max_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_umax(__mask, __value);
 }
-__device__ inline int __reduce_min_sync_signed_impl(unsigned __mask,
-int __value) {
+__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
   return __nvvm_redux_sync_min(__mask, __value);
 }
-__device__ inline int __reduce_max_sync_signed_impl(unsigned __mask,
-int __value) {
+__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
   return __nvvm_redux_sync_max(__mask, __value);
 }
-__device__ inline unsigned __reduce_or_sync_unsigned_impl(unsigned __mask,
-  unsigned __value) {
+__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
   return __nvvm_redux_sync_or(__mask, __value);
 }
-__device__ inline unsigned __reduce_and_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_and_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_and(__mask, __value);
 }
-__device__ inline unsigned __reduce_xor_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_xor(__mask, __value);
 }
 
-__device__ inline void
-__nv_memcpy_async_shared_global_4_impl(void *__dst, const void *__src,
-   unsigned __src_size) {
+__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
+ const void *__src,
+ unsigned __src_size) {
   __nvvm_cp_async_ca_shared_global_4(
   (void __attribute__((address_space(3))) *)__dst,
   (const void __attribute__((address_space(1))) *)__src, __src_size);
 }
-__device__ inline void
-__nv_memcpy_async_shared_global_8_impl(void *__dst, const void *__src,
-   unsigned __src_size) {
+__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
+ const void *__src,
+ unsigned __src_size) {
   __nvvm_cp_async_ca_shared_global_8(
   (void __attribute__((address_space(3))) *)__dst,
   (const void __attribute__((address_space(1))) *)__src, __src_size);
 }
-__device__ inline void
-__nv_memcpy_async_shared_global_16_impl(void *__dst, const void *__src,
-unsigned __src_size) {
+__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
+  const void *__src,
+  unsign

[PATCH] D151243: [CUDA] Fix wrappers for sm_80 functions

2023-05-23 Thread Artem Belevich via Phabricator via cfe-commits
tra created this revision.
Herald added subscribers: mattd, carlosgalvezp, bixia, yaxunl.
Herald added a project: All.
tra published this revision for review.
tra added a reviewer: jlebar.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

The previous implementation provided wrappers for the internal implementation
functions used by CUDA headers. However, clang does not include those internal
implementations, so we need to provide the public functions instead.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D151243

Files:
  clang/lib/Headers/__clang_cuda_intrinsics.h

Index: clang/lib/Headers/__clang_cuda_intrinsics.h
===
--- clang/lib/Headers/__clang_cuda_intrinsics.h
+++ clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -512,70 +512,63 @@
 __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
   return __nv_cvta_generic_to_shared_impl(__ptr);
 }
+} // extern "C"
 
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
-__device__ inline unsigned __reduce_add_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_add_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_add(__mask, __value);
 }
-__device__ inline int __reduce_add_sync_signed_impl(unsigned __mask,
-int __value) {
-  return __nvvm_redux_sync_add(__mask, __value);
-}
-__device__ inline unsigned __reduce_min_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_min_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_umin(__mask, __value);
 }
-__device__ inline unsigned __reduce_max_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_max_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_umax(__mask, __value);
 }
-__device__ inline int __reduce_min_sync_signed_impl(unsigned __mask,
-int __value) {
+__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
   return __nvvm_redux_sync_min(__mask, __value);
 }
-__device__ inline int __reduce_max_sync_signed_impl(unsigned __mask,
-int __value) {
+__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
   return __nvvm_redux_sync_max(__mask, __value);
 }
-__device__ inline unsigned __reduce_or_sync_unsigned_impl(unsigned __mask,
-  unsigned __value) {
+__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
   return __nvvm_redux_sync_or(__mask, __value);
 }
-__device__ inline unsigned __reduce_and_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_and_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_and(__mask, __value);
 }
-__device__ inline unsigned __reduce_xor_sync_unsigned_impl(unsigned __mask,
-   unsigned __value) {
+__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
+ unsigned __value) {
   return __nvvm_redux_sync_xor(__mask, __value);
 }
 
-__device__ inline void
-__nv_memcpy_async_shared_global_4_impl(void *__dst, const void *__src,
-   unsigned __src_size) {
+__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
+ const void *__src,
+ unsigned __src_size) {
   __nvvm_cp_async_ca_shared_global_4(
   (void __attribute__((address_space(3))) *)__dst,
   (const void __attribute__((address_space(1))) *)__src, __src_size);
 }
-__device__ inline void
-__nv_memcpy_async_shared_global_8_impl(void *__dst, const void *__src,
-   unsigned __src_size) {
+__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
+ const void *__src,
+ unsigned __src_size) {
   __nvvm_cp_async_ca_shared_global_8(
   (void __attribute__((address_space(3))) *)__dst,
   (const void __attribute__((address_space(1))) *)__src, __src_size);
 }
-__device__ inline void
-__nv_memcpy_async_shared_global_16_impl(void *__dst, const void *__src,
-unsigned __src_size) {