[PATCH] D151243: [CUDA] Fix wrappers for sm_80 functions
This revision was automatically updated to reflect the committed changes. Closed by commit rG29cb080c363d: [CUDA] Fix wrappers for sm_80 functions (authored by tra). Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D151243/new/ https://reviews.llvm.org/D151243 Files: clang/lib/Headers/__clang_cuda_intrinsics.h Index: clang/lib/Headers/__clang_cuda_intrinsics.h === --- clang/lib/Headers/__clang_cuda_intrinsics.h +++ clang/lib/Headers/__clang_cuda_intrinsics.h @@ -512,70 +512,63 @@ __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) { return __nv_cvta_generic_to_shared_impl(__ptr); } +} // extern "C" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 -__device__ inline unsigned __reduce_add_sync_unsigned_impl(unsigned __mask, - unsigned __value) { - return __nvvm_redux_sync_add(__mask, __value); -} -__device__ inline int __reduce_add_sync_signed_impl(unsigned __mask, -int __value) { +__device__ inline unsigned __reduce_add_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_add(__mask, __value); } -__device__ inline unsigned __reduce_min_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_min_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_umin(__mask, __value); } -__device__ inline unsigned __reduce_max_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_max_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_umax(__mask, __value); } -__device__ inline int __reduce_min_sync_signed_impl(unsigned __mask, -int __value) { +__device__ inline int __reduce_min_sync(unsigned __mask, int __value) { return __nvvm_redux_sync_min(__mask, __value); } -__device__ inline int __reduce_max_sync_signed_impl(unsigned __mask, -int __value) { +__device__ inline int __reduce_max_sync(unsigned __mask, int __value) { return __nvvm_redux_sync_max(__mask, __value); } -__device__ inline unsigned 
__reduce_or_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) { return __nvvm_redux_sync_or(__mask, __value); } -__device__ inline unsigned __reduce_and_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_and_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_and(__mask, __value); } -__device__ inline unsigned __reduce_xor_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_xor_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_xor(__mask, __value); } -__device__ inline void -__nv_memcpy_async_shared_global_4_impl(void *__dst, const void *__src, - unsigned __src_size) { +__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst, + const void *__src, + unsigned __src_size) { __nvvm_cp_async_ca_shared_global_4( (void __attribute__((address_space(3))) *)__dst, (const void __attribute__((address_space(1))) *)__src, __src_size); } -__device__ inline void -__nv_memcpy_async_shared_global_8_impl(void *__dst, const void *__src, - unsigned __src_size) { +__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst, + const void *__src, + unsigned __src_size) { __nvvm_cp_async_ca_shared_global_8( (void __attribute__((address_space(3))) *)__dst, (const void __attribute__((address_space(1))) *)__src, __src_size); } -__device__ inline void -__nv_memcpy_async_shared_global_16_impl(void *__dst, const void *__src, -unsigned __src_size) { +__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst, + const void *__src, + unsign
[PATCH] D151243: [CUDA] Fix wrappers for sm_80 functions
tra created this revision. Herald added subscribers: mattd, carlosgalvezp, bixia, yaxunl. Herald added a project: All. tra published this revision for review. tra added a reviewer: jlebar. Herald added a project: clang. Herald added a subscriber: cfe-commits. The previous implementation provided wrappers for the internal implementations used by the CUDA headers. However, clang does not include those headers, so we need to provide the public functions instead. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D151243 Files: clang/lib/Headers/__clang_cuda_intrinsics.h Index: clang/lib/Headers/__clang_cuda_intrinsics.h === --- clang/lib/Headers/__clang_cuda_intrinsics.h +++ clang/lib/Headers/__clang_cuda_intrinsics.h @@ -512,70 +512,63 @@ __device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) { return __nv_cvta_generic_to_shared_impl(__ptr); } +} // extern "C" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 -__device__ inline unsigned __reduce_add_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_add_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_add(__mask, __value); } -__device__ inline int __reduce_add_sync_signed_impl(unsigned __mask, -int __value) { - return __nvvm_redux_sync_add(__mask, __value); -} -__device__ inline unsigned __reduce_min_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_min_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_umin(__mask, __value); } -__device__ inline unsigned __reduce_max_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_max_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_umax(__mask, __value); } -__device__ inline int __reduce_min_sync_signed_impl(unsigned __mask, -int __value) { +__device__ inline int __reduce_min_sync(unsigned __mask, int __value) { return __nvvm_redux_sync_min(__mask, __value); } -__device__ inline int 
__reduce_max_sync_signed_impl(unsigned __mask, -int __value) { +__device__ inline int __reduce_max_sync(unsigned __mask, int __value) { return __nvvm_redux_sync_max(__mask, __value); } -__device__ inline unsigned __reduce_or_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) { return __nvvm_redux_sync_or(__mask, __value); } -__device__ inline unsigned __reduce_and_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_and_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_and(__mask, __value); } -__device__ inline unsigned __reduce_xor_sync_unsigned_impl(unsigned __mask, - unsigned __value) { +__device__ inline unsigned __reduce_xor_sync(unsigned __mask, + unsigned __value) { return __nvvm_redux_sync_xor(__mask, __value); } -__device__ inline void -__nv_memcpy_async_shared_global_4_impl(void *__dst, const void *__src, - unsigned __src_size) { +__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst, + const void *__src, + unsigned __src_size) { __nvvm_cp_async_ca_shared_global_4( (void __attribute__((address_space(3))) *)__dst, (const void __attribute__((address_space(1))) *)__src, __src_size); } -__device__ inline void -__nv_memcpy_async_shared_global_8_impl(void *__dst, const void *__src, - unsigned __src_size) { +__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst, + const void *__src, + unsigned __src_size) { __nvvm_cp_async_ca_shared_global_8( (void __attribute__((address_space(3))) *)__dst, (const void __attribute__((address_space(1))) *)__src, __src_size); } -__device__ inline void -__nv_memcpy_async_shared_global_16_impl(void *__dst, const void *__src, -unsigned __src_size) {