tra created this revision. Herald added subscribers: mattd, gchakrabarti, asavonic, bixia, hiraditya, yaxunl. Herald added a project: All. tra published this revision for review. tra added a reviewer: alexfh. Herald added subscribers: llvm-commits, cfe-commits, wangpc, jholewinski. Herald added projects: clang, LLVM.
We had some instances when LLVM would not inline fixed-count memcpy and ended up attempting to lower it a a libcall, which would not work on NVPTX as there's no standard library to call. The patch relaxes the threshold used for -Os compilation so we're always allowed to inline memory copy functions. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D158226 Files: clang/test/CodeGenCUDA/memcpy-libcall.cu llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -386,9 +386,9 @@ // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. - MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); Index: clang/test/CodeGenCUDA/memcpy-libcall.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/memcpy-libcall.cu @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -O3 -S %s -o - | FileCheck -check-prefix=PTX %s +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -Os -S %s -o - | FileCheck -check-prefix=PTX %s +#include "Inputs/cuda.h" + +// PTX-LABEL: .func _Z12copy_genericPvPKv( +void __device__ copy_generic(void *dest, const void *src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.u8 +// PTX: st.u8 +} + +// PTX-LABEL: .entry _Z11copy_globalPvS_( +void __global__ copy_global(void *dest, void * src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.global.u8 +// PTX: st.global.u8 +} + +struct S { + int data[8]; +}; + +// PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_( +void __global__ copy_param_to_global(S *global, S param) { + __builtin_memcpy(global, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.global.u32 +} + +// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( +void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, + S param) { + __builtin_memcpy(local, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.local.u32 +} + +// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( +void __device__ copy_local_to_generic(S *generic, + __attribute__((address_space(5))) S *src) { + __builtin_memcpy(generic, src, sizeof(S)); +// PTX: ld.local.u32 +// PTX: st.u32 +} + +__shared__ S shared; + +// PTX-LABEL: .entry _Z20copy_param_to_shared1S( +void __global__ copy_param_to_shared( S param) { + __builtin_memcpy(&shared, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.shared.u32 +} + +void __device__ copy_shared_to_generic(S *generic) { + __builtin_memcpy(generic, &shared, sizeof(S)); +// PTX: ld.shared.u32 +// PTX: st.u32 +}
Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -386,9 +386,9 @@ // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. - MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); Index: clang/test/CodeGenCUDA/memcpy-libcall.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/memcpy-libcall.cu @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -O3 -S %s -o - | FileCheck -check-prefix=PTX %s +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -Os -S %s -o - | FileCheck -check-prefix=PTX %s +#include "Inputs/cuda.h" + +// PTX-LABEL: .func _Z12copy_genericPvPKv( +void __device__ copy_generic(void *dest, const void *src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.u8 +// PTX: st.u8 +} + +// PTX-LABEL: .entry _Z11copy_globalPvS_( +void __global__ copy_global(void *dest, void * src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.global.u8 +// PTX: st.global.u8 +} + +struct S { + int data[8]; +}; + +// PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_( +void __global__ copy_param_to_global(S *global, S param) { + __builtin_memcpy(global, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.global.u32 +} + +// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( +void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, + S param) { + __builtin_memcpy(local, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.local.u32 +} + +// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( +void __device__ copy_local_to_generic(S *generic, + __attribute__((address_space(5))) S *src) { + __builtin_memcpy(generic, src, sizeof(S)); +// PTX: ld.local.u32 +// PTX: st.u32 +} + +__shared__ S shared; + +// PTX-LABEL: .entry _Z20copy_param_to_shared1S( +void __global__ copy_param_to_shared( S param) { + __builtin_memcpy(&shared, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.shared.u32 +} + +void __device__ copy_shared_to_generic(S *generic) { + __builtin_memcpy(generic, &shared, sizeof(S)); +// PTX: ld.shared.u32 +// PTX: st.u32 +}
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits