Author: Artem Belevich Date: 2023-08-18T11:27:36-07:00 New Revision: 72757343fa866b7bfcbaa67edad895297c8cb2c5
URL: https://github.com/llvm/llvm-project/commit/72757343fa866b7bfcbaa67edad895297c8cb2c5 DIFF: https://github.com/llvm/llvm-project/commit/72757343fa866b7bfcbaa67edad895297c8cb2c5.diff LOG: [CUDA/NVPTX] Improve handling of memcpy for -Os compilations. We had some instances when LLVM would not inline fixed-count memcpy and ended up attempting to lower it a a libcall, which would not work on NVPTX as there's no standard library to call. The patch relaxes the threshold used for -Os compilation so we're always allowed to inline memory copy functions. Differential Revision: https://reviews.llvm.org/D158226 Added: clang/test/CodeGenCUDA/memcpy-libcall.cu Modified: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp Removed: ################################################################################ diff --git a/clang/test/CodeGenCUDA/memcpy-libcall.cu b/clang/test/CodeGenCUDA/memcpy-libcall.cu new file mode 100644 index 00000000000000..12844fd9c2bc41 --- /dev/null +++ b/clang/test/CodeGenCUDA/memcpy-libcall.cu @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -O3 -S %s -o - | FileCheck -check-prefix=PTX %s +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -Os -S %s -o - | FileCheck -check-prefix=PTX %s +#include "Inputs/cuda.h" + +// PTX-LABEL: .func _Z12copy_genericPvPKv( +void __device__ copy_generic(void *dest, const void *src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.u8 +// PTX: st.u8 +} + +// PTX-LABEL: .entry _Z11copy_globalPvS_( +void __global__ copy_global(void *dest, void * src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.global.u8 +// PTX: st.global.u8 +} + +struct S { + int data[8]; +}; + +// PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_( +void __global__ copy_param_to_global(S *global, S param) { + __builtin_memcpy(global, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.global.u32 +} + +// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( +void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, + S param) { + __builtin_memcpy(local, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.local.u32 +} + +// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( +void __device__ copy_local_to_generic(S *generic, + __attribute__((address_space(5))) S *src) { + __builtin_memcpy(generic, src, sizeof(S)); +// PTX: ld.local.u32 +// PTX: st.u32 +} + +__shared__ S shared; + +// PTX-LABEL: .entry _Z20copy_param_to_shared1S( +void __global__ copy_param_to_shared( S param) { + __builtin_memcpy(&shared, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.shared.u32 +} + +void __device__ copy_shared_to_generic(S *generic) { + __builtin_memcpy(generic, &shared, sizeof(S)); +// PTX: ld.shared.u32 +// PTX: st.u32 +} diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7823e12d627066..f12f4fe3af33f0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -386,9 +386,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. - MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits