https://gcc.gnu.org/g:0c917e4821cf4b3dc46e703e7f1fcbf9dd7f0f13
commit 0c917e4821cf4b3dc46e703e7f1fcbf9dd7f0f13 Author: Thomas Schwinge <tschwi...@baylibre.com> Date: Fri May 31 17:04:39 2024 +0200 nvptx offloading: 'GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE' environment variable [PR97384, PR105274] ... as a means to manually set the "native" GPU thread stack size. PR libgomp/97384 PR libgomp/105274 libgomp/ * plugin/cuda-lib.def (cuCtxSetLimit): Add. * plugin/plugin-nvptx.c (nvptx_open_device): Handle 'GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE' environment variable. (cherry picked from commit 0d25989d60d15866ef4737d66e02432f50717255) Diff: --- libgomp/ChangeLog.omp | 6 ++++++ libgomp/plugin/cuda-lib.def | 1 + libgomp/plugin/plugin-nvptx.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index b1c3ec684fd..4021dc46fab 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,3 +1,9 @@ +2024-05-31 Thomas Schwinge <tschwi...@baylibre.com> + + * plugin/cuda-lib.def (cuCtxSetLimit): Add. + * plugin/plugin-nvptx.c (nvptx_open_device): Handle + 'GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE' environment variable. + 2024-06-05 Thomas Schwinge <tschwi...@baylibre.com> * plugin/plugin-nvptx.c (nvptx_do_global_cdtors): New. diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index bd25375c26a..f3aa3fb3639 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -4,6 +4,7 @@ CUDA_ONE_CALL (cuCtxGetCurrent) CUDA_ONE_CALL (cuCtxGetDevice) CUDA_ONE_CALL (cuCtxPopCurrent) CUDA_ONE_CALL (cuCtxPushCurrent) +CUDA_ONE_CALL (cuCtxSetLimit) CUDA_ONE_CALL (cuCtxSynchronize) CUDA_ONE_CALL (cuDeviceGet) CUDA_ONE_CALL (cuDeviceGetAttribute) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 16c69bb4511..84bac0beafd 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -150,6 +150,8 @@ init_cuda_lib (void) #include "secure_getenv.h" +static void notify_var (const char *, const char *); + #undef MIN #undef MAX #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) @@ -341,6 +343,9 @@ struct ptx_device static struct ptx_device **ptx_devices; +/* "Native" GPU thread stack size. */ +static unsigned native_gpu_thread_stack_size = 0; + /* OpenMP kernels reserve a small amount of ".shared" space for use by omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the default is set here. */ @@ -555,6 +560,46 @@ nvptx_open_device (int n) ptx_dev->free_blocks = NULL; pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL); + /* "Native" GPU thread stack size. */ + { + /* This is intentionally undocumented, until we work out a proper, common + scheme (as much as makes sense) between all offload plugins as well + as between nvptx offloading use of "native" stacks for OpenACC vs. + OpenMP "soft stacks" vs. OpenMP '-msoft-stack-reserve-local=[...]'. + + GCN offloading has a 'GCN_STACK_SIZE' environment variable (without + 'GOMP_' prefix): documented; presumably used for all things OpenACC and + OpenMP? Based on GCN command-line option '-mstack-size=[...]' (marked + "obsolete"), that one may be set via a GCN 'mkoffload'-synthesized + 'constructor' function. */ + const char *var_name = "GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE"; + const char *env_var = secure_getenv (var_name); + notify_var (var_name, env_var); + + if (env_var != NULL) + { + char *endptr; + unsigned long val = strtoul (env_var, &endptr, 10); + if (endptr == NULL || *endptr != '\0' + || errno == ERANGE || errno == EINVAL + || val > UINT_MAX) + GOMP_PLUGIN_error ("Error parsing %s", var_name); + else + native_gpu_thread_stack_size = val; + } + } + if (native_gpu_thread_stack_size == 0) + ; /* Zero means use default. */ + else + { + GOMP_PLUGIN_debug (0, "Setting \"native\" GPU thread stack size" + " ('CU_LIMIT_STACK_SIZE') to %u bytes\n", + native_gpu_thread_stack_size); + CUDA_CALL (cuCtxSetLimit, + CU_LIMIT_STACK_SIZE, (size_t) native_gpu_thread_stack_size); + } + + /* OpenMP "soft stacks". */ ptx_dev->omp_stacks.ptr = 0; ptx_dev->omp_stacks.size = 0; pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);