Hi,

This patch adds caching for the stack block allocated for offloaded
OpenMP kernel launches on NVPTX. This is a performance optimisation: we
observed an average performance improvement of about 11% with this
patch across a set of accelerated GPU benchmarks on one machine
(results vary with the individual benchmark and with the hardware
used).
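In outline, the scheme pairs an acquire operation (before launch) with
a stream-callback release (queued after launch). The following is a
minimal standalone sketch of that pattern, not the patch itself: the
struct and function names are hypothetical, and error handling is
reduced to exit (), where the plugin really keeps this state in struct
ptx_device and reports failures via GOMP_PLUGIN_fatal.

#include <cuda.h>
#include <pthread.h>
#include <stdlib.h>

struct stacks_cache
{
  CUdeviceptr ptr;	/* Cached device allocation, or 0 if none.  */
  size_t size;		/* Size of that allocation in bytes.  */
  pthread_mutex_t lock;	/* Held from acquire until kernel completion.  */
};

/* Return a block of at least SIZE * NUM bytes, reusing the cached block
   when it is big enough.  The lock is held on return.  */

static CUdeviceptr
stacks_acquire (struct stacks_cache *cache, size_t size, int num)
{
  pthread_mutex_lock (&cache->lock);

  if (cache->ptr && cache->size >= size * num)
    return cache->ptr;	/* Cache hit: reuse the previous block.  */

  if (cache->ptr)
    {
      /* Too small: wait for any kernel still using the old block, then
	 free it.  */
      if (cuCtxSynchronize () != CUDA_SUCCESS
	  || cuMemFree (cache->ptr) != CUDA_SUCCESS)
	exit (EXIT_FAILURE);
    }

  if (cuMemAlloc (&cache->ptr, size * num) != CUDA_SUCCESS)
    exit (EXIT_FAILURE);
  cache->size = size * num;
  return cache->ptr;
}

/* Stream callback: runs on the host once the kernel queued before it
   has completed, releasing the lock for the next launch.  */

static void CUDA_CB
stacks_release (CUstream stream, CUresult res, void *data)
{
  struct stacks_cache *cache = (struct stacks_cache *) data;
  if (res != CUDA_SUCCESS)
    exit (EXIT_FAILURE);  /* Kernel failed (perhaps abort was called).  */
  pthread_mutex_unlock (&cache->lock);
}

After cuLaunchKernel, queueing the callback with
cuStreamAddCallback (stream, stacks_release, cache, 0) lets the host
thread return immediately instead of blocking in cuCtxSynchronize; the
mutex stays held until the kernel finishes, so a concurrent launch
blocks in stacks_acquire rather than reusing a block that is still
live.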
A given kernel launch will reuse the stack block from the previous
launch if it is large enough; otherwise the old block is freed and a
larger one is allocated. A slight caveat is that the memory is not
freed until the device is closed, so code that uses highly variable
launch geometries and large amounts of GPU RAM may run out of resources
slightly sooner with this patch.

The patch also gains performance by omitting the synchronisation at the
end of an OpenMP offload kernel launch: it is safe for the GPU and CPU
to continue executing in parallel at that point, because e.g. copies
back from the device are synchronised properly with kernel completion
anyway. In turn, this necessitates a change to the way "(perhaps abort
was called)" errors are detected and reported.

Tested with offloading to NVPTX. OK for mainline?

Thanks,

Julian

2020-10-26  Julian Brown  <jul...@codesourcery.com>

libgomp/
	* plugin/plugin-nvptx.c (maybe_abort_message): Add function.
	(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use above function.
	(struct ptx_device): Add omp_stacks struct.
	(nvptx_open_device): Initialise cached-stacks housekeeping info.
	(nvptx_close_device): Free cached stacks block and mutex.
	(nvptx_stacks_alloc): Rename to...
	(nvptx_stacks_acquire): This.  Cache stacks block between runs if
	same size or smaller is required.
	(nvptx_stacks_free): Rename to...
	(nvptx_stacks_release): This.  Do not free stacks block, but release
	mutex.
	(GOMP_OFFLOAD_run): Adjust for changes to above functions, and remove
	special-case "abort" error handling and synchronisation after kernel
	launch.
---
 libgomp/plugin/plugin-nvptx.c | 91 ++++++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 23 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 11d4ceeae62e..e7ff5d5213e0 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -137,6 +137,15 @@ init_cuda_lib (void)
 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 
+static const char *
+maybe_abort_message (unsigned errmsg)
+{
+  if (errmsg == CUDA_ERROR_LAUNCH_FAILED)
+    return " (perhaps abort was called)";
+  else
+    return "";
+}
+
 /* Convenience macros for the frequently used CUDA library call and
    error handling sequence as well as CUDA library calls that
    do the error checking themselves or don't do it at all.  */
@@ -147,8 +156,9 @@ init_cuda_lib (void)
 	= CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
       if (__r != CUDA_SUCCESS)			\
 	{					\
-	  GOMP_PLUGIN_error (#FN " error: %s",	\
-			     cuda_error (__r));	\
+	  GOMP_PLUGIN_error (#FN " error: %s%s",	\
+			     cuda_error (__r),	\
+			     maybe_abort_message (__r));	\
 	  return ERET;				\
 	}					\
     } while (0)
@@ -162,8 +172,9 @@ init_cuda_lib (void)
 	= CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
       if (__r != CUDA_SUCCESS)			\
 	{					\
-	  GOMP_PLUGIN_fatal (#FN " error: %s",	\
-			     cuda_error (__r));	\
+	  GOMP_PLUGIN_fatal (#FN " error: %s%s",	\
+			     cuda_error (__r),	\
+			     maybe_abort_message (__r));	\
 	}					\
     } while (0)
@@ -307,6 +318,14 @@ struct ptx_device
   struct ptx_free_block *free_blocks;
   pthread_mutex_t free_blocks_lock;
 
+  /* OpenMP stacks, cached between kernel invocations.  */
+  struct
+  {
+    CUdeviceptr ptr;
+    size_t size;
+    pthread_mutex_t lock;
+  } omp_stacks;
+
   struct ptx_device *next;
 };
 
@@ -514,6 +533,10 @@ nvptx_open_device (int n)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 
+  ptx_dev->omp_stacks.ptr = 0;
+  ptx_dev->omp_stacks.size = 0;
+  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
+
   return ptx_dev;
 }
 
@@ -534,6 +557,11 @@ nvptx_close_device (struct ptx_device *ptx_dev)
   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
+  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
+
+  if (ptx_dev->omp_stacks.ptr)
+    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
+
   if (!ptx_dev->ctx_shared)
     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 
@@ -1866,26 +1894,49 @@ nvptx_stacks_size ()
   return 128 * 1024;
 }
 
-/* Return contiguous storage for NUM stacks, each SIZE bytes.  */
+/* Return contiguous storage for NUM stacks, each SIZE bytes, and obtain the
+   lock for that storage.  */
 
 static void *
-nvptx_stacks_alloc (size_t size, int num)
+nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
 {
-  CUdeviceptr stacks;
-  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
+  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+
+  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
+    return (void *) ptx_dev->omp_stacks.ptr;
+
+  /* Free the old, too-small stacks.  */
+  if (ptx_dev->omp_stacks.ptr)
+    {
+      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
+      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+    }
+
+  /* Make new and bigger stacks, and remember where we put them and how big
+     they are.  */
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
+				  size * num);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
-  return (void *) stacks;
+
+  ptx_dev->omp_stacks.size = size * num;
+
+  return (void *) ptx_dev->omp_stacks.ptr;
 }
 
-/* Release storage previously allocated by nvptx_stacks_alloc.  */
+/* Release the lock associated with a ptx_device's OpenMP stacks block.  */
 
 static void
-nvptx_stacks_free (void *p, int num)
+nvptx_stacks_release (CUstream stream, CUresult res, void *ptr)
 {
-  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+  if (res != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
+  struct ptx_device *ptx_dev = (struct ptx_device *) ptr;
+  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
 }
 
 void
@@ -1898,7 +1949,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   const char *fn_name = launch->fn;
   CUresult r;
   struct ptx_device *ptx_dev = ptx_devices[ord];
-  const char *maybe_abort_msg = "(perhaps abort was called)";
   int teams = 0, threads = 0;
 
   if (!args)
@@ -1922,7 +1972,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
   size_t stack_size = nvptx_stacks_size ();
-  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
   size_t fn_args_size = sizeof fn_args;
   void *config[] = {
@@ -1938,13 +1988,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
-  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (r == CUDA_ERROR_LAUNCH_FAILED)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
-		       maybe_abort_msg);
-  else if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-  nvptx_stacks_free (stacks, teams * threads);
+  CUDA_CALL_ASSERT (cuStreamAddCallback, NULL, nvptx_stacks_release,
+		    (void *) ptx_dev, 0);
 }
 
 /* TODO: Implement GOMP_OFFLOAD_async_run.  */
-- 
2.28.0