Avoid calling malloc where it is easy to use stack storage instead: device-side malloc is very slow in CUDA. This cuts about 60-80 microseconds from target region entry/exit time, reducing the overhead of an empty target region from ~95 to ~17 microseconds (as measured on a GTX Titan).
* config/nvptx/target.c (GOMP_teams): Do not call 'free'. * config/nvptx/team.c (gomp_nvptx_main): Use 'alloca' instead of 'malloc' to obtain storage. Do not call 'free'. * team.c (gomp_free_thread) [__nvptx__]: Do not call 'free'. --- libgomp/ChangeLog.gomp-nvptx | 7 +++++++ libgomp/config/nvptx/target.c | 1 - libgomp/config/nvptx/team.c | 9 +++++---- libgomp/team.c | 4 +++- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c index dbf4710..38ea7f7 100644 --- a/libgomp/config/nvptx/target.c +++ b/libgomp/config/nvptx/target.c @@ -43,7 +43,6 @@ GOMP_teams (unsigned int num_teams, unsigned int thread_limit) else if (block_id >= num_teams) { gomp_free_thread (nvptx_thrs); - free (nvptx_thrs); asm ("exit;"); } gomp_num_teams_var = num_teams - 1; diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c index b9f9f9f..933f5a0 100644 --- a/libgomp/config/nvptx/team.c +++ b/libgomp/config/nvptx/team.c @@ -29,6 +29,7 @@ #include "libgomp.h" #include <stdlib.h> +#include <string.h> struct gomp_thread *nvptx_thrs __attribute__((shared)); @@ -46,10 +47,11 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data) /* Starting additional threads is not supported. 
*/ gomp_global_icv.dyn_var = true; - nvptx_thrs = gomp_malloc_cleared (ntids * sizeof (*nvptx_thrs)); + nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs)); + memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs)); - struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool)); - pool->threads = gomp_malloc (ntids * sizeof (*pool->threads)); + struct gomp_thread_pool *pool = alloca (sizeof (*pool)); + pool->threads = alloca (ntids * sizeof (*pool->threads)); for (tid = 0; tid < ntids; tid++) pool->threads[tid] = nvptx_thrs + tid; pool->threads_size = ntids; @@ -63,7 +65,6 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data) fn (fn_data); gomp_free_thread (nvptx_thrs); - free (nvptx_thrs); } else { diff --git a/libgomp/team.c b/libgomp/team.c index 9a43a10..e301345 100644 --- a/libgomp/team.c +++ b/libgomp/team.c @@ -274,10 +274,12 @@ gomp_free_thread (void *arg __attribute__((unused))) gomp_mutex_unlock (&gomp_managed_threads_lock); #endif } - free (pool->threads); if (pool->last_team) free_team (pool->last_team); +#ifndef __nvptx__ + free (pool->threads); free (pool); +#endif thr->thread_pool = NULL; } if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))