Avoid calling malloc where it is easy to use stack storage instead:
device-side malloc is very slow in CUDA.  This cuts about 60-80 microseconds
from target region entry/exit time, reducing the cost of an empty target
region from ~95 to ~17 microseconds (as measured on a GTX Titan).

        * config/nvptx/target.c (GOMP_teams): Do not call 'free'.
        * config/nvptx/team.c (gomp_nvptx_main): Use 'alloca' instead of
        'malloc' to obtain storage.  Do not call 'free'.
        * team.c (gomp_free_thread) [__nvptx__]: Do not call 'free'.
---
 libgomp/ChangeLog.gomp-nvptx  | 7 +++++++
 libgomp/config/nvptx/target.c | 1 -
 libgomp/config/nvptx/team.c   | 9 +++++----
 libgomp/team.c                | 4 +++-
 4 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c
index dbf4710..38ea7f7 100644
--- a/libgomp/config/nvptx/target.c
+++ b/libgomp/config/nvptx/target.c
@@ -43,7 +43,6 @@ GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
   else if (block_id >= num_teams)
     {
       gomp_free_thread (nvptx_thrs);
-      free (nvptx_thrs);
       asm ("exit;");
     }
   gomp_num_teams_var = num_teams - 1;
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index b9f9f9f..933f5a0 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -29,6 +29,7 @@
 
 #include "libgomp.h"
 #include <stdlib.h>
+#include <string.h>
 
 struct gomp_thread *nvptx_thrs __attribute__((shared));
 
@@ -46,10 +47,11 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
       /* Starting additional threads is not supported.  */
       gomp_global_icv.dyn_var = true;
 
-      nvptx_thrs = gomp_malloc_cleared (ntids * sizeof (*nvptx_thrs));
+      nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
+      memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
 
-      struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool));
-      pool->threads = gomp_malloc (ntids * sizeof (*pool->threads));
+      struct gomp_thread_pool *pool = alloca (sizeof (*pool));
+      pool->threads = alloca (ntids * sizeof (*pool->threads));
       for (tid = 0; tid < ntids; tid++)
        pool->threads[tid] = nvptx_thrs + tid;
       pool->threads_size = ntids;
@@ -63,7 +65,6 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
       fn (fn_data);
 
       gomp_free_thread (nvptx_thrs);
-      free (nvptx_thrs);
     }
   else
     {
diff --git a/libgomp/team.c b/libgomp/team.c
index 9a43a10..e301345 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -274,10 +274,12 @@ gomp_free_thread (void *arg __attribute__((unused)))
          gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
        }
-      free (pool->threads);
       if (pool->last_team)
        free_team (pool->last_team);
+#ifndef __nvptx__
+      free (pool->threads);
       free (pool);
+#endif
       thr->thread_pool = NULL;
     }
   if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))

Reply via email to