Try to re-use the previous team to avoid the use of malloc() and free()
in the normal case where number of threads is the same. Avoid
superfluous destruction and initialization of team synchronization
objects.
Using the microbenchmark posted here
https://gcc.gnu.org/ml/gcc-patches/2008-03/msg00930.html
shows an improvement in the parallel bench test case (target
x86_64-unknown-linux-gnu, median out of 9 test runs, iteration count
increased to 200000).
Before the patch:
parallel bench 11.2284 seconds
After the patch:
parallel bench 10.7575 seconds
libgomp/ChangeLog
2015-07-07 Sebastian Huber <sebastian.huber@embedded-brains.de>
* libgomp.h (gomp_thread): Add spare_team field.
* team.c (gomp_thread_start): Initialize spare team for non-TLS
targets.
(gomp_new_team): Use spare team if possible.
(free_team): Destroy more team objects.
(gomp_free_thread): Free spare team if necessary.
(free_non_nested_team): New.
(gomp_team_end): Move some team object destructions to
free_team(). Use free_non_nested_team().
---
libgomp/libgomp.h | 3 +++
libgomp/team.c | 63 ++++++++++++++++++++++++++++++++++++-------------------
2 files changed, 45 insertions(+), 21 deletions(-)
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 5ed0f78..563c1e2 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -448,6 +448,9 @@ struct gomp_thread
/* User pthread thread pool */
struct gomp_thread_pool *thread_pool;
+
+ /* Spare team ready for re-use in gomp_new_team() */
+ struct gomp_team *spare_team;
};
diff --git a/libgomp/team.c b/libgomp/team.c
index b98b233..cc19eb0 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -77,6 +77,7 @@ gomp_thread_start (void *xdata)
struct gomp_thread local_thr;
thr = &local_thr;
pthread_setspecific (gomp_tls_key, thr);
+ thr->spare_team = NULL;
#endif
gomp_sem_init (&thr->release, 0);
@@ -140,19 +141,35 @@ gomp_thread_start (void *xdata)
struct gomp_team *
gomp_new_team (unsigned nthreads)
{
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *spare_team = thr->spare_team;
struct gomp_team *team;
- size_t size;
int i;
- size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
- + sizeof (team->implicit_task[0]));
- team = gomp_malloc (size);
+ if (spare_team && spare_team->nthreads == nthreads)
+ {
+ thr->spare_team = NULL;
+ team = spare_team;
+ }
+ else
+ {
+ size_t extra = sizeof (team->ordered_release[0])
+ + sizeof (team->implicit_task[0]);
+ team = gomp_malloc (sizeof (*team) + nthreads * extra);
+
+#ifndef HAVE_SYNC_BUILTINS
+ gomp_mutex_init (&team->work_share_list_free_lock);
+#endif
+ gomp_barrier_init (&team->barrier, nthreads);
+ gomp_sem_init (&team->master_release, 0);
+ gomp_mutex_init (&team->task_lock);
+
+ team->nthreads = nthreads;
+ }
team->work_share_chunk = 8;
#ifdef HAVE_SYNC_BUILTINS
team->single_count = 0;
-#else
- gomp_mutex_init (&team->work_share_list_free_lock);
#endif
team->work_shares_to_free = &team->work_shares[0];
gomp_init_work_share (&team->work_shares[0], false, nthreads);
@@ -163,14 +180,9 @@ gomp_new_team (unsigned nthreads)
team->work_shares[i].next_free = &team->work_shares[i + 1];
team->work_shares[i].next_free = NULL;
- team->nthreads = nthreads;
- gomp_barrier_init (&team->barrier, nthreads);
-
- gomp_sem_init (&team->master_release, 0);
team->ordered_release = (void *) &team->implicit_task[nthreads];
team->ordered_release[0] = &team->master_release;
- gomp_mutex_init (&team->task_lock);
team->task_queue = NULL;
team->task_count = 0;
team->task_queued_count = 0;
@@ -187,6 +199,10 @@ gomp_new_team (unsigned nthreads)
static void
free_team (struct gomp_team *team)
{
+ gomp_sem_destroy (&team->master_release);
+#ifndef HAVE_SYNC_BUILTINS
+ gomp_mutex_destroy (&team->work_share_list_free_lock);
+#endif
gomp_barrier_destroy (&team->barrier);
gomp_mutex_destroy (&team->task_lock);
free (team);
@@ -225,6 +241,8 @@ gomp_free_thread (void *arg __attribute__((unused)))
{
struct gomp_thread *thr = gomp_thread ();
struct gomp_thread_pool *pool = thr->thread_pool;
+ if (thr->spare_team)
+ free_team (thr->spare_team);
if (pool)
{
if (pool->threads_used > 0)
@@ -835,6 +853,18 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
free (affinity_thr);
}
+static void
+free_non_nested_team (struct gomp_team *team, struct gomp_thread *thr)
+{
+ struct gomp_thread_pool *pool = thr->thread_pool;
+ if (pool->last_team)
+ {
+ if (thr->spare_team)
+ free_team (thr->spare_team);
+ thr->spare_team = pool->last_team;
+ }
+ pool->last_team = team;
+}
/* Terminate the current team. This is only to be called by the master
thread. We assume that we must wait for the other threads. */
@@ -894,21 +924,12 @@ gomp_team_end (void)
}
while (ws != NULL);
}
- gomp_sem_destroy (&team->master_release);
-#ifndef HAVE_SYNC_BUILTINS
- gomp_mutex_destroy (&team->work_share_list_free_lock);
-#endif
if (__builtin_expect (thr->ts.team != NULL, 0)
|| __builtin_expect (team->nthreads == 1, 0))
free_team (team);
else
- {
- struct gomp_thread_pool *pool = thr->thread_pool;
- if (pool->last_team)
- free_team (pool->last_team);
- pool->last_team = team;
- }
+ free_non_nested_team (team, thr);
}