wake ((int *) &bar->generation, count == 0 ? INT_MAX : count);
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          state &= ~BAR_WAS_LAST;
+        }
+      else
+        {
+          state &= ~BAR_CANCELLED;
+          state += BAR_INCR - BAR_WAS_LAST;
+          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+          futex_wake ((int *) &bar->generation, INT_MAX);
+          return;
+        }
+    }
+
+  generation = state;
+  state &= ~BAR_CANCELLED;
+  do
+    {
+      do_wait ((int *) &bar->generation, generation);
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+        }
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+                                   gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      /* BAR_CANCELLED should never be set in state here, because
+         cancellation means that at least one of the threads has been
+         cancelled, thus on a cancellable barrier we should never see
+         all threads to arrive.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          state &= ~BAR_WAS_LAST;
+        }
+      else
+        {
+          state += BAR_INCR - BAR_WAS_LAST;
+          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+          futex_wake ((int *) &bar->generation, INT_MAX);
+          return false;
+        }
+    }
+
+  if (__builtin_expect (state & BAR_CANCELLED, 0))
+    return true;
+
+  generation = state;
+  do
+    {
+      do_wait ((int *) &bar->generation, generation);
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_CANCELLED, 0))
+        return true;
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+        }
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  futex_wake ((int *) &team->barrier.generation, INT_MAX);
+}
--
2.8.1
From 2a621905bb91475e792ee1be9f06ea6145df0bc2 Mon Sep 17 00:00:00 2001
From: Chung-Lin Tang
Date: Thu, 1 Sep 2022 07:04:42 -0700
Subject: [PATCH 2/2] openmp/nvptx: use bar.sync/arrive for barriers when
tasking is not used

The nvptx implementation of futex_wait/wake ops, while it enables OpenMP task
behavior in nvptx offloaded regions, can cause quite significant performance
regressions on some benchmarks.
However, when task-related functionality is not used at all by the team inside
an OpenMP target region, and a barrier is just a point where all threads
rejoin (with no waiting tasks to restart), the barrier can be implemented with
the simple bar.sync and bar.arrive PTX instructions, bypassing the
heavy-weight nvptx task machinery (see the sketch below).
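
As a rough sketch of the mechanism (not the patch itself): the helper names
below are hypothetical, and the scaling by 32 assumes the libgomp nvptx
convention that each OpenMP thread occupies a full warp; only the
bar.sync/bar.arrive mnemonics are taken from the PTX ISA.

/* Block until NTHREADS OpenMP threads have arrived at logical barrier 1.
   bar.sync takes a hardware-thread count, which must be a multiple of the
   warp size (32).  */
static inline void
team_barrier_sync (unsigned nthreads)
{
  asm volatile ("bar.sync 1, %0;" : : "r" (32 * nthreads) : "memory");
}

/* Signal arrival at logical barrier 1 without blocking.  */
static inline void
team_barrier_arrive (unsigned nthreads)
{
  asm volatile ("bar.arrive 1, %0;" : : "r" (32 * nthreads) : "memory");
}

A thread that only needs to signal the barrier can use bar.arrive, while the
remaining threads block in bar.sync; once any task has been created, the
futex-based slow path is still needed so that sleeping threads can wake up
and execute pending tasks.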
This patch adds a 'task_never_used' flag inside struct gomp_team, initialized
to true and set to false when tasks are added to the team. The nvptx-specific
g