libgomp/ChangeLog:
* barrier.c (GOMP_barrier, GOMP_barrier_cancel): Pass team_id
to barrier functions.
* config/linux/bar.c (gomp_barrier_ensure_last): New.
(gomp_assert_and_increment_flag): New.
(gomp_team_barrier_ensure_last): New.
(gomp_assert_and_increment_cancel_flag): New.
(gomp_team_barrier_ensure_cancel_last): New.
(gomp_barrier_wait_end): Extra assertions.
(gomp_barrier_wait, gomp_team_barrier_wait,
gomp_team_barrier_wait_cancel): Rewrite to use corresponding
gomp_barrier_ensure_last variant.
(gomp_barrier_wait_last): Rewrite for new parameter.
(gomp_team_barrier_wait_end): Update to handle new barrier
flags & new parameter.
(gomp_team_barrier_wait_cancel_end): Likewise.
(gomp_team_barrier_wait_final): Delegate to
gomp_team_barrier_wait.
(gomp_team_barrier_cancel): Rewrite to account for futex_waitv
fallback.
* config/linux/bar.h (struct gomp_barrier_t): Add `threadgens`,
remove `awaited` and `awaited_final`.
(BAR_SECONDARY_ARRIVED): New.
(BAR_SECONDARY_CANCELLABLE_ARRIVED): New.
(BAR_CANCEL_INCR): New.
(BAR_TASK_PENDING): Shift to accommodate new bits.
(BAR_INCR): Shift to accommodate new bits.
(BAR_FLAGS_MASK): New.
(BAR_GEN_MASK): New.
(BAR_BOTH_GENS_MASK): New.
(BAR_CANCEL_GEN_MASK): New.
(BAR_INCREMENT_CANCEL): New.
(PRIMARY_WAITING_TG): New.
(gomp_assert_seenflags): New.
(gomp_barrier_has_space): New.
(gomp_barrier_minimal_reinit): New.
(gomp_barrier_init): Account for threadgens, remove awaited
and awaited_final.
(gomp_barrier_reinit): Remove.
(gomp_barrier_reinit_1): New.
(gomp_barrier_destroy): Free new `threadgens` member.
(gomp_barrier_wait): Add new `id` parameter.
(gomp_barrier_wait_last): Likewise.
(gomp_barrier_wait_end): Likewise.
(gomp_team_barrier_wait): Likewise.
(gomp_team_barrier_wait_final): Likewise.
(gomp_team_barrier_wait_cancel): Likewise.
(gomp_team_barrier_wait_cancel_end): Likewise.
(gomp_barrier_reinit_2): New (dummy) implementation.
(gomp_team_barrier_ensure_last): New declaration.
(gomp_barrier_ensure_last): New declaration.
(gomp_team_barrier_ensure_cancel_last): New declaration.
(gomp_assert_and_increment_flag): New declaration.
(gomp_assert_and_increment_cancel_flag): New declaration.
(gomp_barrier_wait_start): Rewrite.
(gomp_barrier_wait_cancel_start): Rewrite.
(gomp_barrier_wait_final_start): Rewrite.
(gomp_reset_cancellable_primary_threadgen): New.
(gomp_team_barrier_set_task_pending): Make atomic.
(gomp_team_barrier_clear_task_pending): Make atomic.
(gomp_team_barrier_set_waiting_for_tasks): Make atomic.
(gomp_team_barrier_waiting_for_tasks): Make atomic.
(gomp_increment_gen): New to account for cancel/non-cancel
distinction when incrementing generation.
(gomp_team_barrier_done): Use gomp_increment_gen.
(gomp_barrier_state_is_incremented): Use gomp_increment_gen.
(gomp_barrier_has_completed): Load generation atomically.
(gomp_barrier_prepare_reinit): New.
* config/linux/wait.h (FUTEX_32): New.
(spin_count): Extracted from do_spin.
(do_spin): Use spin_count.
* config/posix/bar.c (gomp_barrier_wait,
gomp_team_barrier_wait_end, gomp_team_barrier_wait_cancel_end,
gomp_team_barrier_wait, gomp_team_barrier_wait_cancel): Add and
pass through new parameters.
* config/posix/bar.h (gomp_barrier_wait,
gomp_team_barrier_wait,
gomp_team_barrier_wait_cancel,
gomp_team_barrier_wait_cancel_end, gomp_barrier_wait_start,
gomp_barrier_wait_cancel_start, gomp_team_barrier_wait_final,
gomp_barrier_wait_last, gomp_team_barrier_done): Add and pass
through new id parameter.
(gomp_barrier_prepare_reinit): New.
(gomp_barrier_minimal_reinit): New.
(gomp_barrier_reinit_1): New.
(gomp_barrier_reinit_2): New.
(gomp_barrier_has_space): New.
(gomp_team_barrier_ensure_last): New.
(gomp_team_barrier_ensure_cancel_last): New.
(gomp_reset_cancellable_primary_threadgen): New.
* config/posix/simple-bar.h (gomp_simple_barrier_reinit):
(gomp_simple_barrier_minimal_reinit): New.
(gomp_simple_barrier_reinit_1): New.
(gomp_simple_barrier_wait): Add new parameter.
(gomp_simple_barrier_wait_last): Add new parameter.
(gomp_simple_barrier_prepare_reinit): New.
(gomp_simple_barrier_reinit_2): New.
(gomp_simple_barrier_has_space): New.
* libgomp.h (gomp_assert): New.
(gomp_barrier_handle_tasks): Add new parameter.
* single.c (GOMP_single_copy_start): Pass new parameter.
(GOMP_single_copy_end): Pass new parameter.
* task.c (gomp_barrier_handle_tasks): Add and pass new
parameter through.
(GOMP_taskgroup_end): Pass new parameter.
(GOMP_workshare_task_reduction_unregister): Pass new
parameter.
* team.c (gomp_thread_start): Pass new parameter.
(gomp_free_pool_helper): Likewise.
(gomp_free_thread): Likewise.
(gomp_team_start): Use new API for reinitialising a barrier.
This requires passing information on which threads are being
reused and which are not.
(gomp_team_end): Pass new parameter.
(gomp_pause_pool_helper): Likewise.
(gomp_pause_host): Likewise.
* work.c (gomp_work_share_end): Directly use
`gomp_team_barrier_ensure_last`.
(gomp_work_share_end_cancel): Similar for
gomp_team_barrier_ensure_cancel_last but also ensure that
thread local generations are reset if barrier is cancelled.
* config/linux/futex_waitv.h: New file.
* testsuite/libgomp.c/primary-thread-tasking.c: New test.
Signed-off-by: Matthew Malcomson <[email protected]>
---
libgomp/barrier.c | 4 +-
libgomp/config/linux/bar.c | 660 ++++++++++++++++--
libgomp/config/linux/bar.h | 403 +++++++++--
libgomp/config/linux/futex_waitv.h | 129 ++++
libgomp/config/linux/wait.h | 15 +-
libgomp/config/posix/bar.c | 29 +-
libgomp/config/posix/bar.h | 101 ++-
libgomp/config/posix/simple-bar.h | 39 +-
libgomp/libgomp.h | 11 +-
libgomp/single.c | 4 +-
libgomp/task.c | 12 +-
libgomp/team.c | 127 +++-
.../libgomp.c/primary-thread-tasking.c | 80 +++
libgomp/work.c | 26 +-
14 files changed, 1451 insertions(+), 189 deletions(-)
create mode 100644 libgomp/config/linux/futex_waitv.h
create mode 100644 libgomp/testsuite/libgomp.c/primary-thread-tasking.c
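
A rough standalone sketch of the overall scheme, for review only (not part of the patch): the barrier now keeps one generation word per thread (`bar->threadgens`) instead of the shared `awaited` countdown, and the thread with team id 0 is the one that gathers everyone.  The toy below shows that shape with plain C11 atomics; the `toy_*` names are invented here, and the real code additionally spins for a configurable count, runs pending tasks, handles cancellation and futex-sleeps while gathering.

#include <stdatomic.h>

#define TOY_GEN_INCR 1u

struct toy_bar
{
  unsigned nthreads;
  _Atomic unsigned generation;   /* written only by the primary (id 0) */
  _Atomic unsigned *threadgen;   /* one word per thread, bumped by its owner */
};

static void
toy_bar_wait (struct toy_bar *b, unsigned id)
{
  unsigned gen = atomic_load_explicit (&b->generation, memory_order_acquire);
  /* Announce arrival on our own word (release pairs with the primary's
     acquire load below).  */
  atomic_fetch_add_explicit (&b->threadgen[id], TOY_GEN_INCR,
			     memory_order_release);
  if (id == 0)
    {
      /* Primary: gather every secondary, then publish the new generation.
	 The real code spins, handles pending tasks and futex-waits here.  */
      for (unsigned i = 1; i < b->nthreads; i++)
	while (atomic_load_explicit (&b->threadgen[i], memory_order_acquire)
	       == gen)
	  ;
      atomic_store_explicit (&b->generation, gen + TOY_GEN_INCR,
			     memory_order_release);
    }
  else
    {
      /* Secondary: wait for the primary to publish the new generation.
	 The real code futex-waits on &b->generation instead of spinning.  */
      while (atomic_load_explicit (&b->generation, memory_order_acquire)
	     == gen)
	;
    }
}
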
diff --git a/libgomp/barrier.c b/libgomp/barrier.c
index 244dadd1adb..12e54a1b2b6 100644
--- a/libgomp/barrier.c
+++ b/libgomp/barrier.c
@@ -38,7 +38,7 @@ GOMP_barrier (void)
if (team == NULL)
return;
- gomp_team_barrier_wait (&team->barrier);
+ gomp_team_barrier_wait (&team->barrier, thr->ts.team_id);
}
bool
@@ -50,5 +50,5 @@ GOMP_barrier_cancel (void)
/* The compiler transforms to barrier_cancel when it sees that the
barrier is within a construct that can cancel. Thus we should
never have an orphaned cancellable barrier. */
- return gomp_team_barrier_wait_cancel (&team->barrier);
+ return gomp_team_barrier_wait_cancel (&team->barrier, thr->ts.team_id);
}
diff --git a/libgomp/config/linux/bar.c b/libgomp/config/linux/bar.c
index 9eaec0e5f23..25f7e04dd16 100644
--- a/libgomp/config/linux/bar.c
+++ b/libgomp/config/linux/bar.c
@@ -29,15 +29,76 @@
#include <limits.h>
#include "wait.h"
+#include "futex_waitv.h"
+void
+gomp_barrier_ensure_last (gomp_barrier_t *bar, unsigned id,
+ gomp_barrier_state_t state)
+{
+ gomp_assert (id == 0, "Calling ensure_last in thread %u", id);
+ gomp_assert (!(state & BAR_CANCELLED),
+ "BAR_CANCELLED set when using plain barrier: %u",
state);
+ unsigned tstate = state & BAR_GEN_MASK;
+ struct thread_lock_data *arr = bar->threadgens;
+ for (unsigned i = 1; i < bar->total; i++)
+ {
+ unsigned tmp;
+ do
+ {
+ do_wait ((int *) &arr[i].gen, tstate);
+ tmp = __atomic_load_n (&arr[i].gen, MEMMODEL_ACQUIRE);
+ gomp_assert (tmp == tstate || tmp == (tstate + BAR_INCR),
+ "Thread-local state seen to be %u"
+ " when global gens are %u",
+ tmp, tstate);
+ }
+ while (tmp != (tstate + BAR_INCR));
+ }
+ gomp_assert_seenflags (bar, false);
+}
void
-gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+gomp_assert_and_increment_flag (gomp_barrier_t *bar, unsigned id, unsigned gens)
{
+ /* Our own personal flag, so don't need to atomically read it except in the
+ case of the fallback code for when running on an older Linux kernel
+ (without futex_waitv).  Do need to atomically update it for the primary
+ thread to read and form an acquire-release synchronisation from our thread
+ to the primary thread.  */
+ struct thread_lock_data *arr = bar->threadgens;
+ unsigned orig = __atomic_fetch_add (&arr[id].gen, BAR_INCR, MEMMODEL_RELEASE);
+ futex_wake ((int *) &arr[id].gen, INT_MAX);
+ /* This clause is only to handle the fallback when `futex_waitv` is not
+ available on the kernel we're running on. For the logic of this
+ particular synchronisation see the comment in
+ `gomp_team_barrier_ensure_last`. */
+ unsigned gen = gens & BAR_GEN_MASK;
+ if (__builtin_expect (orig == (gen | PRIMARY_WAITING_TG), 0))
+ {
+ __atomic_fetch_or (&bar->generation, BAR_SECONDARY_ARRIVED,
+ MEMMODEL_RELEASE);
+ futex_wake ((int *) &bar->generation, INT_MAX);
+ }
+ else
+ {
+ gomp_assert (orig == gen, "Original flag %u != generation of
%u\n", orig,
+ gen);
+ }
+}
+
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state,
+ unsigned id)
+{
+ gomp_assert (!(state & BAR_CANCELLED),
+ "BAR_CANCELLED set when using plain barrier: %u",
state);
if (__builtin_expect (state & BAR_WAS_LAST, 0))
{
- /* Next time we'll be awaiting TOTAL threads again. */
- bar->awaited = bar->total;
+ gomp_assert (id == 0, "Id %u believes it is last\n", id);
+ /* Shouldn't have anything modifying bar->generation at this point. */
+ gomp_assert ((bar->generation & ~BAR_BOTH_GENS_MASK) == 0,
+ "flags set in gomp_barrier_wait_end: %u",
+ bar->generation & ~BAR_BOTH_GENS_MASK);
__atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
MEMMODEL_RELEASE);
futex_wake ((int *) &bar->generation, INT_MAX);
@@ -51,9 +112,12 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
}
void
-gomp_barrier_wait (gomp_barrier_t *bar)
+gomp_barrier_wait (gomp_barrier_t *bar, unsigned id)
{
- gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+ gomp_barrier_state_t state = gomp_barrier_wait_start (bar, id);
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ gomp_barrier_ensure_last (bar, id, state);
+ gomp_barrier_wait_end (bar, state, id);
}
/* Like gomp_barrier_wait, except that if the encountering thread
@@ -64,11 +128,29 @@ gomp_barrier_wait (gomp_barrier_t *bar)
the barrier can be safely destroyed. */
void
-gomp_barrier_wait_last (gomp_barrier_t *bar)
+gomp_barrier_wait_last (gomp_barrier_t *bar, unsigned id)
{
- gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
+ gomp_assert (id != 0, "Thread with id %u called
gomp_barrier_wait_last", id);
+ gomp_barrier_wait_start (bar, id);
+ /* N.b. For the intended use of this function we don't actually need to do
+ the below.  `state & BAR_WAS_LAST` should never be non-zero.  The
+ assertion above checks that `id != 0` and that's the only case when this
+ could be true.  However we write down what the code should be if/when
+ `gomp_barrier_wait_last` gets used in a different way.
+
+ If we do want to provide the possibility for non-primary threads to know
+ that this barrier can be destroyed we would need that non-primary thread
+ to wait on the barrier (as it would having called `gomp_barrier_wait`), and
+ the primary thread to signal to it that all other threads have arrived.
+ That would be done with the below (having gotten `state` from the return
+ value of the `gomp_barrier_wait_start` call above):
+
if (state & BAR_WAS_LAST)
- gomp_barrier_wait_end (bar, state);
+ {
+ gomp_barrier_ensure_last (bar, id, state);
+ gomp_barrier_wait_end (bar, state, id);
+ }
+ */
}
void
@@ -77,24 +159,231 @@ gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
futex_wake ((int *) &bar->generation, count == 0 ? INT_MAX : count);
}
+/* We need to have some "ensure we're last, but perform useful work
+ in the meantime" function. This allows the primary thread to
perform useful
+ work while it is waiting on all secondary threads.
+ - Again related to the fundamental difference between this barrier
and the
+ "centralized" barrier where the thread doing the bookkeeping is
+ pre-determined as some "primary" thread and that is not
necessarily the
+ last thread to enter the barrier.
+
+ To do that we loop through checking each of the other threads
flags. If any
+ are not set then we take that opportunity to check the the global
generation
+ number, if there's some task to handle then do so before going
back to
+ checking the remaining thread-local flags.
+
+ This approach means that the cancellable barriers work reasonably
naturally.
+ If we're checking the global generation flag then we can see when
it is
+ cancelled. Hence `gomp_team_barrier_ensure_cancel_last` below
does
+ something very similar to this function here.
+
+ The "wait for wakeup" is a little tricky. When there is nothing
for a
+ thread to do we usually call `futex_wait` on an address. In this
case we
+ want to wait on one of two addresses being changed. In Linux
kernels >=
+ 5.16 there is the `futex_waitv` syscall which provides us exactly
that
+ functionality, but in older kernels we have to implement some
mechanism to
+ emulate the functionality.
+
+ On these older kernels (as a fallback implementation) we do the
following:
+ - Primary fetch_add's the PRIMARY_WAITING_TG bit to the thread-
local
+ generation number of the secondary it's waiting on.
+ - If primary sees that the BAR_INCR was added then secondary
reached
+ barrier as we decided to go to sleep waiting for it.
+ Clear the flag and continue.
+ - Otherwise primary futex_wait's on the generation number.
+ - Past this point the only state that this primary thread will
use
+ to indicate that this particular thread has arrived will be
+ BAR_SECONDARY_ARRIVED set on the barrier-global generation
number.
+ (the increment of the thread-local generation number no
longer means
+ anything to the primary thread, though we do assert that it's
done
+ in a checking build since that is still an invariant that
must hold
+ for later uses of this barrier).
+ - Can get woken up by a new task being added
(BAR_TASK_PENDING).
+ - Can get woken up by BAR_SECONDARY_ARRIVED bit flag in the
global
+ generation number saying "secondary you were waiting on has
+ arrived".
+ - In this case it clears that bit and returns to looking to
see if
+ all secondary threads have arrived.
+ Meanwhile secondary thread:
+ - Checks the value it gets in `gomp_assert_and_increment_flag`.
+ - If it has the PRIMARY_WAITING_TG flag then set
+ BAR_SECONDARY_ARRIVED on global generation number and
futex_wake
+ it.
+ Each secondary thread has to ignore BAR_SECONDARY_ARRIVED in their
loop
+ (will still get woken up by it, just continue around the loop
until it's
+ cleared).
+
+ N.b. using this fallback mechanism is the only place where any
thread
+ modifies a thread-local generation number that is not its own.
This is also
+ the only place where a secondary thread would modify bar-
generation without
+ a lock held. Hence these modifications need a reasonable amount
of care
+ w.r.t. atomicity. */
void
-gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+gomp_team_barrier_ensure_last (gomp_barrier_t *bar, unsigned id,
+ gomp_barrier_state_t state)
+{
+ /* Thought process here:
+ - Until we are the last thread, we are "some thread waiting on the
+   barrier".  Hence we should be doing only a variant of what is done in
+   `gomp_team_barrier_wait_end` for the other threads.
+ - The loop done in that function is:
+   1) Wait on `generation` having some flag.
+   2) When it changes, enforce the acquire-release semantics and re-load.
+   3) If added `BAR_TASK_PENDING`, then handle some tasks.
+   4) Ignore `BAR_WAITING_FOR_TASK` flag.
+ - Hence the loop done in this function is:
+   1) Look through each thread-local generation number.
+      - When this is not incremented, then:
+        1. wait on `generation` having some flag.
+        2. When it does have such a flag, enforce the acquire-release
+           semantics and load.
+        3. If added `BAR_TASK_PENDING` then handle some tasks.
+        4. Ignoring `BAR_WAITING_FOR_TASK` is not necessary because it's us
+           that would set that flag.
+ Differences otherwise are that we put pausing in different positions.  */
+ gomp_assert (id == 0, "Calling ensure_last in thread %u\n", id);
+ unsigned gstate = state & (BAR_BOTH_GENS_MASK | BAR_CANCELLED);
+ unsigned tstate = state & BAR_GEN_MASK;
+ struct thread_lock_data *arr = bar->threadgens;
+ for (unsigned i = 1; i < bar->total; i++)
+ {
+ unsigned long long j, count = spin_count ();
+
+ wait_on_this_thread:
+ /* Use `j <= count` here just in case `gomp_spin_count_var == 0` (which
+ can happen with OMP_WAIT_POLICY=passive).  We need to go around this
+ loop at least once to check and handle any changes. */
+ for (j = 0; j <= count; j++)
+ {
+ /* Thought about using MEMMODEL_ACQUIRE or MEMMODEL_RELAXED until we
+ see a difference and *then* MEMMODEL_ACQUIRE for the
+ acquire-release semantics.  Idea was that the more relaxed memory
+ model might provide a performance boost.  Did not see any
+ improvement on a micro-benchmark and decided not to do that.
+
+ TODO Question whether to run one loop checking both variables, or
+ two loops each checking one variable multiple times.  Suspect
+ that one loop checking both variables is going to be more
+ responsive and more understandable in terms of performance
+ characteristics when specifying GOMP_SPINCOUNT.
+ However there's the chance that checking each variable
+ something like 10000 times and going around this loop
+ count/10000 times could give better throughput? */
+ unsigned int threadgen
+ = __atomic_load_n (&arr[i].gen, MEMMODEL_ACQUIRE);
+ /* If this thread local generation number has PRIMARY_WAITING_TG set,
+ then we must have set it (primary thread is the only one that sets
+ this flag).  The mechanism by which that would be set is:
+ 1) We dropped into `futex_waitv` while waiting on this thread
+ 2) Some secondary thread woke us (either by setting
+    `BAR_TASK_PENDING` or the thread we were waiting on setting
+    `BAR_SECONDARY_ARRIVED`).
+ 3) We came out of `futex_wake` and started waiting on this thread
+    again.
+ Go into the `bar->generation` clause below to reset state around
+ here and we'll go back to this thing later.  N.b. having
+ `PRIMARY_WAITING_TG` set on a given thread local generation flag
+ means that the "continue" signal for this thread is now
+ `BAR_SECONDARY_ARRIVED` being set on the global generation flag. */
+ if (__builtin_expect (threadgen != tstate, 0)
+ && __builtin_expect (!(threadgen & PRIMARY_WAITING_TG), 1))
+ {
+ gomp_assert (threadgen == (tstate + BAR_INCR),
+ "Thread-local state seen to be %u"
+ " when global state was %u.\n",
+ threadgen, tstate);
+ goto wait_on_next_thread;
+ }
+ /* The only need for MEMMODEL_ACQUIRE below is when using the futex_waitv
+ fallback for older kernels.  When this happens we use the
+ BAR_SECONDARY_ARRIVED flag for synchronisation and need to ensure the
+ acquire-release synchronisation is formed from secondary thread to
+ primary thread as per OpenMP flush requirements on a barrier. */
+ unsigned int gen
+ = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ if (__builtin_expect (gen != gstate, 0))
+ {
+ /* If there are some tasks to perform then perform them. */
+ if (gen & BAR_TASK_PENDING)
+ gomp_barrier_handle_tasks (gstate, false);
+ if (gen & BAR_SECONDARY_ARRIVED)
+ {
+ /* Secondary thread has arrived, clear the flag on the
+ thread local generation and the global generation.
+ Invariants are:
+ - BAR_SECONDARY_ARRIVED is only ever set by the secondary
+   threads and only when their own thread-local generation
+   has the flag PRIMARY_WAITING_TG.
+ - PRIMARY_WAITING_TG flag is only ever set by the primary
+   thread.
+ - Release-Acquire ordering on the generation number means
+   that as far as this thread is concerned the relevant
+   secondary thread must have incremented its generation.
+ - When using the fallback the required "flush" release
+   semantics are provided by the Release-Acquire
+   synchronisation on the generation number (with
+   BAR_SECONDARY_ARRIVED).  Otherwise it's provided by the
+   Release-Acquire ordering on the thread-local generation. */
+ __atomic_fetch_and (&bar->generation, ~BAR_SECONDARY_ARRIVED,
+ MEMMODEL_RELAXED);
+ gomp_assert (
+ (threadgen == ((tstate + BAR_INCR) | PRIMARY_WAITING_TG))
+ || (threadgen == (tstate | PRIMARY_WAITING_TG)),
+ "Thread %d local generation is %d but expected"
+ " PRIMARY_WAITING_TG set because bar->generation"
+ " marked with SECONDARY (%d)",
+ i, threadgen, gen);
+ __atomic_fetch_and (&arr[i].gen, ~PRIMARY_WAITING_TG,
+ MEMMODEL_RELAXED);
+ }
+ /* There should be no other way in which this can be different. */
+ gomp_assert (
+ (gen & BAR_CANCELLED) == (gstate & BAR_CANCELLED),
+ "Unnecessary looping due to mismatching BAR_CANCELLED"
+ " bar->generation: %u state: %u",
+ gen, gstate);
+ gomp_assert (
+ (gen & BAR_CANCELLED) == (gstate & BAR_CANCELLED),
+ "Unnecessary looping due to mismatching BAR_CANCELLED"
+ " bar->generation: %u state: %u",
+ gen, gstate);
+ gomp_assert (
+ !(gen & BAR_WAITING_FOR_TASK),
+ "BAR_WAITING_FOR_TASK set by non-primary thread gen=%d", gen);
+ gomp_assert (
+ !gomp_barrier_state_is_incremented (gen, gstate, BAR_INCR),
+ "Global state incremented by non-primary thread gen=%d", gen);
+ goto wait_on_this_thread;
+ }
+ }
+ /* Neither thread-local generation number nor global generation seem to
+ be changing.  Wait for one of them to change. */
+ futex_waitv ((int *) &arr[i].gen, tstate, (int *) &bar->generation,
+ gstate);
+ /* One of the above values has changed, go back to the start of this loop
+ and we can find out what it was and deal with it accordingly. */
+ goto wait_on_this_thread;
+
+ wait_on_next_thread:
+ continue;
+ }
+ gomp_assert_seenflags (bar, false);
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state,
+ unsigned id)
{
unsigned int generation, gen;
if (__builtin_expect (state & BAR_WAS_LAST, 0))
{
- /* Next time we'll be awaiting TOTAL threads again. */
+ gomp_assert (id == 0, "Id %u believes it is last\n", id);
struct gomp_thread *thr = gomp_thread ();
struct gomp_team *team = thr->ts.team;
-
- bar->awaited = bar->total;
team->work_share_cancelled = 0;
unsigned task_count
= __atomic_load_n (&team->task_count, MEMMODEL_ACQUIRE);
if (__builtin_expect (task_count, 0))
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, false);
state &= ~BAR_WAS_LAST;
}
else
@@ -113,103 +402,376 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
{
do_wait ((int *) &bar->generation, generation);
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ gomp_assert ((gen == state + BAR_INCR)
+ || (gen & BAR_CANCELLED) == (generation & BAR_CANCELLED)
+ /* Can cancel a barrier when already gotten into the final
+ implicit barrier at the end of a parallel loop.
+ This happens in `cancel-parallel-3.c`.
+ In this case the above assertion does not hold because
+ we are waiting on the implicit barrier at the end of a
+ parallel region while some other thread is performing
+ work in that parallel region, hits a
+ `#pragma omp cancel parallel`, and sets said flag. */
+ || !(generation & BAR_CANCELLED),
+ "Unnecessary looping due to BAR_CANCELLED diff"
+ " gen: %u generation: %u id: %u",
+ gen, generation, id);
if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, false);
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
}
+ /* These flags will not change until this barrier is completed.
+ Going forward we don't want to be continually waking up checking for
+ whether this barrier has completed yet.
+
+ If the barrier is cancelled but there are tasks yet to perform then
+ some thread will have used `gomp_barrier_handle_tasks` to go through
+ all tasks and drop them. */
generation |= gen & BAR_WAITING_FOR_TASK;
+ generation |= gen & BAR_CANCELLED;
+ /* Other flags that may be set in `bar->generation` are:
+ 1) BAR_SECONDARY_ARRIVED
+ 2) BAR_SECONDARY_CANCELLABLE_ARRIVED
+ While we want to ignore these, they should be transient and quickly
+ removed, hence we don't adjust our expected `generation` accordingly.
+ TODO Would be good to benchmark both approaches. */
}
- while (!gomp_barrier_state_is_incremented (gen, state));
+ while (!gomp_barrier_state_is_incremented (gen, state, false));
}
void
-gomp_team_barrier_wait (gomp_barrier_t *bar)
+gomp_team_barrier_wait (gomp_barrier_t *bar, unsigned id)
{
- gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+ gomp_barrier_state_t state = gomp_barrier_wait_start (bar, id);
+ if (__builtin_expect (state & BAR_WAS_LAST, 0))
+ gomp_team_barrier_ensure_last (bar, id, state);
+ gomp_team_barrier_wait_end (bar, state, id);
}
void
-gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+gomp_team_barrier_wait_final (gomp_barrier_t *bar, unsigned id)
{
- gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
- if (__builtin_expect (state & BAR_WAS_LAST, 0))
- bar->awaited_final = bar->total;
- gomp_team_barrier_wait_end (bar, state);
+ return gomp_team_barrier_wait (bar, id);
+}
+
+void
+gomp_assert_and_increment_cancel_flag (gomp_barrier_t *bar, unsigned id,
+ unsigned gens)
+{
+ struct thread_lock_data *arr = bar->threadgens;
+ /* Because we have separate thread local generation numbers for cancellable
+ barriers and non-cancellable barriers, we can use fetch_add in the
+ cancellable thread-local generation number and just let the bits roll over
+ into the BAR_INCR area.  This means we don't have to have a CAS loop to
+ handle the futex_waitv_fallback possibility that the primary thread
+ updates our thread-local variable at the same time as we do. */
+ unsigned orig
+ = __atomic_fetch_add (&arr[id].cgen, BAR_CANCEL_INCR, MEMMODEL_RELEASE);
+ futex_wake ((int *) &arr[id].cgen, INT_MAX);
+
+ /* However, using that fetch_add means that we need to mask out the values
+ we're comparing against.  Since this masking is not a memory operation we
+ believe that trade-off is good. */
+ unsigned orig_cgen = orig & (BAR_CANCEL_GEN_MASK | BAR_FLAGS_MASK);
+ unsigned global_cgen = gens & BAR_CANCEL_GEN_MASK;
+ if (__builtin_expect (orig_cgen == (global_cgen | PRIMARY_WAITING_TG), 0))
+ {
+ unsigned prev = __atomic_fetch_or (&bar->generation,
+ BAR_SECONDARY_CANCELLABLE_ARRIVED,
+ MEMMODEL_RELAXED);
+ /* Wait!  The barrier got cancelled, this flag is nothing but annoying
+ state to ignore in the non-cancellable barrier that will be coming up
+ soon and is state that needs to be reset before any following
+ cancellable barrier is called. */
+ if (prev & BAR_CANCELLED)
+ __atomic_fetch_and (&bar->generation,
+ ~BAR_SECONDARY_CANCELLABLE_ARRIVED,
+ MEMMODEL_RELAXED);
+ futex_wake ((int *) &bar->generation, INT_MAX);
+ }
+ else
+ {
+ gomp_assert (orig_cgen == global_cgen,
+ "Id %u: Original flag %u != generation of %u\n",
id,
+ orig_cgen, global_cgen);
+ }
+}
+
+bool
+gomp_team_barrier_ensure_cancel_last (gomp_barrier_t *bar, unsigned id,
+ gomp_barrier_state_t state)
+{
+ gomp_assert (id == 0, "Calling ensure_cancel_last in thread %u\n",
id);
+ unsigned gstate = state & BAR_BOTH_GENS_MASK;
+ unsigned tstate = state & BAR_CANCEL_GEN_MASK;
+ struct thread_lock_data *arr = bar->threadgens;
+ for (unsigned i = 1; i < bar->total; i++)
+ {
+ unsigned long long j, count = spin_count ();
+
+ wait_on_this_thread:
+ for (j = 0; j <= count; j++)
+ {
+ unsigned int threadgen
+ = __atomic_load_n (&arr[i].cgen, MEMMODEL_ACQUIRE);
+ /* Clear "overrun" bits -- spillover into the non-
cancellable
+ generation numbers that we are leaving around in order to
avoid
+ having to perform extra memory operations in this
barrier. */
+ threadgen &= (BAR_CANCEL_GEN_MASK | BAR_FLAGS_MASK);
+
+ if (__builtin_expect (threadgen != tstate, 0)
+ && __builtin_expect (!(threadgen & PRIMARY_WAITING_TG), 1))
+ {
+ gomp_assert (threadgen == BAR_INCREMENT_CANCEL (tstate),
+ "Thread-local state seen to be %u"
+ " when global state was %u.\n",
+ threadgen, tstate);
+ goto wait_on_next_thread;
+ }
+ unsigned int gen
+ = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+ if (__builtin_expect (gen != gstate, 0))
+ {
+ gomp_assert (
+ !(gen & BAR_WAITING_FOR_TASK),
+ "BAR_WAITING_FOR_TASK set in non-primary thread
gen=%d", gen);
+ gomp_assert (!gomp_barrier_state_is_incremented (gen,
gstate,
+
BAR_CANCEL_INCR),
+ "Global state incremented by non-primary
thread "
+ "gstate=%d gen=%d",
+ gstate, gen);
+ if (gen & BAR_CANCELLED)
+ {
+ if (__builtin_expect (threadgen &
PRIMARY_WAITING_TG, 0))
+ __atomic_fetch_and (&arr[i].cgen,
~PRIMARY_WAITING_TG,
+ MEMMODEL_RELAXED);
+ /* Don't check for BAR_SECONDARY_CANCELLABLE_ARRIVED here.
+ There are too many windows for race conditions if
+ resetting here (essentially can't guarantee that we'll
+ catch it so appearing like we might is just confusing).
+ Instead the primary thread has to ignore that flag in the
+ next barrier (which will be a non-cancellable barrier),
+ and we'll eventually reset it either in the thread that
+ set `BAR_CANCELLED` or in the thread that set
+ `BAR_SECONDARY_CANCELLABLE_ARRIVED`. */
+ return false;
+ }
+ if (gen & BAR_SECONDARY_CANCELLABLE_ARRIVED)
+ {
+ __atomic_fetch_and (&bar->generation,
+ ~BAR_SECONDARY_CANCELLABLE_ARRIVED,
+ MEMMODEL_RELAXED);
+ gomp_assert (
+ (threadgen
+ == (BAR_INCREMENT_CANCEL (tstate) | PRIMARY_WAITING_TG))
+ || (threadgen == (tstate | PRIMARY_WAITING_TG)),
+ "Thread %d local generation is %d but expected"
+ " PRIMARY_WAITING_TG set because bar->generation"
+ " marked with SECONDARY (%d)",
+ i, threadgen, gen);
+ __atomic_fetch_and (&arr[i].cgen, ~PRIMARY_WAITING_TG,
+ MEMMODEL_RELAXED);
+ }
+
+ if (gen & BAR_TASK_PENDING)
+ gomp_barrier_handle_tasks (gstate, false);
+ goto wait_on_this_thread;
+ }
+ }
+ futex_waitv ((int *) &arr[i].cgen, tstate, (int *) &bar->generation,
+ gstate);
+ goto wait_on_this_thread;
+
+ wait_on_next_thread:
+ continue;
+ }
+ gomp_assert_seenflags (bar, true);
+ return true;
}
bool
gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
- gomp_barrier_state_t state)
+ gomp_barrier_state_t state, unsigned id)
{
unsigned int generation, gen;
+ gomp_assert (
+ !(state & BAR_CANCELLED),
+ "gomp_team_barrier_wait_cancel_end called when barrier cancelled
state: %u",
+ state);
if (__builtin_expect (state & BAR_WAS_LAST, 0))
{
- /* Next time we'll be awaiting TOTAL threads again. */
- /* BAR_CANCELLED should never be set in state here, because
- cancellation means that at least one of the threads has been
- cancelled, thus on a cancellable barrier we should never see
- all threads to arrive. */
struct gomp_thread *thr = gomp_thread ();
struct gomp_team *team = thr->ts.team;
-
- bar->awaited = bar->total;
team->work_share_cancelled = 0;
unsigned task_count
= __atomic_load_n (&team->task_count, MEMMODEL_ACQUIRE);
if (__builtin_expect (task_count, 0))
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, true);
state &= ~BAR_WAS_LAST;
}
else
{
- state += BAR_INCR - BAR_WAS_LAST;
+ state &= ~BAR_WAS_LAST;
+ state = BAR_INCREMENT_CANCEL (state);
__atomic_store_n (&bar->generation, state,
MEMMODEL_RELEASE);
futex_wake ((int *) &bar->generation, INT_MAX);
return false;
}
}
- if (__builtin_expect (state & BAR_CANCELLED, 0))
- return true;
-
generation = state;
do
{
do_wait ((int *) &bar->generation, generation);
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
if (__builtin_expect (gen & BAR_CANCELLED, 0))
- return true;
+ {
+ if (__builtin_expect ((gen & BAR_CANCEL_GEN_MASK)
+ != (state & BAR_CANCEL_GEN_MASK), 0))
+ {
+ /* Have cancelled a barrier just after completing the current
+ one.  We must not reset our local state. */
+ gomp_assert (
+ (gen & BAR_CANCEL_GEN_MASK)
+ == (BAR_INCREMENT_CANCEL (state) & BAR_CANCEL_GEN_MASK),
+ "Incremented global generation (cancellable) more than one"
+ " gen: %u original state: %u",
+ gen, state);
+ /* Other threads could have continued on in between the time that
+ the primary thread signalled all other threads should wake up
+ and the time that we actually read `bar->generation` above.
+ Any one of them could be performing tasks, also they could be
+ waiting on the next barrier and could have set
+ BAR_SECONDARY{_CANCELLABLE,}_ARRIVED.
+
+ W.r.t. the task handling that could have been set (primary
+ thread increments generation telling us to go, then it
+ continues itself and finds an `omp task` directive and
+ schedules a task) we need to move on with it (PR122314). */
+ gomp_assert (!(gen & BAR_WAITING_FOR_TASK),
+ "Generation incremented while "
+ " main thread is still waiting for tasks:
gen: %u "
+ "original state: %u",
+ gen, state);
+ /* *This* barrier wasn't cancelled -- the next barrier is
+ cancelled.  Returning `false` gives that information back up
+ to calling functions. */
+ return false;
+ }
+ /* Need to reset our thread-local generation.  Don't want state to be
+ messed up the next time we hit a cancellable barrier.
+ Must do this atomically because the cancellation signal could
+ happen from "somewhere else" while the primary thread has decided
+ that it wants to wait on us -- if we're using the fallback for
+ pre-5.16 Linux kernels.
+
+ We are helped by the invariant that when this barrier is
+ cancelled, the next barrier that will be entered is a
+ non-cancellable barrier.  That means we don't have to worry about
+ the primary thread getting confused by this generation being
+ incremented to say it's reached a cancellable barrier.
+
+ We do not reset the PRIMARY_WAITING_TG bit.  That is left to the
+ primary thread.  We only subtract the BAR_CANCEL_INCR that we
+ added before getting here.
+
+ N.b. Another approach to avoid problems with multiple threads
+ modifying this thread-local generation could be for the primary
+ thread to reset the thread-local generations once it's "gathered"
+ all threads during the next non-cancellable barrier.  Such a reset
+ would not need to be atomic because we would know that all threads
+ have already acted on their thread-local generation number.
+
+ That approach mirrors the previous approach where the primary
+ thread would reset `awaited`.  The problem with this is that now
+ we have *many* generations to reset, the primary thread can be the
+ bottleneck in a barrier (it performs the most work) and putting
+ more work into the primary thread while all secondary threads are
+ waiting on it seems problematic.  Moreover outside of the
+ futex_waitv_fallback the primary thread does not adjust the
+ thread-local generations.  Maintaining that property where
+ possible seems very worthwhile. */
+ unsigned orig __attribute__ ((unused))
+ = __atomic_fetch_sub (&bar->threadgens[id].cgen, BAR_CANCEL_INCR,
+ MEMMODEL_RELAXED);
+#if _LIBGOMP_CHECKING_
+ unsigned orig_gen = (orig & BAR_CANCEL_GEN_MASK);
+ unsigned global_gen_plus1
+ = ((gen + BAR_CANCEL_INCR) & BAR_CANCEL_GEN_MASK);
+ gomp_assert (orig_gen == global_gen_plus1
+ || orig_gen == (global_gen_plus1 | PRIMARY_WAITING_TG),
+ "Thread-local generation %u unknown modification:"
+ " expected %u (with possible PRIMARY* flags) seen %u",
+ id, (gen + BAR_CANCEL_INCR) & BAR_CANCEL_GEN_MASK, orig);
+#endif
+ return true;
+ }
if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, true);
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
}
generation |= gen & BAR_WAITING_FOR_TASK;
}
- while (!gomp_barrier_state_is_incremented (gen, state));
+ while (!gomp_barrier_state_is_incremented (gen, state, true));
return false;
}
bool
-gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar, unsigned id)
{
- return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+ gomp_barrier_state_t state = gomp_barrier_wait_cancel_start (bar, id);
+
+ if (__builtin_expect (state & BAR_CANCELLED, 0))
+ return true;
+
+ if (__builtin_expect (state & BAR_WAS_LAST, 0)
+ && !gomp_team_barrier_ensure_cancel_last (bar, id, state))
+ {
+ gomp_reset_cancellable_primary_threadgen (bar, id);
+ return true;
+ }
+ return gomp_team_barrier_wait_cancel_end (bar, state, id);
}
void
gomp_team_barrier_cancel (struct gomp_team *team)
{
- gomp_mutex_lock (&team->task_lock);
- if (team->barrier.generation & BAR_CANCELLED)
- {
- gomp_mutex_unlock (&team->task_lock);
- return;
- }
- team->barrier.generation |= BAR_CANCELLED;
- gomp_mutex_unlock (&team->task_lock);
- futex_wake ((int *) &team->barrier.generation, INT_MAX);
+ /* Always set CANCEL on the barrier.  Means we have one simple atomic
+ operation.  Need an atomic operation because the barrier now uses the
+ `generation` to communicate.
+
+ Don't need to add any memory-ordering here.  The only thing that
+ BAR_CANCELLED signifies is that the barrier is cancelled and the only
+ thing that BAR_SECONDARY_CANCELLABLE_ARRIVED signifies is that the
+ secondary thread has arrived at the barrier.  No thread infers anything
+ about any other data having been set based on these flags. */
+ unsigned orig = __atomic_fetch_or (&team->barrier.generation, BAR_CANCELLED,
+ MEMMODEL_RELAXED);
+ /* We're cancelling a barrier and it's currently using the fallback mechanism
+ instead of the futex_waitv syscall.  We need to ensure that state gets
+ reset before the next cancellable barrier.
+
+ Any cancelled barrier is followed by a non-cancellable barrier so just
+ ensuring this reset happens in one thread allows us to ensure that it
+ happens before any thread reaches the next cancellable barrier.
+
+ This and the check in `gomp_assert_and_increment_cancel_flag` are the two
+ ways in which we ensure `BAR_SECONDARY_CANCELLABLE_ARRIVED` is reset by
+ the time that any thread arrives at the next cancellable barrier.
+
+ We need both since that `BAR_SECONDARY_CANCELLABLE_ARRIVED` flag could be
+ set *just* after this `BAR_CANCELLED` bit gets set (in which case the
+ other case handles resetting) or before (in which case this clause handles
+ resetting). */
+ if (orig & BAR_SECONDARY_CANCELLABLE_ARRIVED)
+ __atomic_fetch_and (&team->barrier.generation,
+ ~BAR_SECONDARY_CANCELLABLE_ARRIVED, MEMMODEL_RELAXED);
+ if (!(orig & BAR_CANCELLED))
+ futex_wake ((int *) &team->barrier.generation, INT_MAX);
}
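
The trickiest part of bar.c above is the pre-5.16 fallback handshake between the gathering primary and a late secondary (PRIMARY_WAITING_TG on the per-thread word, BAR_SECONDARY_ARRIVED on the shared word).  The self-contained toy below is one way to read that protocol; the `toy_*` names are invented here and the spin loop stands in for the futex_wait/futex_wake calls, so it is an illustration rather than the patch's code.

#include <stdatomic.h>

#define TG_PRIMARY_WAITING 1u     /* low flag bit in a per-thread word */
#define TG_INCR 2u                /* per-thread generation increment */
#define GEN_SECONDARY_ARRIVED 1u  /* flag bit in the shared word */

struct toy_fb_bar
{
  _Atomic unsigned shared;        /* word every thread can sleep on */
  _Atomic unsigned perthread[8];  /* owned by its thread, except below */
};

/* Secondary thread ID arrives: bump its own word; if the primary already
   flagged that it went to sleep waiting for us, poke the shared word.  */
static void
toy_secondary_arrive (struct toy_fb_bar *b, unsigned id)
{
  unsigned old = atomic_fetch_add_explicit (&b->perthread[id], TG_INCR,
					    memory_order_release);
  if (old & TG_PRIMARY_WAITING)
    atomic_fetch_or_explicit (&b->shared, GEN_SECONDARY_ARRIVED,
			      memory_order_release);
  /* Real code would futex_wake both words here.  */
}

/* Primary thread, having spun on thread ID without progress (its word still
   equal to TGEN): announce it will sleep, re-check, then wait on the shared
   word instead.  */
static void
toy_primary_sleep_waiting_for (struct toy_fb_bar *b, unsigned id, unsigned tgen)
{
  unsigned old = atomic_fetch_add_explicit (&b->perthread[id],
					    TG_PRIMARY_WAITING,
					    memory_order_relaxed);
  if (old != tgen)
    {
      /* The secondary arrived in the meantime; undo and keep gathering.  */
      atomic_fetch_sub_explicit (&b->perthread[id], TG_PRIMARY_WAITING,
				 memory_order_relaxed);
      return;
    }
  /* Real code futex_waits on &b->shared and also reacts to pending tasks;
     the spin below only models the "woken by the secondary" path.  */
  while (!(atomic_load_explicit (&b->shared, memory_order_acquire)
	   & GEN_SECONDARY_ARRIVED))
    ;
  atomic_fetch_and_explicit (&b->shared, ~GEN_SECONDARY_ARRIVED,
			     memory_order_relaxed);
  atomic_fetch_and_explicit (&b->perthread[id], ~TG_PRIMARY_WAITING,
			     memory_order_relaxed);
}
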
diff --git a/libgomp/config/linux/bar.h b/libgomp/config/linux/bar.h
index faa03746d8f..dbd4b868418 100644
--- a/libgomp/config/linux/bar.h
+++ b/libgomp/config/linux/bar.h
@@ -32,14 +32,31 @@
#include "mutex.h"
+/* Handy to have `cgen` and `gen` separate, since then we can use
+ `__atomic_fetch_add` directly on the `cgen` instead of having to use a CAS
+ loop to increment the cancel generation bits.  If it weren't for the
+ fallback for when `futex_waitv` is not available it wouldn't matter but with
+ that fallback more than one thread can adjust these thread-local generation
+ numbers and hence we have to be concerned about synchronisation issues. */
+struct __attribute__ ((aligned (64))) thread_lock_data
+{
+ unsigned gen;
+ unsigned cgen;
+};
+
typedef struct
{
/* Make sure total/generation is in a mostly read cacheline, while
- awaited in a separate cacheline. */
+ awaited in a separate cacheline.  Each generation structure is in a
+ separate cache line too.  Put both cancellable and non-cancellable
+ generation numbers in the same cache line because they should both be
+ only ever modified by their corresponding thread (except in the case of
+ the primary thread wanting to wait on a given thread arriving at the
+ barrier and we're on an old Linux kernel). */
unsigned total __attribute__((aligned (64)));
+ unsigned allocated;
unsigned generation;
- unsigned awaited __attribute__((aligned (64)));
- unsigned awaited_final;
+ struct thread_lock_data *threadgens;
} gomp_barrier_t;
typedef unsigned int gomp_barrier_state_t;
@@ -48,140 +65,416 @@ typedef unsigned int gomp_barrier_state_t;
low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can
share space because WAS_LAST is never stored back to generation. */
#define BAR_TASK_PENDING 1
+/* In this particular target BAR_WAS_LAST indicates something more like
+ "chosen by design to be last", but I like having the macro keep the same
+ name as it is given in other targets. */
#define BAR_WAS_LAST 1
#define BAR_WAITING_FOR_TASK 2
#define BAR_CANCELLED 4
-#define BAR_INCR 8
+/* BAR_SECONDARY_ARRIVED and PRIMARY_WAITING_TG flags are only used for the
+ fallback approach when `futex_waitv` is not available.  That syscall should
+ be available on all kernels newer than Linux 5.16. */
+#define BAR_SECONDARY_ARRIVED 8
+#define BAR_SECONDARY_CANCELLABLE_ARRIVED 16
+/* Using bits 5 -> 10 for the generation number of cancellable barriers and
+ remaining bits for the generation number of non-cancellable barriers. */
+#define BAR_CANCEL_INCR 32
+#define BAR_INCR 2048
+#define BAR_FLAGS_MASK (~(-BAR_CANCEL_INCR))
+#define BAR_GEN_MASK (-BAR_INCR)
+#define BAR_BOTH_GENS_MASK (-BAR_CANCEL_INCR)
+#define BAR_CANCEL_GEN_MASK (-BAR_CANCEL_INCR & (~(-BAR_INCR)))
+/* Increment by BAR_CANCEL_INCR, with wrapping arithmetic within the bits
+ assigned to this generation number.  I.e. Increment, then set bits above
+ BAR_INCR to what they were before. */
+#define BAR_INCREMENT_CANCEL(X) \
+  ({ \
+    __typeof__ (X) _X = (X); \
+    (((_X + BAR_CANCEL_INCR) & BAR_CANCEL_GEN_MASK) \
+     | (_X & ~BAR_CANCEL_GEN_MASK)); \
+  })
+/* The thread-local generation field similarly contains a counter in the high
+ bits and has a few low bits dedicated to flags.  None of the flags above are
+ used in the thread-local generation field.  Hence we can have a different
+ set of bits for a protocol between the primary thread and the secondary
+ threads. */
+#define PRIMARY_WAITING_TG 1
-static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+static inline void
+gomp_assert_seenflags (gomp_barrier_t *bar, bool cancellable)
{
+#if _LIBGOMP_CHECKING_
+ unsigned gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ struct thread_lock_data *arr = bar->threadgens;
+ unsigned cancel_incr = cancellable ? BAR_CANCEL_INCR : 0;
+ unsigned incr = cancellable ? 0 : BAR_INCR;
+ /* Assert that all threads have been seen. */
+ for (unsigned i = 0; i < bar->total; i++)
+ {
+ gomp_assert (arr[i].gen == (gen & BAR_GEN_MASK) + incr,
+ "Index %u generation is %u (global is %u)\n", i,
arr[i].gen,
+ gen);
+ gomp_assert ((arr[i].cgen & BAR_CANCEL_GEN_MASK)
+ == ((gen + cancel_incr) & BAR_CANCEL_GEN_MASK),
+ "Index %u cancel generation is %u (global is
%u)\n", i,
+ arr[i].cgen, gen);
+ }
+
+ /* Assert that generation numbers not corresponding to any thread are
+ cleared. This helps us test code-paths. */
+ for (unsigned i = bar->total; i < bar->allocated; i++)
+ {
+ gomp_assert (arr[i].gen == 0,
+ "Index %u gen should be 0. Is %u (global gen is
%u)\n", i,
+ arr[i].gen, gen);
+ gomp_assert (arr[i].cgen == 0,
+ "Index %u gen should be 0. Is %u (global gen is
%u)\n", i,
+ arr[i].cgen, gen);
+ }
+#endif
+}
+
+static inline void
+gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+{
+ bar->threadgens
+ = gomp_aligned_alloc (64, sizeof (bar->threadgens[0]) * count);
+ for (unsigned i = 0; i < count; ++i)
+ {
+ bar->threadgens[i].gen = 0;
+ bar->threadgens[i].cgen = 0;
+ }
bar->total = count;
- bar->awaited = count;
- bar->awaited_final = count;
+ bar->allocated = count;
bar->generation = 0;
}
-static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
+static inline bool
+gomp_barrier_has_space (gomp_barrier_t *bar, unsigned nthreads)
{
- __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL);
- bar->total = count;
+ return nthreads <= bar->allocated;
+}
+
+static inline void
+gomp_barrier_minimal_reinit (gomp_barrier_t *bar, unsigned nthreads,
+ unsigned num_new_threads)
+{
+ /* Just increasing number of threads by appending logically used threads at
+ "the end" of the team.  That essentially means we need more of the
+ `bar->threadgens` array to be logically used.  We set them all to the
+ current `generation` (marking that they are yet to hit this generation).
+
+ This function is only called after we have checked there is enough space
+ in this barrier for the number of threads we want using it.  Hence there's
+ no serialisation needed. */
+ gomp_assert (nthreads <= bar->allocated,
+ "minimal reinit on barrier with not enough space: "
+ "%u > %u",
+ nthreads, bar->allocated);
+ unsigned gen = bar->generation & BAR_GEN_MASK;
+ unsigned cancel_gen = bar->generation & BAR_CANCEL_GEN_MASK;
+ gomp_assert (bar->total == nthreads - num_new_threads,
+ "minimal_reinit called with incorrect state: %u != %u -
%u\n",
+ bar->total, nthreads, num_new_threads);
+ for (unsigned i = bar->total; i < nthreads; i++)
+ {
+ bar->threadgens[i].gen = gen;
+ bar->threadgens[i].cgen = cancel_gen;
+ }
+ bar->total = nthreads;
+}
+
+/* When re-initialising a barrier we know the following:
+ 1) We are waiting on a non-cancellable barrier.
+ 2) The cancel generation bits are known consistent (having been tidied up by
+ each individual thread if the barrier got cancelled). */
+static inline void
+gomp_barrier_reinit_1 (gomp_barrier_t *bar, unsigned nthreads,
+ unsigned num_new_threads, unsigned long long *new_ids)
+{
+#if _LIBGOMP_CHECKING_
+ /* Assertions that this barrier is in a sensible state.
+ Everything waiting on the standard barrier.
+ Current thread has not registered itself as arrived, but we tweak for the
+ current assertions. */
+ bar->threadgens[0].gen += BAR_INCR;
+ gomp_assert_seenflags (bar, false);
+ bar->threadgens[0].gen -= BAR_INCR;
+ struct thread_lock_data threadgen_zero = bar->threadgens[0];
+#endif
+ if (!gomp_barrier_has_space (bar, nthreads))
+ {
+ /* Using `realloc` not chosen.  Pros/Cons below.
+ Pros of using `realloc`:
+ - May not actually have to move memory.
+ Cons of using `realloc`:
+ - If we do have to move memory, then it *also* copies data; we are going to
+   overwrite the data in this function.  That copy would be a waste.
+ - If we do have to move memory then the pointer may no longer be aligned.
+   Would need bookkeeping for "pointer to free" and "pointer to have
+   data".
+ Seems like the "bad" case of `realloc` is made even worse by what we need
+ here.  Would have to benchmark to figure out whether using `realloc`
+ or not is best.  Since we shouldn't be re-allocating very often I'm
+ choosing the simplest to code rather than the most optimal.
+
+ It does not matter that we have any existing threads waiting on this
+ barrier.  They are all waiting on bar->generation and their
+ thread-local generation will not be looked at. */
+ gomp_aligned_free (bar->threadgens);
+ bar->threadgens
+ = gomp_aligned_alloc (64, sizeof (bar->threadgens[0]) * nthreads);
+ bar->allocated = nthreads;
+ }
+
+ /* Re-initialise the existing values. */
+ unsigned curgen = bar->generation & BAR_GEN_MASK;
+ unsigned cancel_curgen = bar->generation & BAR_CANCEL_GEN_MASK;
+ unsigned iter_len = nthreads;
+ unsigned bits_per_ull = sizeof (unsigned long long) * CHAR_BIT;
+#if _LIBGOMP_CHECKING_
+ /* If checking, zero out everything that's not going to be used in this team.
+ This is only helpful for debugging (other assertions later can ensure that
+ we've gone through this path for adjusting the number of threads, and when
+ viewing the data structure in GDB we can easily identify which generation
+ numbers are in use).  When not running assertions or running in the
+ debugger these extra numbers are simply not used. */
+ iter_len = bar->allocated;
+ /* In the checking build just unconditionally reinitialise.  This handles
+ when the memory has moved and is harmless (except in performance which the
+ checking build doesn't care about) otherwise. */
+ bar->threadgens[0] = threadgen_zero;
+#endif
+ for (unsigned i = 1; i < iter_len; i++)
+ {
+ /* Re-initialisation. Zero out the "remaining" elements in our
wake flag
+ array when _LIBGOMP_CHECKING_ as a helper for our assertions
to check
+ validity. Set thread-specific generations to "seen" for `i's
+ corresponding to re-used threads, set thread-specific
generations to
+ "not yet seen" for `i's corresponding to threads about to be
+ spawned. */
+ unsigned newthr_val = i < nthreads ? curgen : 0;
+ unsigned newthr_cancel_val = i < nthreads ? cancel_curgen : 0;
+ unsigned index = i / bits_per_ull;
+ unsigned long long bitmask = (1ULL << (i % bits_per_ull));
+ bool bit_is_set = ((new_ids[index] & bitmask) != 0);
+ bar->threadgens[i].gen = bit_is_set ? curgen + BAR_INCR : newthr_val;
+ /* This is different because we only ever call this function while threads
+ are waiting on a non-cancellable barrier.  Hence "which threads have
+ arrived and which will be newly spawned" is not a question. */
+ bar->threadgens[i].cgen = newthr_cancel_val;
+ }
+ bar->total = nthreads;
}
static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
{
+ gomp_aligned_free (bar->threadgens);
}
-extern void gomp_barrier_wait (gomp_barrier_t *);
-extern void gomp_barrier_wait_last (gomp_barrier_t *);
-extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
-extern void gomp_team_barrier_wait (gomp_barrier_t *);
-extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
-extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
- gomp_barrier_state_t);
-extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+static inline void
+gomp_barrier_reinit_2 (gomp_barrier_t __attribute__ ((unused)) * bar,
+ unsigned __attribute__ ((unused)) nthreads) {};
+extern void gomp_barrier_wait (gomp_barrier_t *, unsigned);
+extern void gomp_barrier_wait_last (gomp_barrier_t *, unsigned);
+extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t,
+ unsigned);
+extern void gomp_team_barrier_wait (gomp_barrier_t *, unsigned);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *, unsigned);
+extern void gomp_team_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t,
+ unsigned);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *, unsigned);
extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
- gomp_barrier_state_t);
+ gomp_barrier_state_t, unsigned);
extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
struct gomp_team;
extern void gomp_team_barrier_cancel (struct gomp_team *);
+extern void gomp_team_barrier_ensure_last (gomp_barrier_t *, unsigned,
+ gomp_barrier_state_t);
+extern void gomp_barrier_ensure_last (gomp_barrier_t *, unsigned,
+ gomp_barrier_state_t);
+extern bool gomp_team_barrier_ensure_cancel_last (gomp_barrier_t *, unsigned,
+ gomp_barrier_state_t);
+extern void gomp_assert_and_increment_flag (gomp_barrier_t *, unsigned,
+ unsigned);
+extern void gomp_assert_and_increment_cancel_flag (gomp_barrier_t *, unsigned,
+ unsigned);
static inline gomp_barrier_state_t
-gomp_barrier_wait_start (gomp_barrier_t *bar)
+gomp_barrier_wait_start (gomp_barrier_t *bar, unsigned id)
{
+ /* TODO I don't believe this MEMMODEL_ACQUIRE is needed.
+ Look into it later.  Point being that this should only ever read a value
+ from last barrier or from tasks/cancellation/etc.  There was already an
+ acquire-release ordering at exit of the last barrier, all setting of
+ tasks/cancellation etc are done with RELAXED memory model => using ACQUIRE
+ doesn't help.
+
+ See corresponding comment in `gomp_team_barrier_cancel` when thinking
+ about this. */
unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
- ret &= -BAR_INCR | BAR_CANCELLED;
- /* A memory barrier is needed before exiting from the various forms
- of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
- 2.8.6 flush Construct, which says there is an implicit flush
during
- a barrier region. This is a convenient place to add the
barrier,
- so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE.
*/
- if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0)
+ ret &= (BAR_BOTH_GENS_MASK | BAR_CANCELLED);
+#if !_LIBGOMP_CHECKING_
+ if (id != 0)
+#endif
+ /* Increment local flag.  For thread id 0 this doesn't communicate
+ anything to *other* threads, but it is useful for debugging purposes. */
+ gomp_assert_and_increment_flag (bar, id, ret);
+
+ if (id == 0)
ret |= BAR_WAS_LAST;
return ret;
}
static inline gomp_barrier_state_t
-gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
-{
- return gomp_barrier_wait_start (bar);
-}
-
-/* This is like gomp_barrier_wait_start, except it decrements
- bar->awaited_final rather than bar->awaited and should be used
- for the gomp_team_end barrier only. */
-static inline gomp_barrier_state_t
-gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar, unsigned id)
{
unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
- ret &= -BAR_INCR | BAR_CANCELLED;
- /* See above gomp_barrier_wait_start comment. */
- if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0)
+ ret &= BAR_BOTH_GENS_MASK | BAR_CANCELLED;
+ if (!(ret & BAR_CANCELLED)
+#if !_LIBGOMP_CHECKING_
+ && id != 0
+#endif
+ )
+ gomp_assert_and_increment_cancel_flag (bar, id, ret);
+ if (id == 0)
ret |= BAR_WAS_LAST;
return ret;
}
+static inline void
+gomp_reset_cancellable_primary_threadgen (gomp_barrier_t *bar, unsigned id)
+{
+#if _LIBGOMP_CHECKING_
+ gomp_assert (id == 0,
+ "gomp_reset_cancellable_primary_threadgen called with "
+ "non-primary thread id: %u",
+ id);
+ unsigned orig = __atomic_fetch_sub (&bar->threadgens[id].cgen,
+ BAR_CANCEL_INCR, MEMMODEL_RELAXED);
+ unsigned orig_gen = (orig & BAR_CANCEL_GEN_MASK);
+ unsigned gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ unsigned global_gen_plus1 = ((gen + BAR_CANCEL_INCR) & BAR_CANCEL_GEN_MASK);
+ gomp_assert (orig_gen == global_gen_plus1,
+ "Thread-local generation %u unknown modification:"
+ " expected %u seen %u",
+ id, (gen + BAR_CANCEL_GEN_MASK) & BAR_CANCEL_GEN_MASK, orig);
+#endif
+}
+
static inline bool
gomp_barrier_last_thread (gomp_barrier_state_t state)
{
return state & BAR_WAS_LAST;
}
-/* All the inlines below must be called with team->task_lock
- held. */
+/* All the inlines below must be called with team->task_lock held.  However
+ with the `futex_waitv` fallback there can still be contention on
+ `bar->generation`.  For the RMW operations it is obvious that we need to
+ perform these atomically.  For the load in `gomp_barrier_has_completed` and
+ `gomp_team_barrier_cancelled` the need is tied to the C specification.
+
+ On the architectures I have some grip on (x86_64 and AArch64) there would
+ be no real downside to setting a bit in one thread while reading in another.
+ However the C definition of data race doesn't have any such leeway and to
+ avoid UB we need to load atomically. */
static inline void
gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
{
- bar->generation |= BAR_TASK_PENDING;
+ __atomic_fetch_or (&bar->generation, BAR_TASK_PENDING, MEMMODEL_RELAXED);
}
static inline void
gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
{
- bar->generation &= ~BAR_TASK_PENDING;
+ __atomic_fetch_and (&bar->generation, ~BAR_TASK_PENDING, MEMMODEL_RELAXED);
}
static inline void
gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
{
- bar->generation |= BAR_WAITING_FOR_TASK;
+ __atomic_fetch_or (&bar->generation, BAR_WAITING_FOR_TASK, MEMMODEL_RELAXED);
}
static inline bool
gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
{
- return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+ unsigned gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ return (gen & BAR_WAITING_FOR_TASK) != 0;
}
static inline bool
gomp_team_barrier_cancelled (gomp_barrier_t *bar)
{
- return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+ unsigned gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+ return __builtin_expect ((gen & BAR_CANCELLED) != 0, 0);
+}
+
+static inline unsigned
+gomp_increment_gen (gomp_barrier_state_t state, bool use_cancel)
+{
+ unsigned gens = (state & BAR_BOTH_GENS_MASK);
+ return use_cancel ? BAR_INCREMENT_CANCEL (gens) : gens + BAR_INCR;
}
static inline void
-gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state,
+ bool use_cancel)
{
/* Need the atomic store for acquire-release synchronisation with the
load in `gomp_team_barrier_wait_{cancel_,}end`. See PR112356 */
- __atomic_store_n (&bar->generation, (state & -BAR_INCR) + BAR_INCR,
- MEMMODEL_RELEASE);
+ unsigned next = gomp_increment_gen (state, use_cancel);
+ __atomic_store_n (&bar->generation, next, MEMMODEL_RELEASE);
}
static inline bool
gomp_barrier_state_is_incremented (gomp_barrier_state_t gen,
- gomp_barrier_state_t state)
+ gomp_barrier_state_t state, bool use_cancel)
{
- unsigned next_state = (state & -BAR_INCR) + BAR_INCR;
- return next_state > state ? gen >= next_state : gen < state;
+ unsigned next = gomp_increment_gen (state, use_cancel);
+ return next > state ? gen >= next : gen < state;
}
static inline bool
-gomp_barrier_has_completed (gomp_barrier_state_t state, gomp_barrier_t *bar)
+gomp_barrier_has_completed (gomp_barrier_state_t state, gomp_barrier_t *bar,
+ bool use_cancel)
{
/* Handling overflow in the generation. The "next" state could be
less than
or greater than the current one. */
- return gomp_barrier_state_is_incremented (bar->generation, state);
+ unsigned curgen = __atomic_load_n (&bar->generation,
MEMMODEL_RELAXED);
+ return gomp_barrier_state_is_incremented (curgen, state,
use_cancel);
+}
+
+static inline void
+gomp_barrier_prepare_reinit (gomp_barrier_t *bar, unsigned id)
+{
+ gomp_assert (id == 0,
+ "gomp_barrier_prepare_reinit called in non-primary
thread: %u",
+ id);
+ /* This use of `gomp_barrier_wait_start` is worth noting.
+ 1) We're running in `id == 0`, which means that without checking
we'll
+ essentially just load `bar->generation`.
+ 2) In this case there's no need to form any release-acquire
ordering. The
+ `gomp_barrier_ensure_last` call below will form a release-
acquire
+ ordering between each secondary thread and this one, and that
will be
+ from some point after all uses of the barrier that we care
about.
+ 3) However, in the checking builds, it's very useful to call
+ `gomp_assert_and_increment_flag` in order to provide extra
guarantees
+ about what we're doing. */
+ gomp_barrier_state_t state = gomp_barrier_wait_start (bar, id);
+ gomp_barrier_ensure_last (bar, id, state);
+#if _LIBGOMP_CHECKING_
+ /* When checking, `gomp_assert_and_increment_flag` will have
incremented the
+ generation flag. However later on down the line we'll be
calling the full
+ barrier again and we need to decrement that flag ready for that.
We still
+ *want* the flag to have been incremented above so that the
assertions in
+ `gomp_barrier_ensure_last` all work.
+
+ When not checking, this increment/decrement/increment again
cycle is not
+ performed. */
+ bar->threadgens[0].gen -= BAR_INCR;
+#endif
}
#endif /* GOMP_BARRIER_H */
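
[Editor's note] The wraparound check in gomp_barrier_state_is_incremented above
is easy to get wrong, so here is a small standalone sketch (an illustration,
not part of the patch) of how the `next > state ? gen >= next : gen < state`
comparison behaves once the generation counter wraps.  DEMO_BAR_INCR is a
placeholder value; in the real header the increment is derived from the flag
bits.

#include <stdio.h>
#include <limits.h>

#define DEMO_BAR_INCR (1u << 6)	/* hypothetical increment step */

static int
demo_is_incremented (unsigned gen, unsigned state)
{
  unsigned next = state + DEMO_BAR_INCR;	/* may wrap around zero */
  return next > state ? gen >= next : gen < state;
}

int
main (void)
{
  unsigned state = UINT_MAX - (DEMO_BAR_INCR / 2);	/* about to wrap */
  printf ("same generation, not incremented: %d\n",
	  demo_is_incremented (state, state));
  printf ("incremented past the wrap: %d\n",
	  demo_is_incremented (state + DEMO_BAR_INCR, state));
  printf ("ordinary (non-wrapping) increment: %d\n",
	  demo_is_incremented (0u + DEMO_BAR_INCR, 0u));
  return 0;
}
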
diff --git a/libgomp/config/linux/futex_waitv.h
b/libgomp/config/linux/futex_waitv.h
new file mode 100644
index 00000000000..c9780b5e9f0
--- /dev/null
+++ b/libgomp/config/linux/futex_waitv.h
@@ -0,0 +1,129 @@
+/* Copyright The GNU Toolchain Authors.
+
+ This file is part of the GNU Offloading and Multi Processing
Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your
option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but
WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception,
version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License
and
+ a copy of the GCC Runtime Library Exception along with this
program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not,
see
+ <http://www.gnu.org/licenses/>. */
+
+/* Only defining the interface that we need: `futex_waitv` can take many more
+   addresses, but we only use two.  We define a fallback for when
+   `futex_waitv` is not available.  We do this rather than defining another
+   target in the config/ directory that would look much the same as the linux
+   one except for the parts handling `futex_waitv`, `PRIMARY_WAITING_TG`, and
+   `BAR_SECONDARY_{CANCELLABLE_,}ARRIVED`.  */
+
+#define _GNU_SOURCE
+#include <sys/syscall.h>
+
+#ifdef SYS_futex_waitv
+#pragma GCC visibility push(default)
+
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+
+#pragma GCC visibility pop
+
+struct futex_waitv
+{
+ uint64_t val;
+ uint64_t uaddr;
+ uint32_t flags;
+ uint32_t __reserved;
+};
+
+static inline void
+futex_waitv (int *addr, int val, int *addr2, int val2)
+{
+ gomp_assert (gomp_thread ()->ts.team_id == 0,
+ "Called futex_waitv from secondary thread %d\n",
+ gomp_thread ()->ts.team_id);
+ struct futex_waitv addrs[2];
+ addrs[0].val = val;
+ addrs[0].uaddr = (uint64_t) (uintptr_t) addr;
+ /* Using same internally-defined flags as futex.h does. These are
defined in
+ wait.h. */
+ addrs[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32;
+ addrs[0].__reserved = 0;
+ addrs[1].val = val2;
+ addrs[1].uaddr = (uint64_t) (uintptr_t) addr2;
+ addrs[1].flags = FUTEX_PRIVATE_FLAG | FUTEX_32;
+ addrs[1].__reserved = 0;
+ int err __attribute__ ((unused))
+ = syscall (SYS_futex_waitv, addrs, 2, 0, NULL, CLOCK_MONOTONIC);
+ /* If a signal woke us then we simply leave and let the loop
outside of us
+ handle it. We never require knowledge about whether anything
changed or
+ not. */
+ gomp_assert (err >= 0 || errno == EAGAIN || errno == EINTR,
+ "Failed with futex_waitv err = %d, message: %s", err,
+ strerror (errno));
+}
+
+#else
+
+static inline void
+futex_waitv (int *addr, int val, int *addr2, int val2)
+{
+ int threadlocal
+ = __atomic_fetch_or (addr, PRIMARY_WAITING_TG, MEMMODEL_RELAXED);
+  /* futex_wait can be woken up because of a BAR_TASK_PENDING being set or
+     the like.  In that case we might come back here after checking
+     variables again.  If, in between checking variables and coming back
+     here, the other thread arrived, this variable could have been
+     incremented.  Hence the possible values are:
+     - val
+     - val + BAR_INCR
+     - val | PRIMARY_WAITING_TG
+     - (val + BAR_INCR) | PRIMARY_WAITING_TG
+
+     If the `PRIMARY_WAITING_TG` flag is set, then the trigger for "we can
+     proceed" is now `BAR_SECONDARY_ARRIVED` being set on the generation
+     number.  */
+ if (__builtin_expect (threadlocal != val, 0)
+ && !(threadlocal & PRIMARY_WAITING_TG))
+ {
+      /* The secondary thread reached this point before us.  We know that
+	 the secondary thread will not modify this variable again until we
+	 have "released" it from this barrier, hence we can simply reset the
+	 thread-local variable and continue.
+
+	 It's worth mentioning that this implementation interacts directly
+	 with what is handled in bar.c, which is not a great separation of
+	 concerns.  I believe I need things that way, but it would be nice if
+	 the separation could be made neater.  ??? That might also allow
+	 passing some information down about whether we're working on the
+	 cancellable or non-cancellable generation numbers.  Then we would be
+	 able to restrict the assertion below to the only value that is valid
+	 (for whichever set of generation numbers we have).  */
+ gomp_assert (threadlocal == (val + BAR_INCR)
+ || ((threadlocal & BAR_CANCEL_GEN_MASK)
+ == (BAR_INCREMENT_CANCEL (val) &
BAR_CANCEL_GEN_MASK)),
+ "threadlocal generation number odd: %d (expected
%d)",
+ threadlocal, val);
+ __atomic_store_n (addr, threadlocal, MEMMODEL_RELAXED);
+ return;
+ }
+ futex_wait (addr2, val2);
+}
+
+#endif
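
[Editor's note] For reference, a standalone sketch (not part of the patch) of
calling SYS_futex_waitv directly on a Linux 5.16+ kernel.  The struct layout
and the FUTEX_PRIVATE_FLAG | FUTEX_32 flag combination mirror the header
above, with DEMO_* names standing in for them.  One futex word is set so that
its value mismatches the expected value, so the call returns immediately with
EAGAIN instead of sleeping.

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>
#include <time.h>

#ifdef SYS_futex_waitv

struct demo_waiter
{
  uint64_t val;
  uint64_t uaddr;
  uint32_t flags;
  uint32_t __reserved;
};

#define DEMO_FUTEX_32 2u	/* operate on 32-bit futex words */
#define DEMO_FUTEX_PRIVATE 128u	/* process-private futexes */

int
main (void)
{
  uint32_t a = 0, b = 1;	/* b already differs from the expected 0 */
  struct demo_waiter w[2]
    = { { 0, (uint64_t) (uintptr_t) &a, DEMO_FUTEX_32 | DEMO_FUTEX_PRIVATE, 0 },
	{ 0, (uint64_t) (uintptr_t) &b, DEMO_FUTEX_32 | DEMO_FUTEX_PRIVATE, 0 } };
  long ret = syscall (SYS_futex_waitv, w, 2, 0, NULL, CLOCK_MONOTONIC);
  /* Expect -1 with errno == EAGAIN because w[1] mismatched at call time.  */
  printf ("ret=%ld errno=%d\n", ret, ret < 0 ? errno : 0);
  return 0;
}

#else
int main (void) { return 0; }	/* kernel headers predate futex_waitv */
#endif
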
diff --git a/libgomp/config/linux/wait.h b/libgomp/config/linux/wait.h
index c15e035dd56..c197da5ea24 100644
--- a/libgomp/config/linux/wait.h
+++ b/libgomp/config/linux/wait.h
@@ -35,6 +35,8 @@
#define FUTEX_WAIT 0
#define FUTEX_WAKE 1
+
+#define FUTEX_32 2
#define FUTEX_PRIVATE_FLAG 128
#ifdef HAVE_ATTRIBUTE_VISIBILITY
@@ -45,14 +47,21 @@ extern int gomp_futex_wait, gomp_futex_wake;
#include <futex.h>
-static inline int do_spin (int *addr, int val)
+static inline unsigned long long
+spin_count ()
{
- unsigned long long i, count = gomp_spin_count_var;
-
+ unsigned long long count = gomp_spin_count_var;
if (__builtin_expect (__atomic_load_n (&gomp_managed_threads,
MEMMODEL_RELAXED)
> gomp_available_cpus, 0))
count = gomp_throttled_spin_count_var;
+ return count;
+}
+
+static inline int
+do_spin (int *addr, int val)
+{
+ unsigned long long i, count = spin_count ();
for (i = 0; i < count; i++)
if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) !=
val, 0))
return 0;
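
[Editor's note] As a rough, standalone sketch (illustration only, not libgomp
code) of the spin-then-block pattern that do_spin supports: spin on a relaxed
load for a bounded number of iterations, and only fall back to a heavier wait
once the budget is exhausted.  DEMO_SPIN_COUNT stands in for
gomp_spin_count_var / gomp_throttled_spin_count_var, and sched_yield stands
in for futex_wait.

#include <stdatomic.h>
#include <stdio.h>
#include <sched.h>

#define DEMO_SPIN_COUNT 100000ULL

/* Return 0 as soon as *ADDR differs from VAL, 1 if we gave up spinning.  */
static int
demo_do_spin (_Atomic int *addr, int val)
{
  for (unsigned long long i = 0; i < DEMO_SPIN_COUNT; i++)
    if (atomic_load_explicit (addr, memory_order_relaxed) != val)
      return 0;
  return 1;
}

int
main (void)
{
  _Atomic int flag = 0;
  /* Nothing ever changes FLAG here, so the spin budget is exhausted and we
     take the blocking fallback.  */
  if (demo_do_spin (&flag, 0))
    sched_yield ();	/* stand-in for the futex_wait fallback */
  printf ("spun out, took the blocking fallback\n");
  return 0;
}
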
diff --git a/libgomp/config/posix/bar.c b/libgomp/config/posix/bar.c
index a86b2f38c2d..6c8a4c6d7d2 100644
--- a/libgomp/config/posix/bar.c
+++ b/libgomp/config/posix/bar.c
@@ -105,13 +105,14 @@ gomp_barrier_wait_end (gomp_barrier_t *bar,
gomp_barrier_state_t state)
}
void
-gomp_barrier_wait (gomp_barrier_t *barrier)
+gomp_barrier_wait (gomp_barrier_t *barrier, unsigned id)
{
- gomp_barrier_wait_end (barrier, gomp_barrier_wait_start (barrier));
+ gomp_barrier_wait_end (barrier, gomp_barrier_wait_start (barrier,
id));
}
void
-gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t
state)
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t
state,
+ unsigned id __attribute__ ((unused)))
{
unsigned int n;
@@ -127,7 +128,7 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar,
gomp_barrier_state_t state)
= __atomic_load_n (&team->task_count, MEMMODEL_ACQUIRE);
if (task_count)
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, false);
if (n > 0)
gomp_sem_wait (&bar->sem2);
gomp_mutex_unlock (&bar->mutex1);
@@ -154,7 +155,7 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar,
gomp_barrier_state_t state)
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
if (gen & BAR_TASK_PENDING)
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, false);
gen = __atomic_load_n (&bar->generation,
MEMMODEL_ACQUIRE);
}
}
@@ -175,7 +176,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar,
gomp_barrier_state_t state)
bool
gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
- gomp_barrier_state_t state)
+ gomp_barrier_state_t state,
+ unsigned id __attribute__
((unused)))
{
unsigned int n;
@@ -191,7 +193,7 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t
*bar,
= __atomic_load_n (&team->task_count, MEMMODEL_ACQUIRE);
if (task_count)
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, true);
if (n > 0)
gomp_sem_wait (&bar->sem2);
gomp_mutex_unlock (&bar->mutex1);
@@ -226,7 +228,7 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t
*bar,
break;
if (gen & BAR_TASK_PENDING)
{
- gomp_barrier_handle_tasks (state);
+ gomp_barrier_handle_tasks (state, true);
gen = __atomic_load_n (&bar->generation,
MEMMODEL_ACQUIRE);
if (gen & BAR_CANCELLED)
break;
@@ -251,9 +253,10 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t
*bar,
}
void
-gomp_team_barrier_wait (gomp_barrier_t *barrier)
+gomp_team_barrier_wait (gomp_barrier_t *barrier, unsigned id)
{
- gomp_team_barrier_wait_end (barrier, gomp_barrier_wait_start
(barrier));
+ gomp_team_barrier_wait_end (barrier, gomp_barrier_wait_start
(barrier, id),
+ id);
}
void
@@ -266,10 +269,10 @@ gomp_team_barrier_wake (gomp_barrier_t *bar, int
count)
}
bool
-gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar, unsigned id)
{
- gomp_barrier_state_t state = gomp_barrier_wait_cancel_start (bar);
- return gomp_team_barrier_wait_cancel_end (bar, state);
+ gomp_barrier_state_t state = gomp_barrier_wait_cancel_start (bar,
id);
+ return gomp_team_barrier_wait_cancel_end (bar, state, id);
}
void
diff --git a/libgomp/config/posix/bar.h b/libgomp/config/posix/bar.h
index 35b94e43ce2..928b12a14ff 100644
--- a/libgomp/config/posix/bar.h
+++ b/libgomp/config/posix/bar.h
@@ -62,20 +62,21 @@ extern void gomp_barrier_init (gomp_barrier_t *,
unsigned);
extern void gomp_barrier_reinit (gomp_barrier_t *, unsigned);
extern void gomp_barrier_destroy (gomp_barrier_t *);
-extern void gomp_barrier_wait (gomp_barrier_t *);
+extern void gomp_barrier_wait (gomp_barrier_t *, unsigned);
extern void gomp_barrier_wait_end (gomp_barrier_t *,
gomp_barrier_state_t);
-extern void gomp_team_barrier_wait (gomp_barrier_t *);
-extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
- gomp_barrier_state_t);
-extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern void gomp_team_barrier_wait (gomp_barrier_t *, unsigned);
+extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
gomp_barrier_state_t,
+ unsigned);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *,
unsigned);
extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
- gomp_barrier_state_t);
+ gomp_barrier_state_t,
unsigned);
extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
struct gomp_team;
extern void gomp_team_barrier_cancel (struct gomp_team *);
static inline gomp_barrier_state_t
-gomp_barrier_wait_start (gomp_barrier_t *bar)
+gomp_barrier_wait_start (gomp_barrier_t *bar,
+ unsigned id __attribute__ ((unused)))
{
unsigned int ret;
gomp_mutex_lock (&bar->mutex1);
@@ -86,7 +87,8 @@ gomp_barrier_wait_start (gomp_barrier_t *bar)
}
static inline gomp_barrier_state_t
-gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar,
+ unsigned id __attribute__ ((unused)))
{
unsigned int ret;
gomp_mutex_lock (&bar->mutex1);
@@ -99,9 +101,9 @@ gomp_barrier_wait_cancel_start (gomp_barrier_t
*bar)
}
static inline void
-gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+gomp_team_barrier_wait_final (gomp_barrier_t *bar, unsigned id)
{
- gomp_team_barrier_wait (bar);
+ gomp_team_barrier_wait (bar, id);
}
static inline bool
@@ -111,9 +113,9 @@ gomp_barrier_last_thread (gomp_barrier_state_t
state)
}
static inline void
-gomp_barrier_wait_last (gomp_barrier_t *bar)
+gomp_barrier_wait_last (gomp_barrier_t *bar, unsigned id)
{
- gomp_barrier_wait (bar);
+ gomp_barrier_wait (bar, id);
}
/* All the inlines below must be called with team->task_lock
@@ -150,7 +152,8 @@ gomp_team_barrier_cancelled (gomp_barrier_t *bar)
}
static inline void
-gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t
state)
+gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t
state,
+ unsigned use_cancel __attribute__ ((unused)))
{
/* Need the atomic store for acquire-release synchronisation with
the
load in `gomp_team_barrier_wait_{cancel_,}end`. See PR112356
*/
@@ -167,11 +170,81 @@ gomp_barrier_state_is_incremented
(gomp_barrier_state_t gen,
}
static inline bool
-gomp_barrier_has_completed (gomp_barrier_state_t state,
gomp_barrier_t *bar)
+gomp_barrier_has_completed (gomp_barrier_state_t state,
gomp_barrier_t *bar,
+ bool use_cancel __attribute__ ((unused)))
{
/* Handling overflow in the generation. The "next" state could be
less than
or greater than the current one. */
return gomp_barrier_state_is_incremented (bar->generation, state);
}
+/* Functions dummied out for this implementation. */
+static inline void
+gomp_barrier_prepare_reinit (gomp_barrier_t *bar __attribute__
((unused)),
+ unsigned id __attribute__ ((unused)))
+{}
+
+static inline void
+gomp_barrier_minimal_reinit (gomp_barrier_t *bar, unsigned nthreads,
+ unsigned num_new_threads __attribute__
((unused)))
+{
+ gomp_barrier_reinit (bar, nthreads);
+}
+
+static inline void
+gomp_barrier_reinit_1 (gomp_barrier_t *bar, unsigned nthreads,
+ unsigned num_new_threads,
+ unsigned long long *new_ids __attribute__
((unused)))
+{
+ if (num_new_threads)
+ {
+ gomp_mutex_lock (&bar->mutex1);
+ bar->total += num_new_threads;
+ gomp_mutex_unlock (&bar->mutex1);
+ }
+}
+
+static inline void
+gomp_barrier_reinit_2 (gomp_barrier_t *bar, unsigned nthreads)
+{
+ gomp_barrier_reinit (bar, nthreads);
+}
+
+static inline bool
+gomp_barrier_has_space (gomp_barrier_t *bar __attribute__ ((unused)),
+ unsigned nthreads __attribute__ ((unused)))
+{
+  /* Is there space to handle `nthreads`?  The only thing we need is to set
+     bar->total to `nthreads`, and we can always do that.  */
+ return true;
+}
+
+static inline void
+gomp_team_barrier_ensure_last (gomp_barrier_t *bar __attribute__
((unused)),
+ unsigned id __attribute__ ((unused)),
+ gomp_barrier_state_t state
+ __attribute__ ((unused)))
+{}
+
+static inline bool
+gomp_team_barrier_ensure_cancel_last (gomp_barrier_t *bar
+ __attribute__ ((unused)),
+ unsigned id __attribute__
((unused)),
+ gomp_barrier_state_t state
+ __attribute__ ((unused)))
+{
+  /* After BAR_WAS_LAST has been returned, actually ensure that this thread
+     is last.  Return `true` if this thread is known to be the last into the
+     barrier; return `false` if the barrier got cancelled such that not all
+     threads entered the barrier.
+
+     Since BAR_WAS_LAST is only set for a thread when that thread decremented
+     the `awaited` counter to zero, we know that all threads must have
+     entered the barrier.  Hence always return `true`.  */
+ return true;
+}
+
+static inline void
+gomp_reset_cancellable_primary_threadgen (gomp_barrier_t *bar,
unsigned id)
+{}
+
#endif /* GOMP_BARRIER_H */
diff --git a/libgomp/config/posix/simple-bar.h
b/libgomp/config/posix/simple-bar.h
index 7b4b7e43ea6..12abd0512e8 100644
--- a/libgomp/config/posix/simple-bar.h
+++ b/libgomp/config/posix/simple-bar.h
@@ -43,9 +43,18 @@ gomp_simple_barrier_init (gomp_simple_barrier_t
*bar, unsigned count)
}
static inline void
-gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned
count)
+gomp_simple_barrier_minimal_reinit (gomp_simple_barrier_t *bar,
+ unsigned nthreads, unsigned
num_new_threads)
{
- gomp_barrier_reinit (&bar->bar, count);
+ gomp_barrier_minimal_reinit (&bar->bar, nthreads, num_new_threads);
+}
+
+static inline void
+gomp_simple_barrier_reinit_1 (gomp_simple_barrier_t *bar, unsigned
nthreads,
+ unsigned num_new_threads,
+ unsigned long long *new_ids)
+{
+ gomp_barrier_reinit_1 (&bar->bar, nthreads, num_new_threads,
new_ids);
}
static inline void
@@ -55,15 +64,33 @@ gomp_simple_barrier_destroy (gomp_simple_barrier_t
*bar)
}
static inline void
-gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
+gomp_simple_barrier_wait (gomp_simple_barrier_t *bar, unsigned id)
+{
+ gomp_barrier_wait (&bar->bar, id);
+}
+
+static inline void
+gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar, unsigned
id)
{
- gomp_barrier_wait (&bar->bar);
+ gomp_barrier_wait_last (&bar->bar, id);
}
static inline void
-gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
+gomp_simple_barrier_prepare_reinit (gomp_simple_barrier_t *sbar,
unsigned id)
+{
+ gomp_barrier_prepare_reinit (&sbar->bar, id);
+}
+
+static inline void
+gomp_simple_barrier_reinit_2 (gomp_simple_barrier_t *sbar, unsigned
nthreads)
+{
+ gomp_barrier_reinit_2 (&sbar->bar, nthreads);
+}
+
+static inline bool
+gomp_simple_barrier_has_space (gomp_simple_barrier_t *sbar, unsigned
nthreads)
{
- gomp_barrier_wait_last (&bar->bar);
+ return gomp_barrier_has_space (&sbar->bar, nthreads);
}
#endif /* GOMP_SIMPLE_BARRIER_H */
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a43398300a5..e0459046bc9 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -59,6 +59,7 @@
#include <stdbool.h>
#include <stdlib.h>
#include <stdarg.h>
+#include <limits.h>
/* Needed for memset in priority_queue.c. */
#if _LIBGOMP_CHECKING_
@@ -200,6 +201,14 @@ extern void gomp_vfatal (const char *, va_list)
extern void gomp_fatal (const char *, ...)
__attribute__ ((noreturn, format (printf, 1, 2)));
+#if _LIBGOMP_CHECKING_
+#define gomp_assert(EXPR, MSG, ...)
\
+ if (!(EXPR))
\
+ gomp_fatal ("%s:%d " MSG, __FILE__, __LINE__, __VA_ARGS__)
+#else
+#define gomp_assert(...)
+#endif
+
struct gomp_task;
struct gomp_taskgroup;
struct htab;
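
[Editor's note] A standalone sketch (illustration only, not libgomp code) of
how a checking-only assert of this shape behaves; gomp_fatal is stood in for
by fprintf plus abort, and the body is wrapped in do/while for the demo.
With DEMO_CHECKING set to 0 the macro expands to nothing.

#include <stdio.h>
#include <stdlib.h>

#define DEMO_CHECKING 1

#if DEMO_CHECKING
#define demo_assert(EXPR, MSG, ...)                                \
  do                                                               \
    {                                                              \
      if (!(EXPR))                                                 \
	{                                                          \
	  fprintf (stderr, "%s:%d " MSG "\n", __FILE__, __LINE__,  \
		   __VA_ARGS__);                                   \
	  abort ();                                                \
	}                                                          \
    }                                                              \
  while (0)
#else
#define demo_assert(...)
#endif

int
main (void)
{
  unsigned id = 3;
  demo_assert (id < 8, "team id %u out of range", id);		/* passes */
  demo_assert (id == 0, "expected primary thread, got %u", id);	/* aborts */
  return 0;
}
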
@@ -1099,7 +1108,7 @@ extern unsigned gomp_dynamic_max_threads (void);
extern void gomp_init_task (struct gomp_task *, struct gomp_task *,
struct gomp_task_icv *);
extern void gomp_end_task (void);
-extern void gomp_barrier_handle_tasks (gomp_barrier_state_t);
+extern void gomp_barrier_handle_tasks (gomp_barrier_state_t, bool);
extern void gomp_task_maybe_wait_for_dependencies (void **);
extern bool gomp_create_target_task (struct gomp_device_descr *,
void (*) (void *), size_t, void
**,
diff --git a/libgomp/single.c b/libgomp/single.c
index 397501daeb3..abba2976586 100644
--- a/libgomp/single.c
+++ b/libgomp/single.c
@@ -77,7 +77,7 @@ GOMP_single_copy_start (void)
}
else
{
- gomp_team_barrier_wait (&thr->ts.team->barrier);
+ gomp_team_barrier_wait (&thr->ts.team->barrier, thr->ts.team_id);
ret = thr->ts.work_share->copyprivate;
gomp_work_share_end_nowait ();
@@ -98,7 +98,7 @@ GOMP_single_copy_end (void *data)
if (team != NULL)
{
thr->ts.work_share->copyprivate = data;
- gomp_team_barrier_wait (&team->barrier);
+ gomp_team_barrier_wait (&team->barrier, thr->ts.team_id);
}
gomp_work_share_end_nowait ();
diff --git a/libgomp/task.c b/libgomp/task.c
index 5965e781f7e..658b51c1fd2 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -1549,7 +1549,7 @@ gomp_task_run_post_remove_taskgroup (struct
gomp_task *child_task)
}
void
-gomp_barrier_handle_tasks (gomp_barrier_state_t state)
+gomp_barrier_handle_tasks (gomp_barrier_state_t state, bool
use_cancel)
{
struct gomp_thread *thr = gomp_thread ();
struct gomp_team *team = thr->ts.team;
@@ -1570,7 +1570,7 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t
state)
When `task_count == 0` we're not going to perform tasks anyway,
so the
problem of PR122314 is naturally avoided. */
if (team->task_count != 0
- && gomp_barrier_has_completed (state, &team->barrier))
+ && gomp_barrier_has_completed (state, &team->barrier,
use_cancel))
{
gomp_mutex_unlock (&team->task_lock);
return;
@@ -1580,7 +1580,7 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t
state)
{
if (team->task_count == 0)
{
- gomp_team_barrier_done (&team->barrier, state);
+ gomp_team_barrier_done (&team->barrier, state, use_cancel);
gomp_mutex_unlock (&team->task_lock);
gomp_team_barrier_wake (&team->barrier, 0);
return;
@@ -1617,7 +1617,7 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t
state)
else if (team->task_count == 0
 && gomp_team_barrier_waiting_for_tasks (&team->barrier))
{
- gomp_team_barrier_done (&team->barrier, state);
+ gomp_team_barrier_done (&team->barrier, state, use_cancel);
gomp_mutex_unlock (&team->task_lock);
gomp_team_barrier_wake (&team->barrier, 0);
if (to_free)
@@ -2243,7 +2243,7 @@ GOMP_taskgroup_end (void)
is #pragma omp target nowait that creates an implicit
team with a single thread. In this case, we want to wait
for all outstanding tasks in this team. */
- gomp_team_barrier_wait (&team->barrier);
+ gomp_team_barrier_wait (&team->barrier, thr->ts.team_id);
return;
}
@@ -2698,7 +2698,7 @@ GOMP_workshare_task_reduction_unregister (bool
cancelled)
htab_free ((struct htab *) data[5]);
if (!cancelled)
- gomp_team_barrier_wait (&team->barrier);
+ gomp_team_barrier_wait (&team->barrier, thr->ts.team_id);
}
int
diff --git a/libgomp/team.c b/libgomp/team.c
index cb1d3235312..512b1368af6 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -109,28 +109,28 @@ gomp_thread_start (void *xdata)
struct gomp_team *team = thr->ts.team;
struct gomp_task *task = thr->task;
- gomp_barrier_wait (&team->barrier);
+ gomp_barrier_wait (&team->barrier, thr->ts.team_id);
local_fn (local_data);
- gomp_team_barrier_wait_final (&team->barrier);
+ gomp_team_barrier_wait_final (&team->barrier, thr->ts.team_id);
gomp_finish_task (task);
- gomp_barrier_wait_last (&team->barrier);
+ gomp_barrier_wait_last (&team->barrier, thr->ts.team_id);
}
else
{
pool->threads[thr->ts.team_id] = thr;
- gomp_simple_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock, thr->ts.team_id);
do
{
struct gomp_team *team = thr->ts.team;
struct gomp_task *task = thr->task;
local_fn (local_data);
- gomp_team_barrier_wait_final (&team->barrier);
+ gomp_team_barrier_wait_final (&team->barrier, thr->ts.team_id);
gomp_finish_task (task);
- gomp_simple_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock, thr->ts.team_id);
local_fn = thr->fn;
local_data = thr->data;
@@ -243,7 +243,7 @@ gomp_free_pool_helper (void *thread_pool)
struct gomp_thread *thr = gomp_thread ();
struct gomp_thread_pool *pool
= (struct gomp_thread_pool *) thread_pool;
- gomp_simple_barrier_wait_last (&pool->threads_dock);
+ gomp_simple_barrier_wait_last (&pool->threads_dock, thr->ts.team_id);
gomp_sem_destroy (&thr->release);
thr->thread_pool = NULL;
thr->task = NULL;
@@ -278,10 +278,10 @@ gomp_free_thread (void *arg
__attribute__((unused)))
nthr->data = pool;
}
 /* This barrier undocks threads docked on pool->threads_dock.  */
- gomp_simple_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock, thr->ts.team_id);
/* And this waits till all threads have called
gomp_barrier_wait_last
in gomp_free_pool_helper. */
- gomp_simple_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock, thr->ts.team_id);
/* Now it is safe to destroy the barrier and free the pool.
*/
gomp_simple_barrier_destroy (&pool->threads_dock);
@@ -457,6 +457,14 @@ gomp_team_start (void (*fn) (void *), void *data,
unsigned nthreads,
else
bind = omp_proc_bind_false;
+ unsigned bits_per_ull = sizeof (unsigned long long) * CHAR_BIT;
+ int id_arr_len = ((nthreads + pool->threads_used) / bits_per_ull) +
1;
+ unsigned long long new_ids[id_arr_len];
+ for (int j = 0; j < id_arr_len; j++)
+ {
+ new_ids[j] = 0;
+ }
+
/* We only allow the reuse of idle threads for non-nested PARALLEL
regions. This appears to be implied by the semantics of
threadprivate variables, but perhaps that's reading too much
into
@@ -464,6 +472,11 @@ gomp_team_start (void (*fn) (void *), void *data,
unsigned nthreads,
only the initial program thread will modify gomp_threads. */
if (!nested)
{
+ /* The current thread is always re-used in the next team. */
+ unsigned total_reused = 1;
+ gomp_assert (team->prev_ts.team_id == 0,
+ "Starting a team from thread with id %u in previous
team\n",
+ team->prev_ts.team_id);
old_threads_used = pool->threads_used;
if (nthreads <= old_threads_used)
@@ -474,13 +487,7 @@ gomp_team_start (void (*fn) (void *), void *data,
unsigned nthreads,
gomp_simple_barrier_init (&pool->threads_dock, nthreads);
}
else
- {
- n = old_threads_used;
-
- /* Increase the barrier threshold to make sure all new
- threads arrive before the team is released. */
- gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
- }
+ n = old_threads_used;
/* Not true yet, but soon will be. We're going to release all
threads from the dock, and those that aren't part of the
@@ -502,6 +509,7 @@ gomp_team_start (void (*fn) (void *), void *data,
unsigned nthreads,
}
/* Release existing idle threads. */
+ bool have_prepared = false;
for (; i < n; ++i)
{
 unsigned int place_partition_off = thr->ts.place_partition_off;
@@ -643,7 +651,23 @@ gomp_team_start (void (*fn) (void *), void *data,
unsigned nthreads,
nthr->ts.team = team;
nthr->ts.work_share = &team->work_shares[0];
nthr->ts.last_work_share = NULL;
+	  /* If we're changing any thread's team_id then we need to wait for
+	     all other threads to have reached the barrier.  */
+ if (nthr->ts.team_id != i && !have_prepared)
+ {
+ gomp_simple_barrier_prepare_reinit (&pool->threads_dock,
+ thr->ts.team_id);
+ have_prepared = true;
+ }
nthr->ts.team_id = i;
+ {
+ unsigned idx = (i / bits_per_ull);
+ gomp_assert (!(new_ids[idx] & (1ULL << (i %
bits_per_ull))),
+ "new_ids[%u] == %llu (for `i` %u)", idx,
new_ids[idx],
+ i);
+ new_ids[idx] |= (1ULL << (i % bits_per_ull));
+ }
+ total_reused += 1;
nthr->ts.level = team->prev_ts.level + 1;
nthr->ts.active_level = thr->ts.active_level;
nthr->ts.place_partition_off = place_partition_off;
@@ -714,13 +738,54 @@ gomp_team_start (void (*fn) (void *), void
*data, unsigned nthreads,
}
break;
}
+ }
+ }
- /* Increase the barrier threshold to make sure all new
- threads and all the threads we're going to let die
- arrive before the team is released. */
- if (affinity_count)
- gomp_simple_barrier_reinit (&pool->threads_dock,
- nthreads +
affinity_count);
+      /* If we are changing the number of threads, *or* if we are starting
+	 new threads for any reason, then update the barrier accordingly.
+
+	 The handling of the barrier here differs between the different
+	 barrier designs.
+
+	 The `posix/bar.h` design needs to "grow" to accommodate the extra
+	 threads that we'll wait on, then "shrink" to the size we eventually
+	 want.
+
+	 The `linux/bar.h` design needs to assign positions for each thread.
+	 Some of the threads getting started will want the position of a
+	 thread that is currently running.  Hence we need to (1) serialise
+	 existing threads and then (2) set up barrier state for the incoming
+	 new threads.  Once this is done we don't need any equivalent of the
+	 "shrink" step later.  This does result in a longer period of
+	 serialisation than the posix/bar.h design, but that seems a fair
+	 trade-off to make for the design that is faster under contention.  */
+ if (old_threads_used != 0
+ && (nthreads != pool->threads_dock.bar.total || i <
nthreads))
+ {
+	  /* If all we've done is increase the number of threads that we
+	     want, we don't need to serialise anything (wake flags don't
+	     need to be adjusted).  */
+ if (nthreads > old_threads_used && affinity_count == 0
+ && total_reused == old_threads_used
+ /* `have_prepared` can be used to detect whether we re-
shuffled
+ any threads around. */
+ && !have_prepared
+ && gomp_simple_barrier_has_space (&pool->threads_dock,
nthreads))
+ gomp_simple_barrier_minimal_reinit (&pool->threads_dock,
nthreads,
+ nthreads -
old_threads_used);
+ else
+ {
+ /* Otherwise, we need to ensure that we've paused all
existing
+ threads (waiting on us to restart them) before
adjusting their
+ wake flags. */
+ if (!have_prepared)
+ gomp_simple_barrier_prepare_reinit (&pool->threads_dock,
+ thr->ts.team_id);
+ gomp_simple_barrier_reinit_1 (&pool->threads_dock,
nthreads,
+ nthreads <= total_reused
+ ? 0
+ : nthreads -
total_reused,
+ new_ids);
}
}
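
[Editor's note] The condition above that chooses between the cheap
minimal_reinit path and the full prepare_reinit/reinit_1 path is fiddly, so
here is a standalone sketch (illustration only; the demo_* names are
placeholders) of just that decision logic with two example scenarios.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the shape of the check in gomp_team_start: the cheap path is only
   valid when we purely grew the team, reused every old thread in place, did
   not shuffle any ids, and the barrier already has room.  */
static bool
demo_can_minimal_reinit (unsigned nthreads, unsigned old_threads_used,
			 unsigned affinity_count, unsigned total_reused,
			 bool have_prepared, bool has_space)
{
  return nthreads > old_threads_used
	 && affinity_count == 0
	 && total_reused == old_threads_used
	 && !have_prepared
	 && has_space;
}

int
main (void)
{
  /* Pure growth from 4 to 8 threads, nothing reshuffled: cheap path.  */
  printf ("grow 4->8: %d\n",
	  demo_can_minimal_reinit (8, 4, 0, 4, false, true));
  /* Same growth but an existing thread changed team_id: full path.  */
  printf ("grow with reshuffle: %d\n",
	  demo_can_minimal_reinit (8, 4, 0, 4, true, true));
  return 0;
}
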
@@ -868,9 +933,9 @@ gomp_team_start (void (*fn) (void *), void *data,
unsigned nthreads,
do_release:
if (nested)
- gomp_barrier_wait (&team->barrier);
+ gomp_barrier_wait (&team->barrier, thr->ts.team_id);
else
- gomp_simple_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock, thr->ts.team_id);
/* Decrease the barrier threshold to match the number of threads
that should arrive back at the end of this team. The extra
@@ -888,8 +953,7 @@ gomp_team_start (void (*fn) (void *), void *data,
unsigned nthreads,
if (affinity_count)
diff = -affinity_count;
- gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
-
+ gomp_simple_barrier_reinit_2 (&pool->threads_dock, nthreads);
#ifdef HAVE_SYNC_BUILTINS
__sync_fetch_and_add (&gomp_managed_threads, diff);
#else
@@ -949,12 +1013,13 @@ gomp_team_end (void)
{
struct gomp_thread *thr = gomp_thread ();
struct gomp_team *team = thr->ts.team;
+ unsigned team_id = thr->ts.team_id;
/* This barrier handles all pending explicit threads.
As #pragma omp cancel parallel might get awaited count in
team->barrier in a inconsistent state, we need to use a
different
counter here. */
- gomp_team_barrier_wait_final (&team->barrier);
+ gomp_team_barrier_wait_final (&team->barrier, thr->ts.team_id);
if (__builtin_expect (team->team_cancelled, 0))
{
struct gomp_work_share *ws = team->work_shares_to_free;
@@ -985,7 +1050,7 @@ gomp_team_end (void)
#endif
/* This barrier has gomp_barrier_wait_last counterparts
and ensures the team can be safely destroyed. */
- gomp_barrier_wait (&team->barrier);
+ gomp_barrier_wait (&team->barrier, team_id);
}
if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
@@ -1049,7 +1114,7 @@ gomp_pause_pool_helper (void *thread_pool)
struct gomp_thread *thr = gomp_thread ();
struct gomp_thread_pool *pool
= (struct gomp_thread_pool *) thread_pool;
- gomp_simple_barrier_wait_last (&pool->threads_dock);
+ gomp_simple_barrier_wait_last (&pool->threads_dock, thr->ts.team_id);
gomp_sem_destroy (&thr->release);
thr->thread_pool = NULL;
thr->task = NULL;
@@ -1081,10 +1146,10 @@ gomp_pause_host (void)
thrs[i] = gomp_thread_to_pthread_t (nthr);
}
 /* This barrier undocks threads docked on pool->threads_dock.  */
- gomp_simple_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock, thr->ts.team_id);
/* And this waits till all threads have called
gomp_barrier_wait_last
in gomp_pause_pool_helper. */
- gomp_simple_barrier_wait (&pool->threads_dock);
+ gomp_simple_barrier_wait (&pool->threads_dock, thr->ts.team_id);
/* Now it is safe to destroy the barrier and free the pool.
*/
gomp_simple_barrier_destroy (&pool->threads_dock);
diff --git a/libgomp/testsuite/libgomp.c/primary-thread-tasking.c
b/libgomp/testsuite/libgomp.c/primary-thread-tasking.c
new file mode 100644
index 00000000000..4b70dca30a2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/primary-thread-tasking.c
@@ -0,0 +1,80 @@
+/* Test to check that our primary thread can execute some tasks while waiting
+   for other threads.  This checks an edge case in a recent implementation of
+   the barrier tasking mechanism.
+   I don't believe there's any way to guarantee that a task will be run on a
+   given thread, so there is nowhere I can put an `abort` and say we failed.
+   However, I can set things up so that we time out if the primary thread is
+   not executing any tasks.  That timeout will at least count as a fail.
+
+   The idea is that we keep spawning tasks until one is handled by the
+   primary thread.  Meanwhile we give the secondary threads lots of
+   opportunities to sleep and let the primary thread take a task.  */
+/* { dg-do run { target *-linux-* } } */
+
+#define _GNU_SOURCE
+#include <omp.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/futex.h>
+#include <assert.h>
+#include <stdatomic.h>
+#include <limits.h>
+
+int wake_flag = 0;
+
+void
+continue_until_on_thread0 ()
+{
+ if (omp_get_thread_num () == 0)
+ {
+ __atomic_fetch_add (&wake_flag, 1, memory_order_relaxed);
+ syscall (SYS_futex, &wake_flag, FUTEX_WAKE |
FUTEX_PRIVATE_FLAG, INT_MAX);
+ }
+ else
+ {
+      /* If the flag has been set we are done.  Otherwise put another few
+	 tasks on the task queue.  */
+ if (__atomic_load_n (&wake_flag, memory_order_relaxed))
+ {
+ return;
+ }
+#pragma omp task
+ continue_until_on_thread0 ();
+#pragma omp task
+ continue_until_on_thread0 ();
+#pragma omp task
+ continue_until_on_thread0 ();
+ syscall (SYS_futex, &wake_flag, FUTEX_WAIT |
FUTEX_PRIVATE_FLAG, 0, NULL);
+ }
+}
+
+void
+foo ()
+{
+#pragma omp parallel
+ {
+ if (omp_get_thread_num () != 0)
+ {
+#pragma omp task
+ continue_until_on_thread0 ();
+#pragma omp task
+ continue_until_on_thread0 ();
+      /* Wait for the primary thread to have executed one of the tasks.  */
+ int val = __atomic_load_n (&wake_flag, memory_order_acquire);
+ while (val == 0)
+ {
+ syscall (SYS_futex, &wake_flag, FUTEX_WAIT |
FUTEX_PRIVATE_FLAG,
+ val, NULL);
+ val = __atomic_load_n (&wake_flag, memory_order_acquire);
+ }
+ }
+ }
+}
+
+int
+main ()
+{
+ foo ();
+ return 0;
+}
diff --git a/libgomp/work.c b/libgomp/work.c
index eae972ae2d1..b75b97da181 100644
--- a/libgomp/work.c
+++ b/libgomp/work.c
@@ -240,10 +240,14 @@ gomp_work_share_end (void)
return;
}
- bstate = gomp_barrier_wait_start (&team->barrier);
+ bstate = gomp_barrier_wait_start (&team->barrier, thr->ts.team_id);
if (gomp_barrier_last_thread (bstate))
{
+ /* For some targets the state returning "last" no longer
indicates that
+ we're *already* last, instead it indicates that we *should
be* last.
+ Perform the relevant synchronisation. */
+ gomp_team_barrier_ensure_last (&team->barrier, thr->ts.team_id,
bstate);
if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
{
team->work_shares_to_free = thr->ts.work_share;
@@ -251,7 +255,7 @@ gomp_work_share_end (void)
}
}
- gomp_team_barrier_wait_end (&team->barrier, bstate);
+ gomp_team_barrier_wait_end (&team->barrier, bstate, thr->ts.team_id);
thr->ts.last_work_share = NULL;
}
@@ -266,19 +270,27 @@ gomp_work_share_end_cancel (void)
gomp_barrier_state_t bstate;
/* Cancellable work sharing constructs cannot be orphaned. */
- bstate = gomp_barrier_wait_cancel_start (&team->barrier);
+ bstate = gomp_barrier_wait_cancel_start (&team->barrier, thr->ts.team_id);
if (gomp_barrier_last_thread (bstate))
{
- if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
+ if (gomp_team_barrier_ensure_cancel_last (&team->barrier, thr->ts.team_id,
+ bstate))
{
- team->work_shares_to_free = thr->ts.work_share;
- free_work_share (team, thr->ts.last_work_share);
+ if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
+ {
+ team->work_shares_to_free = thr->ts.work_share;
+ free_work_share (team, thr->ts.last_work_share);
+ }
}
+ else
+ gomp_reset_cancellable_primary_threadgen (&team->barrier,
+ thr->ts.team_id);
}
thr->ts.last_work_share = NULL;
- return gomp_team_barrier_wait_cancel_end (&team->barrier, bstate);
+ return gomp_team_barrier_wait_cancel_end (&team->barrier, bstate,
+ thr->ts.team_id);
}
/* The current thread is done with its current work sharing
construct.
--
2.43.0