Hi, 
our work on SPEChpc2021 benchmarks shows that, after the fix for PR99555 was
committed:
[libgomp, nvptx] Fix hang in gomp_team_barrier_wait_end
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=5ed77fb3ed1ee0289a0ec9499ef52b99b39421f1

while that patch fixed the hang, the new barrier code caused quite severe
performance regressions. Under OpenMP target offload mode, Minisweep regressed
by about 350%, while HPGMG-FV was about 2x slower.

So the problem is presumably the new barriers, which replaced erroneous but
fast bar.sync instructions with correct but rather heavy-weight
futex_wait/wake operations on the GPU. These are probably required to
preserve correct task vs. barrier behavior.

However, the observation is that when task-related functionality is not used
at all by the team inside an OpenMP target region, a barrier is just a place
to wait for all threads to rejoin (there is no need to restart waiting tasks),
and it can in that case be implemented with simple bar.sync and bar.arrive PTX
instructions. That should recover most of the performance in the cases that
usually matter, e.g. 'omp parallel for' inside 'omp target'.
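
For example, a minimal loop of this shape (hypothetical code, only for
illustration; 'a', 'b' and 'n' stand in for whatever the application uses)
never creates tasks, so every barrier it reaches could take that fast path:

  /* No tasking anywhere inside the target region; the implicit barrier at
     the end of the parallel region only has to wait for all threads to
     rejoin.  */
  #pragma omp target map(tofrom: a[0:n]) map(to: b[0:n])
  #pragma omp parallel for
  for (int i = 0; i < n; i++)
    a[i] += b[i];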

So the plan is to mark the cases where tasks are never used. This patch adds a
'task_never_used' flag inside struct gomp_team, initialized to true and set to
false when tasks are added to the team. The nvptx-specific
gomp_team_barrier_wait_end routines can then use the simple barrier when
team->task_never_used is still true at the barrier.
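
Condensed, the fast path looks roughly like this (a simplified sketch only;
'simple_team_barrier' is a hypothetical helper name, and the actual patch
below open-codes the check inside gomp_team_barrier_wait_end and
gomp_team_barrier_wait_cancel_end):

  /* Sketch: returns true if the plain hardware barrier could be used.  */
  static inline bool
  simple_team_barrier (struct gomp_team *team, gomp_barrier_t *bar,
                       gomp_barrier_state_t state)
  {
    if (!team->task_never_used)
      return false;  /* Caller falls back to the futex-based protocol.  */
    if (bar->total > 1)
      {
        if (state & BAR_WAS_LAST)
          /* Last arriver: report arrival without blocking.  */
          asm ("bar.arrive 1, %0;" : : "r" (32 * bar->total));
        else
          /* Other threads: block until the whole team has arrived.  */
          asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
      }
    return true;
  }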

Some other cases, like the master/masked and single constructs, also need
task_never_used set to false: because these constructs inherently create
asymmetric execution where only a subset of threads runs through the region
(which may or may not use tasking), different threads could otherwise arrive
at the closing barrier assuming different task_never_used states. For
correctness, these constructs must conservatively mark team->task_never_used
false at the start of the construct.
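
As a concrete illustration of the hazard (made-up code; 'do_something' is just
a placeholder):

  #pragma omp parallel
  {
    #pragma omp master
    {
      #pragma omp task          /* created by the master thread only */
        do_something ();
    }
    /* Implicit barrier at the end of the parallel region: the non-master
       threads may reach it before the master has created the task, so
       without the conservative marking they could pick the plain bar.sync
       path while the master ends up needing the task-aware protocol.  */
  }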

This patch has been divided into two: the first inlines the contents of
config/linux/bar.c into config/nvptx/bar.c (instead of an #include), which is
needed because some parts of gomp_team_barrier_wait_[cancel_]end now need
nvptx-specific adjustments. The second contains the changes described above.

Tested on powerpc64le-linux and x86_64-linux with nvptx offloading, seeking 
approval for trunk.

Thanks,
Chung-Lin

From c2fdc31880d2d040822e8abece015c29a6d7b472 Mon Sep 17 00:00:00 2001
From: Chung-Lin Tang <clt...@codesourcery.com>
Date: Thu, 1 Sep 2022 05:53:49 -0700
Subject: [PATCH 1/2] libgomp: inline config/linux/bar.c into
 config/nvptx/bar.c

In preparation for adding nvptx-specific modifications to
gomp_team_barrier_wait_end et al., change config/nvptx/bar.c from an #include
of config/linux/bar.c to a full copy of the implementation.

2022-09-01  Chung-Lin Tang  <clt...@codesourcery.com>

libgomp/ChangeLog:

        * config/nvptx/bar.c: Replace include of "../linux/bar.c" with an
        inlined copy of the contents of config/linux/bar.c.
---
 libgomp/config/nvptx/bar.c | 183 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 3 deletions(-)

diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index eee2107..a850c22 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -161,6 +161,183 @@ static inline void do_wait (int *addr, int val)
     futex_wait (addr, val);
 }
 
-/* Reuse the linux implementation.  */
-#define GOMP_WAIT_H 1
-#include "../linux/bar.c"
+/* Below is based on the linux implementation.  */
+
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+                       MEMMODEL_RELEASE);
+      futex_wake ((int *) &bar->generation, INT_MAX);
+    }
+  else
+    {
+      do
+       do_wait ((int *) &bar->generation, state);
+      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) == state);
+    }
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that if the encountering thread
+   is not the last one to hit the barrier, it returns immediately.
+   The intended usage is that a thread which intends to gomp_barrier_destroy
+   this barrier calls gomp_barrier_wait, while all other threads
+   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
+   the barrier can be safely destroyed.  */
+
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
+  if (state & BAR_WAS_LAST)
+    gomp_barrier_wait_end (bar, state);
+}
+
+void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+  futex_wake ((int *) &bar->generation, count == 0 ? INT_MAX : count);
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+       {
+         gomp_barrier_handle_tasks (state);
+         state &= ~BAR_WAS_LAST;
+       }
+      else
+       {
+         state &= ~BAR_CANCELLED;
+         state += BAR_INCR - BAR_WAS_LAST;
+         __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+         futex_wake ((int *) &bar->generation, INT_MAX);
+         return;
+       }
+    }
+
+  generation = state;
+  state &= ~BAR_CANCELLED;
+  do
+    {
+      do_wait ((int *) &bar->generation, generation);
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+       {
+         gomp_barrier_handle_tasks (state);
+         gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+       }
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+                                  gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      /* BAR_CANCELLED should never be set in state here, because
+        cancellation means that at least one of the threads has been
+        cancelled, thus on a cancellable barrier we should never see
+        all threads to arrive.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+       {
+         gomp_barrier_handle_tasks (state);
+         state &= ~BAR_WAS_LAST;
+       }
+      else
+       {
+         state += BAR_INCR - BAR_WAS_LAST;
+         __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+         futex_wake ((int *) &bar->generation, INT_MAX);
+         return false;
+       }
+    }
+
+  if (__builtin_expect (state & BAR_CANCELLED, 0))
+    return true;
+
+  generation = state;
+  do
+    {
+      do_wait ((int *) &bar->generation, generation);
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_CANCELLED, 0))
+       return true;
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+       {
+         gomp_barrier_handle_tasks (state);
+         gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+       }
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  futex_wake ((int *) &team->barrier.generation, INT_MAX);
+}
-- 
2.8.1

From 2a621905bb91475e792ee1be9f06ea6145df0bc2 Mon Sep 17 00:00:00 2001
From: Chung-Lin Tang <clt...@codesourcery.com>
Date: Thu, 1 Sep 2022 07:04:42 -0700
Subject: [PATCH 2/2] openmp/nvptx: use bar.sync/arrive for barriers when
 tasking is not used

The nvptx implementation of futex_wait/wake operations, while it enables
OpenMP task behavior in nvptx offloaded regions, can cause quite significant
performance regressions on some benchmarks.

However, when task-related functionality is not used at all by the team inside
an OpenMP target region, a barrier is just a place to wait for all threads to
rejoin (there is no need to restart waiting tasks), and it can be implemented
with simple bar.sync and bar.arrive PTX instructions, bypassing the
heavy-weight futex-based protocol.

This patch adds a 'task_never_used' flag inside struct gomp_team, initialized
to true and set to false when tasks are added to the team. The nvptx-specific
gomp_team_barrier_wait_end routines can then use the simple barrier when
team->task_never_used is still true at the barrier.

Some other cases, like the master/masked and single constructs, also need
task_never_used set to false: because these constructs inherently create
asymmetric execution where only a subset of threads runs through the region
(which may or may not use tasking), different threads could otherwise arrive
at the closing barrier assuming different task_never_used states. For
correctness, these constructs must conservatively mark team->task_never_used
false.

2022-09-01  Chung-Lin Tang  <clt...@codesourcery.com>

gcc/ChangeLog:

        * omp-builtins.def (BUILT_IN_GOMP_TASK_SET_USED): Add new omp builtin.
        * omp-low.cc (lower_omp_master): Add call to GOMP_task_set_used at
        start of generated code sequence.

libgomp/ChangeLog:

        * libgomp.h (struct gomp_team): Add 'bool task_never_used' field.
        * libgomp.map (GOMP_5.1.1): Add GOMP_task_set_used.
        * single.c (GOMP_single_start): Set team->task_never_used to false.
        (GOMP_single_copy_start): Likewise.
        * task.c (GOMP_task_set_used): New function to set team->task_never_used
        to false.
        (GOMP_task): Set team->task_never_used to false.
        (gomp_create_target_task): Likewise.
        (gomp_task_run_post_handle_dependers): Likewise.
        * team.c (gomp_new_team): Initialize team->task_never_used to true.

        * config/nvptx/bar.c (gomp_team_barrier_wait_end):
        When team->task_never_used is true, use PTX bar.sync/bar.arrive
        instructions to implement a simple barrier wait.
        (gomp_team_barrier_wait_cancel_end): Likewise.
---
 gcc/omp-builtins.def       |  2 ++
 gcc/omp-low.cc             |  7 ++++++-
 libgomp/config/nvptx/bar.c | 52 ++++++++++++++++++++++++++++++++++++----------
 libgomp/libgomp.h          |  2 ++
 libgomp/libgomp.map        |  1 +
 libgomp/single.c           | 11 ++++++++--
 libgomp/task.c             | 13 ++++++++++++
 libgomp/team.c             |  1 +
 8 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def
index ee5213e..44a15e6 100644
--- a/gcc/omp-builtins.def
+++ b/gcc/omp-builtins.def
@@ -98,6 +98,8 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKGROUP_START, "GOMP_taskgroup_start",
                  BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKGROUP_END, "GOMP_taskgroup_end",
                  BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASK_SET_USED, "GOMP_task_set_used",
+                 BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_CANCEL, "GOMP_cancel",
                  BT_FN_BOOL_INT_BOOL, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_CANCELLATION_POINT, "GOMP_cancellation_point",
diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index f54dea9..fd896ca 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -9035,11 +9035,16 @@ lower_omp_master (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   gsi_replace (gsi_p, bind, true);
   gimple_bind_add_stmt (bind, stmt);
 
+  tseq = NULL;
+
+  bfn_decl = builtin_decl_explicit (BUILT_IN_GOMP_TASK_SET_USED);
+  x = build_call_expr_loc (loc, bfn_decl, 0);
+  gimplify_and_add (x, &tseq);
+
   bfn_decl = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM);
   x = build_call_expr_loc (loc, bfn_decl, 0);
   x = build2 (EQ_EXPR, boolean_type_node, x, filter);
   x = build3 (COND_EXPR, void_type_node, x, NULL, build_and_jump (&lab));
-  tseq = NULL;
   gimplify_and_add (x, &tseq);
   gimple_bind_add_seq (bind, tseq);
 
diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index a850c22..02781dd 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -212,13 +212,13 @@ gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
 void
 gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
   unsigned int generation, gen;
 
   if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
-      struct gomp_thread *thr = gomp_thread ();
-      struct gomp_team *team = thr->ts.team;
 
       bar->awaited = bar->total;
       team->work_share_cancelled = 0;
@@ -229,14 +229,28 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
        }
       else
        {
-         state &= ~BAR_CANCELLED;
-         state += BAR_INCR - BAR_WAS_LAST;
-         __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-         futex_wake ((int *) &bar->generation, INT_MAX);
+         if (team->task_never_used)
+           {
+             if (bar->total > 1)
+               asm ("bar.arrive 1, %0;" : : "r" (32 * bar->total));
+           }
+         else
+           {
+             state &= ~BAR_CANCELLED;
+             state += BAR_INCR - BAR_WAS_LAST;
+             __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+             futex_wake ((int *) &bar->generation, INT_MAX);
+           }
          return;
        }
     }
 
+  if (team->task_never_used && bar->total > 1)
+    {
+      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      return;
+    }
+
   generation = state;
   state &= ~BAR_CANCELLED;
   do
@@ -272,6 +286,8 @@ bool
 gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
                                   gomp_barrier_state_t state)
 {
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
   unsigned int generation, gen;
 
   if (__builtin_expect (state & BAR_WAS_LAST, 0))
@@ -281,8 +297,6 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
         cancellation means that at least one of the threads has been
         cancelled, thus on a cancellable barrier we should never see
         all threads to arrive.  */
-      struct gomp_thread *thr = gomp_thread ();
-      struct gomp_team *team = thr->ts.team;
 
       bar->awaited = bar->total;
       team->work_share_cancelled = 0;
@@ -293,15 +307,31 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
        }
       else
        {
-         state += BAR_INCR - BAR_WAS_LAST;
-         __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-         futex_wake ((int *) &bar->generation, INT_MAX);
+         if (team->task_never_used)
+           {
+             if (bar->total > 1)
+               asm ("bar.arrive 1, %0;" : : "r" (32 * bar->total));
+           }
+         else
+           {
+             state += BAR_INCR - BAR_WAS_LAST;
+             __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+             futex_wake ((int *) &bar->generation, INT_MAX);
+           }
          return false;
        }
     }
 
   if (__builtin_expect (state & BAR_CANCELLED, 0))
     return true;
+  else
+    {
+      if (team->task_never_used && bar->total > 1)
+       {
+         asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+         return false;
+       }
+    }
 
   generation = state;
   do
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index c243c4d..37e9600 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -725,6 +725,8 @@ struct gomp_team
   gomp_mutex_t task_lock;
   /* Scheduled tasks.  */
   struct priority_queue task_queue;
+  /* True if tasks haven't been used by this team.  */
+  bool task_never_used;
   /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team.  */
   unsigned int task_count;
   /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled.  */
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 46d5f10..1971891 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -413,6 +413,7 @@ GOMP_5.1 {
 GOMP_5.1.1 {
   global:
        GOMP_taskwait_depend_nowait;
+       GOMP_task_set_used;
 } GOMP_5.1;
 
 OACC_2.0 {
diff --git a/libgomp/single.c b/libgomp/single.c
index 79a3f8e..698e35a 100644
--- a/libgomp/single.c
+++ b/libgomp/single.c
@@ -35,18 +35,21 @@
 bool
 GOMP_single_start (void)
 {
-#ifdef HAVE_SYNC_BUILTINS
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
+#ifdef HAVE_SYNC_BUILTINS
   unsigned long single_count;
 
   if (__builtin_expect (team == NULL, 0))
     return true;
 
+  team->task_never_used = false;
+
   single_count = thr->ts.single_count++;
   return __sync_bool_compare_and_swap (&team->single_count, single_count,
                                       single_count + 1L);
 #else
+  team->task_never_used = false;
   bool ret = gomp_work_share_start (0);
   if (ret)
     gomp_work_share_init_done ();
@@ -64,12 +67,16 @@ void *
 GOMP_single_copy_start (void)
 {
   struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  if (team != NULL)
+    team->task_never_used = false;
 
   bool first;
   void *ret;
 
   first = gomp_work_share_start (0);
-  
+
   if (first)
     {
       gomp_work_share_init_done ();
diff --git a/libgomp/task.c b/libgomp/task.c
index 30cd046..52a8ea1 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -726,6 +726,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
 
       ++team->task_count;
       ++team->task_queued_count;
+      team->task_never_used = false;
       gomp_team_barrier_set_task_pending (&team->barrier);
       do_wake = team->task_running_count + !parent->in_tied_task
                < team->nthreads;
@@ -740,6 +741,15 @@ ialias (GOMP_taskgroup_start)
 ialias (GOMP_taskgroup_end)
 ialias (GOMP_taskgroup_reduction_register)
 
+void
+GOMP_task_set_used (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  if (team)
+    team->task_never_used = false;
+}
+
 #define TYPE long
 #define UTYPE unsigned long
 #define TYPE_is_long 1
@@ -1053,6 +1063,7 @@ gomp_create_target_task (struct gomp_device_descr *devicep,
       task->pnode[PQ_TEAM].prev = NULL;
       task->kind = GOMP_TASK_TIED;
       ++team->task_count;
+      team->task_never_used = false;
       gomp_mutex_unlock (&team->task_lock);
 
       thr->task = task;
@@ -1086,6 +1097,7 @@ gomp_create_target_task (struct gomp_device_descr *devicep,
                         task->parent_depends_on);
   ++team->task_count;
   ++team->task_queued_count;
+  team->task_never_used = false;
   gomp_team_barrier_set_task_pending (&team->barrier);
   do_wake = team->task_running_count + !parent->in_tied_task
            < team->nthreads;
@@ -1459,6 +1471,7 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
                             task->parent_depends_on);
       ++team->task_count;
       ++team->task_queued_count;
+      team->task_never_used = false;
       ++ret;
     }
   free (child_task->dependers);
diff --git a/libgomp/team.c b/libgomp/team.c
index cb6875d..9e29eb5 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -211,6 +211,7 @@ gomp_new_team (unsigned nthreads)
   team->ordered_release[0] = &team->master_release;
 
   priority_queue_init (&team->task_queue);
+  team->task_never_used = true;
   team->task_count = 0;
   team->task_queued_count = 0;
   team->task_running_count = 0;
-- 
2.8.1
