Hi,
This patch adds a thread barrier after worker loops for OpenACC, in
accordance with OpenACC 2.0a section 2.7.3 (worker loops): "All workers
will complete execution of their assigned iterations before any worker
proceeds beyond the end of the loop." (This is quite target-specific:
work to alleviate that is still ongoing.)
Barriers are "special" in that they should not be cloned or subject to
excessive code motion: to that end, barriers placed after loops have
their (outgoing) edge set to EDGE_ABNORMAL. That seems to suffice to
keep the barriers in the right places.
This passes libgomp testing when applied on gomp4 branch, and fixes the
previously-broken worker-partn-5.c and worker-partn-6.c tests, on top
of my previous patches:
https://gcc.gnu.org/ml/gcc-patches/2015-05/msg02612.html
https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00307.html
(ping!), but unfortunately (again, with the above patches) appears to
interact badly with Cesar's patch for vector state propagation:
https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00371.html
I haven't yet investigated why (I reverted that patch in my local
series in order to test the attached patch).
FYI,
Julian
ChangeLog
gcc/
* omp-low.c (build_oacc_threadbarrier): New function.
(oacc_loop_needs_threadbarrier_p): New function.
(expand_omp_for_static_nochunk, expand_omp_for_static_chunk):
Insert threadbarrier after worker loops.
(find_omp_for_region_data): Rename to...
(find_omp_for_region_gwv): ...this.  Return mask, rather than modifying
REGION structure.
(build_omp_regions_1): Move modification of REGION structure to
here, after calling above function with new name.
(generate_oacc_broadcast): Use new build_oacc_threadbarrier
function.
(make_gimple_omp_edges): Make edges out of OpenACC worker loop exit
block abnormal.
* tree-ssa-alias.c (ref_maybe_used_by_call_p_1): Add
BUILT_IN_GOACC_THREADBARRIER.
libgomp/
* testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c: Remove
XFAIL.
* testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c: Likewise.
commit e46fbc68b7bc7e705417475fcfb8e203056b5a51
Author: Julian Brown <jul...@codesourcery.com>
Date: Fri Jun 5 10:01:01 2015 -0700
Threadbarrier after worker and vector loops.
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 55a2a12..45ff05a 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -3691,6 +3691,15 @@ build_omp_barrier (tree lhs)
return g;
}
+/* Build a call to GOACC_threadbarrier. */
+
+static gcall *
+build_oacc_threadbarrier (void)
+{
+ tree fndecl = builtin_decl_explicit (BUILT_IN_GOACC_THREADBARRIER);
+ return gimple_build_call (fndecl, 0);
+}
+
/* If a context was created for STMT when it was scanned, return it. */
static omp_context *
@@ -7181,6 +7190,20 @@ expand_omp_for_generic (struct omp_region *region,
}
+/* True if a barrier is needed after a loop partitioned over
+ gangs/workers/vectors as specified by GWV_BITS. OpenACC semantics specify
+ that a (conceptual) barrier is needed after worker and vector-partitioned
+ loops, but not after gang-partitioned loops. Currently we are relying on
+ warp reconvergence to synchronise threads within a warp after vector loops,
+ so an explicit barrier is not helpful after those. */
+
+static bool
+oacc_loop_needs_threadbarrier_p (int gwv_bits)
+{
+ return (gwv_bits & (MASK_GANG | MASK_WORKER)) == MASK_WORKER;
+}
+
+
/* A subroutine of expand_omp_for. Generate code for a parallel
loop with static schedule and no specified chunk size. Given
parameters:
@@ -7523,7 +7546,11 @@ expand_omp_for_static_nochunk (struct omp_region *region,
{
t = gimple_omp_return_lhs (gsi_stmt (gsi));
if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
- gcc_checking_assert (t == NULL_TREE);
+ {
+ gcc_checking_assert (t == NULL_TREE);
+ if (oacc_loop_needs_threadbarrier_p (region->gwv_this))
+ gsi_insert_after (&gsi, build_oacc_threadbarrier (), GSI_SAME_STMT);
+ }
else
gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
}
@@ -7956,7 +7983,11 @@ expand_omp_for_static_chunk (struct omp_region *region,
{
t = gimple_omp_return_lhs (gsi_stmt (gsi));
if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
- gcc_checking_assert (t == NULL_TREE);
+ {
+ gcc_checking_assert (t == NULL_TREE);
+ if (oacc_loop_needs_threadbarrier_p (region->gwv_this))
+ gsi_insert_after (&gsi, build_oacc_threadbarrier (), GSI_SAME_STMT);
+ }
else
gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
}
@@ -10270,22 +10301,26 @@ expand_omp (struct omp_region *region)
/* Map each basic block to an omp_region. */
static hash_map<basic_block, omp_region *> *bb_region_map;
-/* Fill in additional data for a region REGION associated with an
+/* Return a mask of GWV bits for region REGION associated with an
OMP_FOR STMT. */
-static void
-find_omp_for_region_data (struct omp_region *region, gimple stmt)
+static int
+find_omp_for_region_gwv (gimple stmt)
{
+ int tmp = 0;
+
if (!is_gimple_omp_oacc (stmt))
- return;
+ return 0;
tree clauses = gimple_omp_for_clauses (stmt);
if (find_omp_clause (clauses, OMP_CLAUSE_GANG))
- region->gwv_this |= MASK_GANG;
+ tmp |= MASK_GANG;
if (find_omp_clause (clauses, OMP_CLAUSE_WORKER))
- region->gwv_this |= MASK_WORKER;
+ tmp |= MASK_WORKER;
if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR))
- region->gwv_this |= MASK_VECTOR;
+ tmp |= MASK_VECTOR;
+
+ return tmp;
}
/* Fill in additional data for a region REGION associated with an
@@ -10391,7 +10426,7 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
}
}
else if (code == GIMPLE_OMP_FOR)
- find_omp_for_region_data (region, stmt);
+ region->gwv_this = find_omp_for_region_gwv (stmt);
/* ..., this directive becomes the parent for a new region. */
if (region)
parent = region;
@@ -10560,9 +10595,7 @@ generate_oacc_broadcast (omp_region *region, tree dest_var, tree var,
gassign *st = gimple_build_assign (build_simple_mem_ref (ptr), var);
gsi_insert_after (&where, st, GSI_NEW_STMT);
- tree fndecl = builtin_decl_explicit (BUILT_IN_GOACC_THREADBARRIER);
- gcall *sync_bar = gimple_build_call (fndecl, 0);
- gsi_insert_after (&where, sync_bar, GSI_NEW_STMT);
+ gsi_insert_after (&where, build_oacc_threadbarrier (), GSI_NEW_STMT);
gassign *cast2 = gimple_build_assign (ptr, NOP_EXPR,
parent->broadcast_array);
@@ -13877,11 +13910,29 @@ make_gimple_omp_edges (basic_block bb, struct omp_region **region,
break;
case GIMPLE_OMP_RETURN:
- /* In the case of a GIMPLE_OMP_SECTION, the edge will go
- somewhere other than the next block. This will be
- created later. */
cur_region->exit = bb;
- fallthru = cur_region->type != GIMPLE_OMP_SECTION;
+ gimple for_stmt;
+ if (cur_region->type == GIMPLE_OMP_FOR
+ && gimple_omp_for_kind (as_a <gomp_for *>
+ ((for_stmt = last_stmt (cur_region->entry))))
+ == GF_OMP_FOR_KIND_OACC_LOOP)
+ {
+ /* Called before OMP expansion, so this information has not been
+ recorded in cur_region->gwv_this yet. */
+ int gwv_bits = find_omp_for_region_gwv (for_stmt);
+ if (oacc_loop_needs_threadbarrier_p (gwv_bits))
+ {
+ make_edge (bb, bb->next_bb, EDGE_FALLTHRU | EDGE_ABNORMAL);
+ fallthru = false;
+ }
+ else
+ fallthru = true;
+ }
+ else
+ /* In the case of a GIMPLE_OMP_SECTION, the edge will go
+ somewhere other than the next block. This will be
+ created later. */
+ fallthru = cur_region->type != GIMPLE_OMP_SECTION;
cur_region = cur_region->outer;
break;
diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c
index fa6caef..249ac45 100644
--- a/gcc/tree-ssa-alias.c
+++ b/gcc/tree-ssa-alias.c
@@ -1764,6 +1764,7 @@ ref_maybe_used_by_call_p_1 (gcall *call, ao_ref *ref)
case BUILT_IN_GOMP_ATOMIC_END:
case BUILT_IN_GOMP_BARRIER:
case BUILT_IN_GOMP_BARRIER_CANCEL:
+ case BUILT_IN_GOACC_THREADBARRIER:
case BUILT_IN_GOMP_TASKWAIT:
case BUILT_IN_GOMP_TASKGROUP_END:
case BUILT_IN_GOMP_CRITICAL_START:
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
index fc66b04..a1d2a72 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
@@ -1,5 +1,3 @@
-/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
-
#include <assert.h>
/* Test correct synchronisation between worker-partitioned loops. */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
index 0f3a1d7..cfbcd17 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
@@ -1,5 +1,3 @@
-/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
-
#include <assert.h>
/* Test correct synchronisation between worker+vector-partitioned loops. */