On 09/11/15 16:35, Tom de Vries wrote:
Hi,
this patch series for stage1 trunk adds support to:
- parallelize oacc kernels regions using parloops, and
- map the loops onto the oacc gang dimension.
The patch series contains these patches:
1 Insert new exit block only when needed in
transform_to_exit_first_loop_alt
2 Make create_parallel_loop return void
3 Ignore reduction clause on kernels directive
4 Implement -foffload-alias
5 Add in_oacc_kernels_region in struct loop
6 Add pass_oacc_kernels
7 Add pass_dominator_oacc_kernels
8 Add pass_ch_oacc_kernels
9 Add pass_parallelize_loops_oacc_kernels
10 Add pass_oacc_kernels pass group in passes.def
11 Update testcases after adding kernels pass group
12 Handle acc loop directive
13 Add c-c++-common/goacc/kernels-*.c
14 Add gfortran.dg/goacc/kernels-*.f95
15 Add libgomp.oacc-c-c++-common/kernels-*.c
16 Add libgomp.oacc-fortran/kernels-*.f95
The first 9 patches are more or less independent, but patches 10-16 are
intended to be committed at the same time.
Bootstrapped and reg-tested on x86_64.
Built and reg-tested with an nvidia accelerator, in combination with a
patch that enables accelerator testing (which is submitted at
https://gcc.gnu.org/ml/gcc-patches/2015-10/msg01771.html ).
I'll post the individual patches in reply to this message.
this patch deals with loops in an oacc kernels region which are
annotated using "#pragma acc loop". It expands such a loop as a normal
loop, which has the effect of ignoring the "#pragma acc loop".
Thanks,
- Tom
Handle acc loop directive
2015-11-09 Tom de Vries <t...@codesourcery.com>
* omp-low.c (struct omp_region): Add inside_kernels_p field.
(expand_omp_for_generic): Only set address taken for istart0
and iend0 when necessary. Adjust to generate a 'sequential' loop
when the GOMP builtin arguments are BUILT_IN_NONE.
(expand_omp_for): Use expand_omp_for_generic() to generate a
non-parallelized loop for OMP_FORs inside OpenACC kernels regions.
(expand_omp): Mark inside_kernels_p field true for regions
nested inside OpenACC kernels constructs.
---
gcc/omp-low.c | 127 ++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 87 insertions(+), 40 deletions(-)
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 1283cc7..859a2eb 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -136,6 +136,9 @@ struct omp_region
/* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has
a depend clause. */
gomp_ordered *ord_stmt;
+
+ /* True if this is nested inside an OpenACC kernels construct. */
+ bool inside_kernels_p;
};
/* Context structure. Used to store information about each parallel
@@ -8238,6 +8241,7 @@ expand_omp_for_generic (struct omp_region *region,
gassign *assign_stmt;
bool in_combined_parallel = is_combined_parallel (region);
bool broken_loop = region->cont == NULL;
+ bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE);
edge e, ne;
tree *counts = NULL;
int i;
@@ -8335,8 +8339,12 @@ expand_omp_for_generic (struct omp_region *region,
type = TREE_TYPE (fd->loop.v);
istart0 = create_tmp_var (fd->iter_type, ".istart0");
iend0 = create_tmp_var (fd->iter_type, ".iend0");
- TREE_ADDRESSABLE (istart0) = 1;
- TREE_ADDRESSABLE (iend0) = 1;
+
+ if (!seq_loop)
+ {
+ TREE_ADDRESSABLE (istart0) = 1;
+ TREE_ADDRESSABLE (iend0) = 1;
+ }
/* See if we need to bias by LLONG_MIN. */
if (fd->iter_type == long_long_unsigned_type_node
@@ -8366,7 +8374,20 @@ expand_omp_for_generic (struct omp_region *region,
gsi_prev (&gsif);
tree arr = NULL_TREE;
- if (in_combined_parallel)
+ if (seq_loop)
+ {
+ tree n1 = fold_convert (fd->iter_type, fd->loop.n1);
+ tree n2 = fold_convert (fd->iter_type, fd->loop.n2);
+
+ assign_stmt = gimple_build_assign (istart0, n1);
+ gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+ assign_stmt = gimple_build_assign (iend0, n2);
+ gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+ t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0);
+ }
+ else if (in_combined_parallel)
{
gcc_assert (fd->ordered == 0);
/* In a combined parallel loop, emit a call to
@@ -8788,39 +8809,45 @@ expand_omp_for_generic (struct omp_region *region,
collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb);
/* Emit code to get the next parallel iteration in L2_BB. */
- gsi = gsi_start_bb (l2_bb);
+ if (!seq_loop)
+ {
+ gsi = gsi_start_bb (l2_bb);
- t = build_call_expr (builtin_decl_explicit (next_fn), 2,
- build_fold_addr_expr (istart0),
- build_fold_addr_expr (iend0));
- t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
- false, GSI_CONTINUE_LINKING);
- if (TREE_TYPE (t) != boolean_type_node)
- t = fold_build2 (NE_EXPR, boolean_type_node,
- t, build_int_cst (TREE_TYPE (t), 0));
- gcond *cond_stmt = gimple_build_cond_empty (t);
- gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+ t = build_call_expr (builtin_decl_explicit (next_fn), 2,
+ build_fold_addr_expr (istart0),
+ build_fold_addr_expr (iend0));
+ t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+ false, GSI_CONTINUE_LINKING);
+ if (TREE_TYPE (t) != boolean_type_node)
+ t = fold_build2 (NE_EXPR, boolean_type_node,
+ t, build_int_cst (TREE_TYPE (t), 0));
+ gcond *cond_stmt = gimple_build_cond_empty (t);
+ gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+ }
}
/* Add the loop cleanup function. */
gsi = gsi_last_bb (exit_bb);
- if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
- t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
- else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
- t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
- else
- t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
- gcall *call_stmt = gimple_build_call (t, 0);
- if (gimple_omp_return_lhs (gsi_stmt (gsi)))
- gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
- gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
- if (fd->ordered)
+ if (!seq_loop)
{
- tree arr = counts[fd->ordered];
- tree clobber = build_constructor (TREE_TYPE (arr), NULL);
- TREE_THIS_VOLATILE (clobber) = 1;
- gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
- GSI_SAME_STMT);
+ if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
+ t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
+ else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+ t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
+ else
+ t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
+ gcall *call_stmt = gimple_build_call (t, 0);
+ if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+ gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
+ gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
+ if (fd->ordered)
+ {
+ tree arr = counts[fd->ordered];
+ tree clobber = build_constructor (TREE_TYPE (arr), NULL);
+ TREE_THIS_VOLATILE (clobber) = 1;
+ gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
+ GSI_SAME_STMT);
+ }
}
gsi_remove (&gsi, true);
@@ -8833,7 +8860,9 @@ expand_omp_for_generic (struct omp_region *region,
gimple_seq phis;
e = find_edge (cont_bb, l3_bb);
- ne = make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE);
+ ne = make_edge (l2_bb, l3_bb, (seq_loop
+ ? EDGE_FALLTHRU
+ : EDGE_FALSE_VALUE));
phis = phi_nodes (l3_bb);
for (gsi = gsi_start (phis); !gsi_end_p (gsi); gsi_next (&gsi))
@@ -8873,7 +8902,8 @@ expand_omp_for_generic (struct omp_region *region,
e = find_edge (cont_bb, l2_bb);
e->flags = EDGE_FALLTHRU;
}
- make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
+ if (!seq_loop)
+ make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
if (gimple_in_ssa_p (cfun))
{
@@ -8929,12 +8959,16 @@ expand_omp_for_generic (struct omp_region *region,
add_bb_to_loop (l2_bb, outer_loop);
- /* We've added a new loop around the original loop. Allocate the
- corresponding loop struct. */
- struct loop *new_loop = alloc_loop ();
- new_loop->header = l0_bb;
- new_loop->latch = l2_bb;
- add_loop (new_loop, outer_loop);
+ struct loop *new_loop = NULL;
+ if (!seq_loop)
+ {
+ /* We've added a new loop around the original loop. Allocate the
+ corresponding loop struct. */
+ new_loop = alloc_loop ();
+ new_loop->header = l0_bb;
+ new_loop->latch = l2_bb;
+ add_loop (new_loop, outer_loop);
+ }
/* Allocate a loop structure for the original loop unless we already
had one. */
@@ -8944,7 +8978,9 @@ expand_omp_for_generic (struct omp_region *region,
struct loop *orig_loop = alloc_loop ();
orig_loop->header = l1_bb;
/* The loop may have multiple latches. */
- add_loop (orig_loop, new_loop);
+ add_loop (orig_loop, (new_loop != NULL
+ ? new_loop
+ : outer_loop));
}
}
}
@@ -11348,7 +11384,10 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
original loops from being detected. Fix that up. */
loops_state_set (LOOPS_NEED_FIXUP);
- if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
+ if (region->inside_kernels_p)
+ expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+ inner_stmt);
+ else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
expand_omp_simd (region, &fd);
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_CILKFOR)
expand_cilk_for (region, &fd);
@@ -13030,6 +13069,14 @@ expand_omp (struct omp_region *region)
if (region->type == GIMPLE_OMP_PARALLEL)
determine_parallel_type (region);
+ if (region->type == GIMPLE_OMP_TARGET && region->inner)
+ {
+ gomp_target *entry = as_a <gomp_target *> (last_stmt (region->entry));
+ if (gimple_omp_target_kind (entry) == GF_OMP_TARGET_KIND_OACC_KERNELS
+ || region->inside_kernels_p)
+ region->inner->inside_kernels_p = true;
+ }
+
if (region->type == GIMPLE_OMP_FOR
&& gimple_omp_for_combined_p (last_stmt (region->entry)))
inner_stmt = last_stmt (region->inner->entry);
--
1.9.1