This patch backports the trunk change to the way that 'acc parallel loop' reductions are handled. Previously, the reduction clause was associated only with the split acc loop. Now the reduction clause is associated with both the loop and the parallel region. That is beneficial because the gimplifier adds implicit copy clauses, where necessary, for any reduction variable attached to a parallel construct.
I had to update reduction-2.f95 because of the way gomp4 implements device_type, which tends to rearrange the order of the clauses. Also, libgomp.oacc-c++/template-reduction.C is broken in gomp4, so I had to xfail it. Apparently, it exposed an async bug. My forthcoming patch, which uses firstprivate pointers for subarrays, should fix it. This patch has been committed to gomp-4_0-branch. Cesar
2016-05-09 Cesar Philippidis <ce...@codesourcery.com> Backport trunk r235651: 2016-04-29 Cesar Philippidis <ce...@codesourcery.com> gcc/c-family/ PR middle-end/70626 * c-common.h (c_oacc_split_loop_clauses): Add boolean argument. * c-omp.c (c_oacc_split_loop_clauses): Use it to duplicate reduction clauses in acc parallel loops. gcc/c/ PR middle-end/70626 * c-parser.c (c_parser_oacc_loop): Don't augment mask with OACC_LOOP_CLAUSE_MASK. (c_parser_oacc_kernels_parallel): Update call to c_oacc_split_loop_clauses. gcc/cp/ PR middle-end/70626 * parser.c (cp_parser_oacc_loop): Don't augment mask with OACC_LOOP_CLAUSE_MASK. (cp_parser_oacc_kernels_parallel): Update call to c_oacc_split_loop_clauses. gcc/fortran/ PR middle-end/70626 * trans-openmp.c (gfc_trans_oacc_combined_directive): Duplicate the reduction clause in both parallel and loop directives. gcc/testsuite/ PR middle-end/70626 * c-c++-common/goacc/combined-reduction.c: New test. * gfortran.dg/goacc/reduction-2.f95: Add check for kernels reductions. libgomp/ PR middle-end/70626 * testsuite/libgomp.oacc-c++/template-reduction.C: Adjust test. * testsuite/libgomp.oacc-c-c++-common/combined-reduction.c: New test. * testsuite/libgomp.oacc-fortran/combined-reduction.f90: New test. 
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h index ef3493e..daa77f9 100644 --- a/gcc/c-family/c-common.h +++ b/gcc/c-family/c-common.h @@ -1285,7 +1285,7 @@ extern bool c_omp_check_loop_iv (tree, tree, walk_tree_lh); extern bool c_omp_check_loop_iv_exprs (location_t, tree, tree, tree, tree, walk_tree_lh); extern tree c_finish_oacc_wait (location_t, tree, tree); -extern tree c_oacc_split_loop_clauses (tree, tree *); +extern tree c_oacc_split_loop_clauses (tree, tree *, bool); extern void c_omp_split_clauses (location_t, enum tree_code, omp_clause_mask, tree, tree *); extern tree c_omp_declare_simd_clauses_to_numbers (tree, tree); diff --git a/gcc/c-family/c-omp.c b/gcc/c-family/c-omp.c index 4d3f7dc..614ee1f 100644 --- a/gcc/c-family/c-omp.c +++ b/gcc/c-family/c-omp.c @@ -861,9 +861,10 @@ c_omp_check_loop_iv_exprs (location_t stmt_loc, tree declv, tree decl, #pragma acc parallel loop */ tree -c_oacc_split_loop_clauses (tree clauses, tree *not_loop_clauses) +c_oacc_split_loop_clauses (tree clauses, tree *not_loop_clauses, + bool is_parallel) { - tree next, loop_clauses; + tree next, loop_clauses, nc; loop_clauses = *not_loop_clauses = NULL_TREE; for (; clauses ; clauses = next) @@ -882,7 +883,23 @@ c_oacc_split_loop_clauses (tree clauses, tree *not_loop_clauses) case OMP_CLAUSE_SEQ: case OMP_CLAUSE_INDEPENDENT: case OMP_CLAUSE_PRIVATE: + OMP_CLAUSE_CHAIN (clauses) = loop_clauses; + loop_clauses = clauses; + break; + + /* Reductions must be duplicated on both constructs. 
*/ case OMP_CLAUSE_REDUCTION: + if (is_parallel) + { + nc = build_omp_clause (OMP_CLAUSE_LOCATION (clauses), + OMP_CLAUSE_REDUCTION); + OMP_CLAUSE_DECL (nc) = OMP_CLAUSE_DECL (clauses); + OMP_CLAUSE_REDUCTION_CODE (nc) + = OMP_CLAUSE_REDUCTION_CODE (clauses); + OMP_CLAUSE_CHAIN (nc) = *not_loop_clauses; + *not_loop_clauses = nc; + } + OMP_CLAUSE_CHAIN (clauses) = loop_clauses; loop_clauses = clauses; break; diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 48fa26a..0f2d871 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -14012,6 +14012,8 @@ static tree c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name, omp_clause_mask mask, tree *cclauses, bool *if_p) { + bool is_parallel = ((mask >> PRAGMA_OACC_CLAUSE_REDUCTION) & 1) == 1; + strcat (p_name, " loop"); mask |= OACC_LOOP_CLAUSE_MASK; @@ -14020,7 +14022,7 @@ c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name, cclauses == NULL); if (cclauses) { - clauses = c_oacc_split_loop_clauses (clauses, cclauses); + clauses = c_oacc_split_loop_clauses (clauses, cclauses, is_parallel); if (*cclauses) *cclauses = c_finish_omp_clauses (*cclauses, C_ORT_ACC); if (clauses) @@ -14128,8 +14130,6 @@ c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser, if (strcmp (p, "loop") == 0) { c_parser_consume_token (parser); - mask |= OACC_LOOP_CLAUSE_MASK; - tree block = c_begin_omp_parallel (); tree clauses; c_parser_oacc_loop (loc, parser, p_name, mask, &clauses, if_p); diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index f4b2603..f43c962 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -35581,6 +35581,8 @@ static tree cp_parser_oacc_loop (cp_parser *parser, cp_token *pragma_tok, char *p_name, omp_clause_mask mask, tree *cclauses, bool *if_p) { + bool is_parallel = ((mask >> PRAGMA_OACC_CLAUSE_REDUCTION) & 1) == 1; + strcat (p_name, " loop"); mask |= OACC_LOOP_CLAUSE_MASK; @@ -35589,7 +35591,7 @@ cp_parser_oacc_loop (cp_parser *parser, cp_token *pragma_tok, char *p_name, cclauses == 
NULL); if (cclauses) { - clauses = c_oacc_split_loop_clauses (clauses, cclauses); + clauses = c_oacc_split_loop_clauses (clauses, cclauses, is_parallel); if (*cclauses) *cclauses = finish_omp_clauses (*cclauses, C_ORT_ACC); if (clauses) @@ -35697,8 +35699,6 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok, if (strcmp (p, "loop") == 0) { cp_lexer_consume_token (parser->lexer); - mask |= OACC_LOOP_CLAUSE_MASK; - tree block = begin_omp_parallel (); tree clauses; cp_parser_oacc_loop (parser, pragma_tok, p_name, mask, &clauses, diff --git a/gcc/cp/semantics.c b/gcc/cp/semantics.c index 765e08b..9945365 100644 --- a/gcc/cp/semantics.c +++ b/gcc/cp/semantics.c @@ -6703,7 +6703,7 @@ finish_omp_clauses (tree clauses, enum c_omp_region_type ort) OMP_CLAUSE_DECL (c) = t; } if (TREE_CODE (t) == COMPONENT_REF - && (ort & C_ORT_OMP_DECLARE_SIMD) == C_ORT_OMP + && allow_fields && OMP_CLAUSE_CODE (c) != OMP_CLAUSE__CACHE_) { if (type_dependent_expression_p (t)) diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h index 6a50a7b..2327b13 100644 --- a/gcc/fortran/gfortran.h +++ b/gcc/fortran/gfortran.h @@ -2948,6 +2948,7 @@ bool gfc_check_intrinsic_standard (const gfc_intrinsic_sym*, const char**, bool, locus); /* match.c -- FIXME */ +gfc_omp_namelist *gfc_copy_omp_namelist (gfc_omp_namelist *); void gfc_free_iterator (gfc_iterator *, int); void gfc_free_forall_iterator (gfc_forall_iterator *); void gfc_free_alloc_list (gfc_alloc *); diff --git a/gcc/fortran/match.c b/gcc/fortran/match.c index 2490f85..30d571d 100644 --- a/gcc/fortran/match.c +++ b/gcc/fortran/match.c @@ -4744,6 +4744,28 @@ gfc_free_omp_namelist (gfc_omp_namelist *name) } } +/* Duplicate an omp namelist. 
*/ + +gfc_omp_namelist * +gfc_copy_omp_namelist (gfc_omp_namelist *name) +{ + gfc_omp_namelist *nl = NULL, *t; + + for (; name; name = name->next) + { + t = gfc_get_omp_namelist (); + t->sym = name->sym; + t->expr = gfc_copy_expr (name->expr);; + t->u = name->u; + t->udr = NULL; + t->where = name->where; + t->next = nl; + nl = t; + } + + return nl; +} + /* Match a NAMELIST statement. */ diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c index bbb5364..1a91901 100644 --- a/gcc/fortran/trans-openmp.c +++ b/gcc/fortran/trans-openmp.c @@ -3642,7 +3642,8 @@ gfc_trans_omp_do (gfc_code *code, gfc_exec_op op, stmtblock_t *pblock, static void gfc_filter_oacc_combined_clauses (gfc_omp_clauses **orig_clauses, - gfc_omp_clauses **loop_clauses) + gfc_omp_clauses **loop_clauses, + enum tree_code construct_code) { if (*orig_clauses == NULL) { @@ -3683,8 +3684,9 @@ gfc_filter_oacc_combined_clauses (gfc_omp_clauses **orig_clauses, (*loop_clauses)->tile_list = (*orig_clauses)->tile_list; (*orig_clauses)->tile_list = NULL; (*loop_clauses)->lists[OMP_LIST_REDUCTION] - = (*orig_clauses)->lists[OMP_LIST_REDUCTION]; - (*orig_clauses)->lists[OMP_LIST_REDUCTION] = NULL; + = gfc_copy_omp_namelist ((*orig_clauses)->lists[OMP_LIST_REDUCTION]); + if (construct_code == OACC_KERNELS) + (*orig_clauses)->lists[OMP_LIST_REDUCTION] = NULL; #if 0 (*loop_clauses)->lists[OMP_LIST_PRIVATE] = (*orig_clauses)->lists[OMP_LIST_PRIVATE]; @@ -3693,7 +3695,8 @@ gfc_filter_oacc_combined_clauses (gfc_omp_clauses **orig_clauses, (*loop_clauses)->device_types = (*orig_clauses)->device_types; gfc_filter_oacc_combined_clauses (&(*orig_clauses)->dtype_clauses, - &(*loop_clauses)->dtype_clauses); + &(*loop_clauses)->dtype_clauses, + construct_code); } /* Combined OpenACC parallel loop and kernels loop. 
*/ @@ -3723,7 +3726,8 @@ gfc_trans_oacc_combined_directive (gfc_code *code) gfc_start_block (&block); - gfc_filter_oacc_combined_clauses (&code->ext.omp_clauses, &loop_clauses); + gfc_filter_oacc_combined_clauses (&code->ext.omp_clauses, &loop_clauses, + construct_code); oacc_clauses = gfc_trans_omp_clauses (&block, code->ext.omp_clauses, code->loc); diff --git a/gcc/testsuite/c-c++-common/goacc/combined-reduction.c b/gcc/testsuite/c-c++-common/goacc/combined-reduction.c new file mode 100644 index 0000000..ecf23f5 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/combined-reduction.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-fopenacc -fdump-tree-gimple" } */ + +#include <assert.h> + +int +main () +{ + int i, v1 = 0, n = 100; + +#pragma acc parallel loop reduction(+:v1) + for (i = 0; i < n; i++) + v1++; + + assert (v1 == n); + +#pragma acc kernels loop reduction(+:v1) + for (i = 0; i < n; i++) + v1++; + + assert (v1 == n); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "omp target oacc_parallel reduction.+:v1. map.tofrom:v1" 1 "gimple" } } */ +/* { dg-final { scan-tree-dump-times "acc loop reduction.+:v1. private.i." 1 "gimple" } } */ +/* { dg-final { scan-tree-dump-times "omp target oacc_kernels map.force_tofrom:n .len: 4.. map.force_tofrom:v1 .len: 4.." 1 "gimple" } } */ +/* { dg-final { scan-tree-dump-times "acc loop reduction.+:v1. private.i." 
1 "gimple" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/pr70688.c b/gcc/testsuite/c-c++-common/goacc/pr70688.c new file mode 100644 index 0000000..37c3885 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/pr70688.c @@ -0,0 +1,50 @@ +/* { dg-compile } */ + +const int n = 100; + +int +private_reduction () +{ + int i, r; + + #pragma acc parallel + #pragma acc loop private (r) reduction (+:r) + for (i = 0; i < 100; i++) + r += 10; + + return r; +} + +int +parallel_reduction () +{ + int sum = 0; + int dummy = 0; + +#pragma acc data copy (dummy) + { +#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum) /* { dg-warning "region is gang partitioned" } */ + { + int v = 5; + sum += 10 + v; + } + } + + return sum; +} + +int +main () +{ + int i, s = 0; + +#pragma acc parallel num_gangs (10) copy (s) reduction (+:s) /* { dg-warning "region is gang partitioned" } */ + for (i = 0; i < n; i++) + s += i+1; + +#pragma acc parallel num_gangs (10) reduction (+:s) copy (s) /* { dg-warning "region is gang partitioned" } */ + for (i = 0; i < n; i++) + s += i+1; + + return 0; +} diff --git a/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 b/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 index 08d25ca..00b8822 100644 --- a/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 +++ b/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 @@ -165,5 +165,5 @@ end subroutine test ! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. tile.2, 3" 2 "gimple" { xfail *-*-* } } } ! { dg-final { scan-tree-dump-times "acc loop private.i. independent" 2 "gimple" { xfail *-*-* } } } ! { dg-final { scan-tree-dump-times "private.z" 2 "gimple" } } -! { dg-final { scan-tree-dump-times "omp target oacc_\[^ \]+ map.force_tofrom:y" 2 "gimple" { xfail *-*-* } } } +! { dg-final { scan-tree-dump-times "omp target oacc_\[^ \]+ map.force_tofrom:y" 2 "gimple" } } ! { dg-final { scan-tree-dump-times "acc loop private.i. reduction..:y." 
2 "gimple" { xfail *-*-* } } } diff --git a/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 b/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 index 91a4b2c..d083c7b 100644 --- a/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 +++ b/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 @@ -1,3 +1,4 @@ +! { dg-do compile } ! { dg-additional-options "-fdump-tree-gimple" } subroutine foo () @@ -14,5 +15,8 @@ subroutine foo () !$acc end kernels loop end subroutine -! { dg-final { scan-tree-dump-times "target oacc_parallel firstprivate.a." 1 "gimple" } } +! { dg-final { scan-tree-dump-times "target oacc_parallel reduction..:a. map.tofrom.a." 1 "gimple" } } ! { dg-final { scan-tree-dump-times "acc loop reduction..:a. private.p." 1 "gimple" } } +! { dg-final { scan-tree-dump-times "target oacc_kernels map.force_tofrom:a .len: 4.." 1 "gimple" } } +! { dg-final { scan-tree-dump-times "acc loop reduction..:a. private.k." 1 "gimple" } } + diff --git a/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C b/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C index fb5924c..e2a0ca0 100644 --- a/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C +++ b/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C @@ -1,3 +1,7 @@ +// TODO: async_sum is currently failing on nvptx. 
+ +// { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } } + const int n = 100; // Check explicit template copy map @@ -7,7 +11,7 @@ sum (T array[]) { T s = 0; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (s, array[0:n]) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (array[0:n]) for (int i = 0; i < n; i++) s += array[i]; @@ -25,7 +29,7 @@ sum () for (int i = 0; i < n; i++) array[i] = i+1; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (s) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) for (int i = 0; i < n; i++) s += array[i]; @@ -43,7 +47,7 @@ async_sum (T array[]) for (int i = 0; i < n; i++) array[i] = i+1; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) present (array[0:n]) copy (s) async wait (1) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) present (array[0:n]) async wait (1) for (int i = 0; i < n; i++) s += array[i]; @@ -59,7 +63,7 @@ async_sum (int c) { T s = 0; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy(s) firstprivate (c) async wait (1) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) firstprivate (c) async wait (1) for (int i = 0; i < n; i++) s += i+c; diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-reduction.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-reduction.c new file mode 100644 index 0000000..b5ce4ed --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-reduction.c @@ -0,0 +1,23 @@ +/* Test a combined acc parallel loop reduction. 
*/ + +/* { dg-do run } */ + +#include <assert.h> + +int +main () +{ + int i, v1 = 0, v2 = 0, n = 100; + +#pragma acc parallel loop reduction(+:v1, v2) + for (i = 0; i < n; i++) + { + v1++; + v2++; + } + + assert (v1 == n); + assert (v2 == n); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-fortran/combined-reduction.f90 b/libgomp/testsuite/libgomp.oacc-fortran/combined-reduction.f90 new file mode 100644 index 0000000..d3a61b5 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/combined-reduction.f90 @@ -0,0 +1,19 @@ +! Test a combined acc parallel loop reduction. + +! { dg-do run } + +program test + implicit none + integer i, n, var + + n = 100 + var = 0 + + !$acc parallel loop reduction(+:var) + do i = 1, 100 + var = var + 1 + end do + !$acc end parallel loop + + if (var .ne. n) call abort +end program test