On 2015/6/16 05:05 PM, Tom de Vries wrote:
> On 16/06/15 10:59, Chung-Lin Tang wrote:
>> This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a
>> "sequential" loop form (without the OMP runtime calls), used for loop
>> directives inside OpenACC kernels constructs. Tom mentions that this
>> allows the kernels parallelization to work when '#pragma acc loop' makes
>> the front-ends create OMP_FOR, which the loop analysis phases don't
>> understand.
>>
>> Tested and committed to gomp-4_0-branch.
>>
>
> Hi Chung-Lin,
>
> can you commit a test-case to exercise the code?
>
> Thanks,
> - Tom
Just committed the attached testcase patch to gomp-4_0-branch.

Chung-Lin

2015-06-23  Chung-Lin Tang  <clt...@codesourcery.com>

	gcc/testsuite/
	* c-c++-common/goacc/kernels-loop.c (ACC_LOOP): Add #ifndef/#define.
	(main): Tag loops inside kernels construct with '#pragma ACC_LOOP'.
	* c-c++-common/goacc/kernels-loop-2.c: Likewise.
	* c-c++-common/goacc/kernels-loop-3.c: Likewise.
	* c-c++-common/goacc/kernels-loop-n.c: Likewise.
	* c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
Index: gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c (revision 0) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c (revision 0) @@ -0,0 +1,20 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-ftree-parallelize-loops=32" } */ +/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP acc loop +#include "kernels-loop-3.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ + +/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */ + +/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */ Index: gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c (revision 224836) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c (working copy) @@ -8,6 +8,10 @@ #define N (1024 * 512) #define COUNTERTYPE unsigned int +#ifndef ACC_LOOP +#define ACC_LOOP +#endif + int main (void) { @@ -21,18 +25,21 @@ main (void) #pragma acc kernels copyout (a[0:N]) { + #pragma ACC_LOOP for (COUNTERTYPE i = 0; i < N; i++) a[i] = i * 2; } #pragma acc kernels copyout (b[0:N]) { + #pragma ACC_LOOP for (COUNTERTYPE i = 0; i < N; i++) b[i] = i * 4; 
} #pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) { + #pragma ACC_LOOP for (COUNTERTYPE ii = 0; ii < N; ii++) c[ii] = a[ii] + b[ii]; } Index: gcc/testsuite/c-c++-common/goacc/kernels-loop.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop.c (revision 224836) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop.c (working copy) @@ -8,6 +8,10 @@ #define N (1024 * 512) #define COUNTERTYPE unsigned int +#ifndef ACC_LOOP +#define ACC_LOOP +#endif + int main (void) { @@ -27,6 +31,7 @@ main (void) #pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) { + #pragma ACC_LOOP for (COUNTERTYPE ii = 0; ii < N; ii++) c[ii] = a[ii] + b[ii]; } Index: gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c (revision 0) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c (revision 0) @@ -0,0 +1,23 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-ftree-parallelize-loops=32" } */ +/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP acc loop +#include "kernels-loop-2.c" + +/* Check that only three loops are analyzed, and that all can be + parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */ + +/* Check that the loop has been split off into a function. 
*/ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */ + +/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 3 "parloops_oacc_kernels" } } */ + +/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */ Index: gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c (revision 224836) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c (working copy) @@ -8,6 +8,10 @@ #define N ((1024 * 512) + 1) #define COUNTERTYPE unsigned int +#ifndef ACC_LOOP +#define ACC_LOOP +#endif + int foo (COUNTERTYPE n) { @@ -27,6 +31,7 @@ foo (COUNTERTYPE n) #pragma acc kernels copyin (a[0:n], b[0:n]) copyout (c[0:n]) { + #pragma ACC_LOOP for (COUNTERTYPE ii = 0; ii < n; ii++) c[ii] = a[ii] + b[ii]; } Index: gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c (revision 0) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c (revision 0) @@ -0,0 +1,20 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-ftree-parallelize-loops=32" } */ +/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP acc loop +#include "kernels-loop.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. 
*/ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ + +/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */ + +/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */ Index: gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c (revision 224836) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c (working copy) @@ -8,6 +8,10 @@ #define N (1024 * 512) #define COUNTERTYPE unsigned int +#ifndef ACC_LOOP +#define ACC_LOOP +#endif + int main (void) { @@ -22,6 +26,7 @@ main (void) #pragma acc kernels copy (c[0:N]) { + #pragma ACC_LOOP for (COUNTERTYPE ii = 0; ii < N; ii++) c[ii] = c[ii] + ii + 1; } Index: gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c =================================================================== --- gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c (revision 0) +++ gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c (revision 0) @@ -0,0 +1,20 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-ftree-parallelize-loops=32" } */ +/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP acc loop +#include "kernels-loop-n.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. 
*/ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */ + +/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */ + +/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */