On 09/11/15 16:35, Tom de Vries wrote:
Hi,
this patch series for stage1 trunk adds support to:
- parallelize oacc kernels regions using parloops, and
- map the loops onto the oacc gang dimension.
The patch series contains these patches:
1 Insert new exit block only when needed in
transform_to_exit_first_loop_alt
2 Make create_parallel_loop return void
3 Ignore reduction clause on kernels directive
4 Implement -foffload-alias
5 Add in_oacc_kernels_region in struct loop
6 Add pass_oacc_kernels
7 Add pass_dominator_oacc_kernels
8 Add pass_ch_oacc_kernels
9 Add pass_parallelize_loops_oacc_kernels
10 Add pass_oacc_kernels pass group in passes.def
11 Update testcases after adding kernels pass group
12 Handle acc loop directive
13 Add c-c++-common/goacc/kernels-*.c
14 Add gfortran.dg/goacc/kernels-*.f95
15 Add libgomp.oacc-c-c++-common/kernels-*.c
16 Add libgomp.oacc-fortran/kernels-*.f95
The first 9 patches are more or less independent, but patches 10-16 are
intended to be committed at the same time.
Bootstrapped and reg-tested on x86_64.
Build and reg-tested with nvidia accelerator, in combination with a
patch that enables accelerator testing (which is submitted at
https://gcc.gnu.org/ml/gcc-patches/2015-10/msg01771.html ).
I'll post the individual patches in reply to this message.
This patch adds C/C++ oacc kernels compilation tests.
Thanks,
- Tom
Add c-c++-common/goacc/kernels-*.c
2015-11-09 Tom de Vries <t...@codesourcery.com>
* c-c++-common/goacc/kernels-acc-loop-reduction.c: New test.
* c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test.
* c-c++-common/goacc/kernels-counter-var-redundant-load.c: New test.
* c-c++-common/goacc/kernels-counter-vars-function-scope.c: New test.
* c-c++-common/goacc/kernels-double-reduction.c: New test.
* c-c++-common/goacc/kernels-empty.c: New test.
* c-c++-common/goacc/kernels-eternal.c: New test.
* c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-2.c: New test.
* c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-3.c: New test.
* c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-data-2.c: New test.
* c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: New test.
* c-c++-common/goacc/kernels-loop-data-enter-exit.c: New test.
* c-c++-common/goacc/kernels-loop-data-update.c: New test.
* c-c++-common/goacc/kernels-loop-data.c: New test.
* c-c++-common/goacc/kernels-loop-g.c: New test.
* c-c++-common/goacc/kernels-loop-mod-not-zero.c: New test.
* c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-n.c: New test.
* c-c++-common/goacc/kernels-loop-nest.c: New test.
* c-c++-common/goacc/kernels-loop.c: New test.
* c-c++-common/goacc/kernels-noreturn.c: New test.
* c-c++-common/goacc/kernels-one-counter-var.c: New test.
* c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c: New test.
* c-c++-common/goacc/kernels-reduction.c: New test.
---
.../goacc/kernels-acc-loop-reduction.c | 25 ++++++++
.../goacc/kernels-acc-loop-smaller-equal.c | 25 ++++++++
.../goacc/kernels-counter-var-redundant-load.c | 36 +++++++++++
.../goacc/kernels-counter-vars-function-scope.c | 54 +++++++++++++++++
.../c-c++-common/goacc/kernels-double-reduction.c | 37 ++++++++++++
gcc/testsuite/c-c++-common/goacc/kernels-empty.c | 6 ++
gcc/testsuite/c-c++-common/goacc/kernels-eternal.c | 11 ++++
.../c-c++-common/goacc/kernels-loop-2-acc-loop.c | 21 +++++++
gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c | 70 ++++++++++++++++++++++
.../c-c++-common/goacc/kernels-loop-3-acc-loop.c | 17 ++++++
gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c | 49 +++++++++++++++
.../c-c++-common/goacc/kernels-loop-acc-loop.c | 17 ++++++
.../c-c++-common/goacc/kernels-loop-data-2.c | 70 ++++++++++++++++++++++
.../goacc/kernels-loop-data-enter-exit-2.c | 68 +++++++++++++++++++++
.../goacc/kernels-loop-data-enter-exit.c | 65 ++++++++++++++++++++
.../c-c++-common/goacc/kernels-loop-data-update.c | 65 ++++++++++++++++++++
.../c-c++-common/goacc/kernels-loop-data.c | 64 ++++++++++++++++++++
gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c | 17 ++++++
.../c-c++-common/goacc/kernels-loop-mod-not-zero.c | 52 ++++++++++++++++
.../c-c++-common/goacc/kernels-loop-n-acc-loop.c | 17 ++++++
gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c | 56 +++++++++++++++++
.../c-c++-common/goacc/kernels-loop-nest.c | 39 ++++++++++++
gcc/testsuite/c-c++-common/goacc/kernels-loop.c | 56 +++++++++++++++++
.../c-c++-common/goacc/kernels-noreturn.c | 12 ++++
.../c-c++-common/goacc/kernels-one-counter-var.c | 54 +++++++++++++++++
.../goacc/kernels-parallel-loop-data-enter-exit.c | 66 ++++++++++++++++++++
.../c-c++-common/goacc/kernels-reduction.c | 36 +++++++++++
27 files changed, 1105 insertions(+)
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-empty.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-eternal.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-noreturn.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
new file mode 100644
index 0000000..dcc5891
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
@@ -0,0 +1,25 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n, unsigned int *a)
+{
+ unsigned int sum = 0;
+
+#pragma acc kernels loop gang reduction(+:sum)
+ for (int i = 0; i < n; i++)
+ sum += a[i];
+
+ return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
new file mode 100644
index 0000000..c05c694
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
@@ -0,0 +1,25 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n)
+{
+ unsigned int sum = 1;
+
+ #pragma acc kernels loop
+ for (int i = 1; i <= n; i++)
+ sum += i;
+
+ return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c b/gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
new file mode 100644
index 0000000..ad101dd
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
@@ -0,0 +1,36 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-dom_oacc_kernels3" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+COUNTERTYPE
+foo (unsigned int *c)
+{
+ COUNTERTYPE ii;
+
+#pragma acc kernels copyout (c[0:N])
+ {
+ for (ii = 0; ii < N; ii++)
+ c[ii] = 1;
+ }
+
+ return ii;
+}
+
+/* We're expecting:
+
+ .omp_data_i_10 = &.omp_data_arr.3;
+ _11 = .omp_data_i_10->ii;
+ *_11 = 0;
+ _15 = .omp_data_i_10->c;
+ c.1_16 = *_15;
+
+ Check that there is one load from anonymous ssa-name, which we assume to
+ be:
+ - the one to read c. */
+
+/* { dg-final { scan-tree-dump-times "(?n)\\*_\[0-9\]\[0-9\]*;$" 1 "dom_oacc_kernels3" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c b/gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c
new file mode 100644
index 0000000..650fb8ca
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c
@@ -0,0 +1,54 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+ COUNTERTYPE i;
+ COUNTERTYPE ii;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+ for (i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c
new file mode 100644
index 0000000..da20f34
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c
@@ -0,0 +1,37 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N 500
+
+unsigned int a[N][N];
+
+void __attribute__((noinline,noclone))
+foo (void)
+{
+ int i, j;
+ unsigned int sum = 1;
+
+#pragma acc kernels copyin (a[0:N]) copy (sum)
+ {
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < N; ++j)
+ sum += a[i][j];
+ }
+
+ if (sum != 5001)
+ abort ();
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-empty.c b/gcc/testsuite/c-c++-common/goacc/kernels-empty.c
new file mode 100644
index 0000000..e91b81c
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-empty.c
@@ -0,0 +1,6 @@
+void
+foo (void)
+{
+#pragma acc kernels
+ ;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-eternal.c b/gcc/testsuite/c-c++-common/goacc/kernels-eternal.c
new file mode 100644
index 0000000..edc17d2
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-eternal.c
@@ -0,0 +1,11 @@
+int
+main (void)
+{
+#pragma acc kernels
+ {
+ while (1)
+ ;
+ }
+
+ return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
new file mode 100644
index 0000000..6a4fb1f
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
@@ -0,0 +1,21 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop-2.c"
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 3 "parloops_oacc_kernels" } } */
+
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c
new file mode 100644
index 0000000..514591e
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c
@@ -0,0 +1,70 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc kernels copyout (a[0:N])
+ {
+#ifdef ACC_LOOP
+ #pragma acc loop
+#endif
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc kernels copyout (b[0:N])
+ {
+#ifdef ACC_LOOP
+ #pragma acc loop
+#endif
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+#ifdef ACC_LOOP
+ #pragma acc loop
+#endif
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 3 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
new file mode 100644
index 0000000..a9e81ee
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop-3.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
new file mode 100644
index 0000000..790add9
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
@@ -0,0 +1,49 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int i;
+
+ unsigned int *__restrict c;
+
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ c[i] = i * 2;
+
+#pragma acc kernels copy (c[0:N])
+ {
+#ifdef ACC_LOOP
+ #pragma acc loop
+#endif
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = c[ii] + ii + 1;
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != i * 2 + i + 1)
+ abort ();
+
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
+
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
new file mode 100644
index 0000000..516598f
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
new file mode 100644
index 0000000..095ed6c
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
@@ -0,0 +1,70 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N])
+ {
+#pragma acc kernels present (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+ }
+
+#pragma acc data copyout (b[0:N])
+ {
+#pragma acc kernels present (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+ }
+
+#pragma acc data copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 3 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
new file mode 100644
index 0000000..9efffac
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
@@ -0,0 +1,68 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N])
+#pragma acc kernels present (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+#pragma acc exit data copyout (a[0:N])
+
+#pragma acc enter data create (b[0:N])
+#pragma acc kernels present (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+#pragma acc exit data copyout (b[0:N])
+
+
+#pragma acc enter data copyin (a[0:N], b[0:N]) create (c[0:N])
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+#pragma acc exit data copyout (c[0:N])
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 3 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
new file mode 100644
index 0000000..2da20b4
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
@@ -0,0 +1,65 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc kernels present (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 3 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
new file mode 100644
index 0000000..09b63e5
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
@@ -0,0 +1,65 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc update device (b[0:N])
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only two loops are analyzed, and that both can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 2 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
new file mode 100644
index 0000000..437fd73
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
@@ -0,0 +1,64 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N], b[0:N], c[0:N])
+ {
+#pragma acc kernels present (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc kernels present (b[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 3 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c
new file mode 100644
index 0000000..27e23f8
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-g" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include "kernels-loop.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
+
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c
new file mode 100644
index 0000000..940341d
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c
@@ -0,0 +1,52 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N ((1024 * 512) + 1)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
new file mode 100644
index 0000000..64e59a2
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop-n.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c
new file mode 100644
index 0000000..73c6142
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c
@@ -0,0 +1,56 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N ((1024 * 512) + 1)
+#define COUNTERTYPE unsigned int
+
+int
+foo (COUNTERTYPE n)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (n * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:n], b[0:n]) copyout (c[0:n])
+ {
+#ifdef ACC_LOOP
+ #pragma acc loop
+#endif
+ for (COUNTERTYPE ii = 0; ii < n; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < n; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
+
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
new file mode 100644
index 0000000..d2aeda6
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
@@ -0,0 +1,39 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Based on autopar/outer-1.c. */
+
+#include <stdlib.h>
+
+#define N 1000
+
+int
+main (void)
+{
+ int x[N][N];
+
+#pragma acc kernels copyout (x)
+ {
+ for (int ii = 0; ii < N; ii++)
+ for (int jj = 0; jj < N; jj++)
+ x[ii][jj] = ii + jj + 3;
+ }
+
+ for (int i = 0; i < N; i++)
+ for (int j = 0; j < N; j++)
+ if (x[i][j] != i + j + 3)
+ abort ();
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop.c
new file mode 100644
index 0000000..925a84e
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop.c
@@ -0,0 +1,56 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+#ifdef ACC_LOOP
+ #pragma acc loop
+#endif
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
+
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-noreturn.c b/gcc/testsuite/c-c++-common/goacc/kernels-noreturn.c
new file mode 100644
index 0000000..1a8cc67
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-noreturn.c
@@ -0,0 +1,12 @@
+int
+main (void)
+{
+
+#pragma acc kernels
+ {
+ __builtin_abort ();
+ }
+
+ return 0;
+}
+
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c b/gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c
new file mode 100644
index 0000000..b000a8c
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c
@@ -0,0 +1,54 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+ COUNTERTYPE i;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+ for (i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ for (i = 0; i < N; i++)
+ c[i] = a[i] + b[i];
+ }
+
+ for (i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
+
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c b/gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
new file mode 100644
index 0000000..31b06bd
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
@@ -0,0 +1,66 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+ {
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+ }
+
+#pragma acc parallel present (b[0:N])
+ {
+#pragma acc loop
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+ }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+#pragma acc exit data copyout (a[0:N], b[0:N], c[0:N])
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+
+ free (a);
+ free (b);
+ free (c);
+
+ return 0;
+}
+
+/* Check that only two loops are analyzed, and that both can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 2 "parloops_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
new file mode 100644
index 0000000..6a0b7a2
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
@@ -0,0 +1,36 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define n 10000
+
+unsigned int a[n];
+
+void __attribute__((noinline,noclone))
+foo (void)
+{
+ int i;
+ unsigned int sum = 1;
+
+#pragma acc kernels copyin (a[0:n]) copy (sum)
+ {
+ for (i = 0; i < n; ++i)
+ sum += a[i];
+ }
+
+ if (sum != 5001)
+ abort ();
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)oacc function \\(32," 1 "parloops_oacc_kernels" } } */
+
--
1.9.1