Hi,

attached patch handles reductions in oacc kernels region.

The approach uses the normal parloops reduction handling code, with these modifications:

1.

For each reduction, we look for this pattern in the oacc-lowered code, and store 'addr' in the corresponding struct reduction_info:
...
 <bb preheader>
   .omp_data_i = &.omp_data_arr;
   addr = .omp_data_i->sum;
   sum_a = *addr;

 <bb header>:
   sum_b = PHI <sum_a (preheader), sum_c (latch)>
...

2.

We replaces the non-atomic store to 'addr' at the end of the kernels region with an atomic one.


Bootstrapped and reg-tested on x86_64 on top of gomp-4_0-branch.

Committed to gomp-4_0-branch.

Thanks,
- Tom

Handle reduction in oacc kernels region

2015-06-18  Tom de Vries  <t...@codesourcery.com>

	* tree-parloops.c (struct reduction_info): Add reduc_addr field.
	(create_call_for_reduction_1): Handle case that reduc_addr is non-NULL.
	(gen_parallel_loop): Init clsn_data for oacc_kernels_p case.
	(try_create_reduction_list): Add and handle oacc_kernels_p parameter.
	(parallelize_loops): Add argument to call to try_create_reduction_list.

	* testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c: New test.

	* c-c++-common/goacc/kernels-reduction.c: New test.
---
 .../c-c++-common/goacc/kernels-reduction.c         | 38 +++++++++
 gcc/tree-parloops.c                                | 92 ++++++++++++++++++++--
 .../libgomp.oacc-c-c++-common/kernels-reduction.c  | 37 +++++++++
 3 files changed, 162 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c

diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
new file mode 100644
index 0000000..bfbcdbd
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
@@ -0,0 +1,38 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define n 10000
+
+unsigned int a[n];
+
+void  __attribute__((noinline,noclone))
+foo (void)
+{
+  int i;
+  unsigned int sum = 1;
+
+#pragma acc kernels copyin (a[0:n]) copy (sum)
+  {
+    for (i = 0; i < n; ++i)
+      sum += a[i];
+  }
+
+  if (sum != 5001)
+    abort ();
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 0661b78..c5f4d9a 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -218,6 +218,8 @@ struct reduction_info
 				   of the reduction variable when existing the loop. */
   tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
   tree field;			/*  the name of the field in the parloop data structure intended for reduction.  */
+  tree reduc_addr;		/* The address of the reduction variable for
+				   openacc reductions.  */
   tree init;			/* reduction initialization value.  */
   gphi *new_phi;		/* (helper field) Newly created phi node whose result
 				   will be passed to the atomic operation.  Represents
@@ -1107,10 +1109,30 @@ create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
   tree tmp_load, name;
   gimple load;
 
-  load_struct = build_simple_mem_ref (clsn_data->load);
-  t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
+  if (reduc->reduc_addr == NULL_TREE)
+    {
+      load_struct = build_simple_mem_ref (clsn_data->load);
+      t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
+
+      addr = build_addr (t, current_function_decl);
+    }
+  else
+    {
+      /* Set the address for the atomic store.  */
+      addr = reduc->reduc_addr;
+
+      /* Remove the non-atomic store '*addr = sum'.  */
+      tree res = PHI_RESULT (reduc->keep_res);
+      use_operand_p use_p;
+      gimple stmt;
+      bool single_use_p = single_imm_use (res, &use_p, &stmt);
+      gcc_assert (single_use_p);
+      replace_uses_by (gimple_vdef (stmt),
+		       gimple_vuse (stmt));
+      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+      gsi_remove (&gsi, true);
+    }
 
-  addr = build_addr (t, current_function_decl);
 
   /* Create phi node.  */
   bb = clsn_data->load_bb;
@@ -2441,6 +2463,10 @@ gen_parallel_loop (struct loop *loop,
     {
       arg_struct = NULL_TREE;
       new_arg_struct = NULL_TREE;
+      clsn_data.load = NULL_TREE;
+      clsn_data.load_bb = exit->dest;
+      clsn_data.store = NULL_TREE;
+      clsn_data.store_bb = NULL;
     }
 
   /* Create the parallel constructs.  */
@@ -2591,7 +2617,8 @@ try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
 
 static bool
 try_create_reduction_list (loop_p loop,
-			   reduction_info_table_type *reduction_list)
+			   reduction_info_table_type *reduction_list,
+			   bool oacc_kernels_p)
 {
   edge exit = single_dom_exit (loop);
   gphi_iterator gsi;
@@ -2681,6 +2708,61 @@ try_create_reduction_list (loop_p loop,
     }
 
 
+  if (oacc_kernels_p)
+    {
+      edge e = loop_preheader_edge (loop);
+
+      for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gphi *phi = gsi.phi ();
+	  tree def = PHI_RESULT (phi);
+	  affine_iv iv;
+
+	  if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
+	    {
+	      struct reduction_info *red;
+	      red = reduction_phi (reduction_list, phi);
+
+	      /* Look for pattern:
+
+		 <bb preheader>
+		   .omp_data_i = &.omp_data_arr;
+		   addr = .omp_data_i->sum;
+		   sum_a = *addr;
+
+		 <bb header>:
+		   sum_b = PHI <sum_a (preheader), sum_c (latch)>
+
+		 and assign addr to reduc->reduc_addr.  */
+
+	      tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
+	      gimple stmt = SSA_NAME_DEF_STMT (arg);
+	      if (!gimple_assign_single_p (stmt))
+		return false;
+	      tree memref = gimple_assign_rhs1 (stmt);
+	      if (TREE_CODE (memref) != MEM_REF)
+		return false;
+	      tree addr = TREE_OPERAND (memref, 0);
+
+	      gimple stmt2 = SSA_NAME_DEF_STMT (addr);
+	      if (!gimple_assign_single_p (stmt2))
+		return false;
+	      tree compref = gimple_assign_rhs1 (stmt2);
+	      if (TREE_CODE (compref) != COMPONENT_REF)
+		return false;
+	      tree addr2 = TREE_OPERAND (compref, 0);
+	      if (TREE_CODE (addr2) != MEM_REF)
+		return false;
+	      addr2 = TREE_OPERAND (addr2, 0);
+	      if (TREE_CODE (addr2) != SSA_NAME
+		  || !gimple_stmt_omp_data_i_init_p (SSA_NAME_DEF_STMT (addr2)))
+		return false;
+	      red->reduc_addr = addr;
+	    }
+	}
+    }
+
   return true;
 }
 
@@ -2784,7 +2866,7 @@ parallelize_loops (bool oacc_kernels_p)
       if (!try_get_loop_niter (loop, &niter_desc))
 	continue;
 
-      if (!try_create_reduction_list (loop, &reduction_list))
+      if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
 	continue;
 
       if (!flag_loop_parallelize_all
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c
new file mode 100644
index 0000000..6984a3b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define n 10000
+
+unsigned int a[n];
+
+void  __attribute__((noinline,noclone))
+foo (void)
+{
+  int i;
+  unsigned int sum = 1;
+
+#pragma acc kernels copyin (a[0:n]) copy (sum)
+  {
+    for (i = 0; i < n; ++i)
+      sum += a[i];
+  }
+
+  if (sum != 5001)
+    abort ();
+}
+
+int
+main ()
+{
+  int i;
+
+  for (i = 0; i < n; ++i)
+    a[i] = i % 2;
+
+  foo ();
+
+  return 0;
+}
-- 
1.9.1


Reply via email to