When forking to call a partitioned routine, there is no need to propagate local state from the active thread to the forked threads.

I've committed this patch to implement that optimization.

nathan
2015-08-25  Nathan Sidwell  <nat...@codesourcery.com>

	* config/nvptx/nvptx.c (nvptx_emit_forking): Add is_call argument,
	propagate it into mask.
	(nvptx_emit_joining): Likewise.
	(nvptx_expand_call): Move emit_forking call to later.  Add is_call
	argument.
	(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): Adjust calls.
	(nvptx_discover_pars): Don't look for predecessor insn in call
	forks and joins.
	(nvptx_process_pars): Don't emit propagation code for a call.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 227159)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -1047,16 +1047,16 @@ nvptx_expand_compare (rtx compare)
 /* Emit forking instructions for MASK.  */
 
 static void
-nvptx_emit_forking (unsigned mask)
+nvptx_emit_forking (unsigned mask, bool is_call)
 {
   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
   if (mask)
     {
-      rtx op = GEN_INT (mask);
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
       
       /* Emit fork for worker level.  */
-      if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+      if (!is_call && mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	emit_insn (gen_nvptx_fork (op));
       emit_insn (gen_nvptx_forked (op));
     }
@@ -1065,16 +1065,19 @@ nvptx_emit_forking (unsigned mask)
 /* Emit joining instructions for MASK.  */
 
 static void
-nvptx_emit_joining (unsigned mask)
+nvptx_emit_joining (unsigned mask, bool is_call)
 {
   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
   if (mask)
     {
-      rtx op = GEN_INT (mask);
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
 
-      /* Emit joining for all pars.  */
-      emit_insn (gen_nvptx_joining (op));
+      /* Emit joining for all non-call pars to ensure there's a single
+	 predecessor for the block the join insn ends up in.  This is
+	 needed for skipping entire loops.  */
+      if (!is_call)
+	emit_insn (gen_nvptx_joining (op));
       emit_insn (gen_nvptx_join (op));
     }
 }
@@ -1135,8 +1138,6 @@ nvptx_expand_call (rtx retval, rtx addre
 	}
     }
 
-  nvptx_emit_forking (parallel);
-
   if (cfun->machine->funtype
       /* It's possible to construct testcases where we call a variable.
 	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
@@ -1195,11 +1196,12 @@ nvptx_expand_call (rtx retval, rtx addre
 	  write_func_decl_from_insn (func_decls, retval, pat, callee);
 	}
     }
+  nvptx_emit_forking (parallel, true);
   emit_call_insn (pat);
   if (tmp_retval != retval)
     emit_move_insn (retval, tmp_retval);
 
-  nvptx_emit_joining (parallel);
+  nvptx_emit_joining (parallel, true);
 }
 
 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
@@ -1207,13 +1209,13 @@ nvptx_expand_call (rtx retval, rtx addre
 void
 nvptx_expand_oacc_fork (unsigned mode)
 {
-  nvptx_emit_forking (GOMP_DIM_MASK (mode));
+  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
 }
 
 void
 nvptx_expand_oacc_join (unsigned mode)
 {
-  nvptx_emit_joining (GOMP_DIM_MASK (mode));
+  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
 }
 
 /* Expander for reduction locking and unlocking.  We expect SRC to be
@@ -2611,7 +2613,8 @@ nvptx_discover_pars (bb_insn_map_t *map)
 		l = new parallel (l, mask);
 		l->forked_block = block;
 		l->forked_insn = end;
-		if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		    && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
 		  l->fork_insn
 		    = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 	      }
@@ -2626,7 +2629,8 @@ nvptx_discover_pars (bb_insn_map_t *map)
 		gcc_assert (l->mask == mask);
 		l->join_block = block;
 		l->join_insn = end;
-		if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		    && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
 		  l->joining_insn
 		    = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 		l = l->parent;
@@ -3013,7 +3017,9 @@ nvptx_process_pars (parallel *par)
       inner_mask |= par->inner_mask;
     }
 
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+    { /* No propagation needed for a call.  */ }
+  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
     {
       nvptx_wpropagate (false, par->forked_block, par->forked_insn);
       nvptx_wpropagate (true, par->forked_block, par->fork_insn);

Reply via email to