Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization

Cesar Philippidis Wed, 21 Mar 2018 21:00:05 -0700

On 03/21/2018 10:10 AM, Tom de Vries wrote:
> On 03/02/2018 05:55 PM, Cesar Philippidis wrote:
>> In addition, nvptx_cta_sync and the corresponding nvptx_barsync insn,
>> have been extended to take a barrier ID and a thread count. The idea
>> here is to assign one barrier for each logical vector. Worker-single
>> synchronization is controlled by barrier 0. Therefore, the vector
>> barrier ID is set to tid.y+1 (because there's one vector unit per
>> worker) in nvptx_init_oacc_workers and placed into a register stored in
>> cfun->machine->sync_bar. If no workers are present, then the barrier ID
>> falls back to 0.
> 
> I compiled a worker loop before and after the patch series, and observed
> this change:
> ...
> @@ -70,7 +71,7 @@
>   $L2:
>    // joining 2;
>   $L5:
> -  bar.sync 1;
> +  bar.sync 0;
>    // join 2;
>    ret;
>  }
> ...
> 
> AFAICT from your explanation above, that change is intentional.
> 
> Changing the code generation scheme for workers is fine, but obviously
> that should be a minimal, separate patch that we can bisect back to.


That sounds reasonable. I'll apply this patch to og7 once testing has
completed. While all of the functionality it introduces is unnecessary
without the vector length changes, at least it can be applied independently.

Cesar

Update bar.sync usage

2018-03-21  Cesar Philippidis  <ce...@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_cta_sync): Change arguments to take
	in a lock and thread count.  Update call to gen_nvptx_barsync.
	(nvptx_single): Update call to nvptx_cta_sync.
	(nvptx_process_pars): Likewise.
	* config/nvptx/nvptx.md (nvptx_barsync): Adjust operands.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index b7e3f59fed7..029628f8a0e 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -3936,13 +3936,14 @@ nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
   return empty;
 }
 
-/* Emit a CTA-level synchronization barrier.  We use different
-   markers for before and after synchronizations.  */
+/* Emit a CTA-level synchronization barrier (bar.sync).  LOCK is the
+   barrier number, which is an integer or a register.  THREADS is the
+   number of threads controlled by the barrier.  */
 
 static rtx
-nvptx_cta_sync (bool after)
+nvptx_cta_sync (rtx lock, int threads)
 {
-  return gen_nvptx_barsync (GEN_INT (after));
+  return gen_nvptx_barsync (lock, GEN_INT (threads));
 }
 
 #if WORKAROUND_PTXJIT_BUG
@@ -4192,6 +4193,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  /* Includes worker mode, do spill & fill.  By construction
 	     we should never have worker mode only. */
 	  broadcast_data_t data;
+	  rtx barrier = GEN_INT (0);
+	  int threads = 0;
 
 	  data.base = oacc_bcast_sym;
 	  data.ptr = 0;
@@ -4204,14 +4207,14 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 						    false),
 			    before);
 	  /* Barrier so other workers can see the write.  */
-	  emit_insn_before (nvptx_cta_sync (false), tail);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
 	  data.offset = 0;
 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
 						    false), tail);
 	  /* This barrier is needed to avoid worker zero clobbering
 	     the broadcast buffer before all the other workers have
 	     had a chance to read this instance of it.  */
-	  emit_insn_before (nvptx_cta_sync (true), tail);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
 	}
 
       extract_insn (tail);
@@ -4328,12 +4331,14 @@ nvptx_process_pars (parallel *par)
       bool empty = nvptx_shared_propagate (true, is_call,
 					   par->forked_block, par->fork_insn,
 					   false);
+      rtx barrier = GEN_INT (0);
+      int threads = 0;
 
       if (!empty || !is_call)
 	{
 	  /* Insert begin and end synchronizations.  */
-	  emit_insn_before (nvptx_cta_sync (false), par->forked_insn);
-	  emit_insn_before (nvptx_cta_sync (true), par->join_insn);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->forked_insn);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
 	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 2b4bcb3a45b..e638a13c366 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1421,10 +1421,16 @@
   [(set_attr "atomic" "true")])
 
 (define_insn "nvptx_barsync"
-  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+  [(unspec_volatile [(match_operand:SI 0 "nvptx_nonmemory_operand" "Ri")
+		     (match_operand:SI 1 "const_int_operand")]
 		    UNSPECV_BARSYNC)]
   ""
-  "\\tbar.sync\\t%0;"
+  {
+    if (!REG_P (operands[0]))
+      return "\\tbar.sync\\t%0;";
+    else
+      return "\\tbar.sync\\t%0, %1;";
+  }
   [(set_attr "predicable" "false")])
 
 (define_insn "nvptx_nounroll"

Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization

Reply via email to