Hi,
Currently, when we enable -mlong-vector-in-workers in gemm.f90, we get:
...
{
.reg.u32 %tidy;
.reg.u64 %t_bcast;
.reg.u64 %y64;
mov.u32 %tidy, %tid.y;
cvt.u64.u32 %y64, %tidy;
add.u64 %y64, %y64, 1;
cvta.shared.u64 %t_bcast, __oacc_bcast;
mad.lo.u64 %r166, %y64, 104, %t_bcast;
}
@ %r179 bra.uni $L28;
@ %r174 bra $L29;
...
setp.le.s32 %r114,%r113,0;
selp.u32 %r182,1,0,%r114;
st.u32 [%r166],%r182;
$L29:
$L28:
bar.sync %r167,128;
ld.u32 %r183,[%r166];
setp.ne.u32 %r114,%r183,0;
bar.sync %r167,128;
@ %r114 bra.uni $L1
...
The branch condition %r114 is computed in a W0V0 region, and then
broadcast to a WAVA region. The broadcast is done using a partition of
the broadcast buffer at %r166, but this is a worker-specific buffer.
Since the buffer is written by worker 0 only, the reads in all other
workers access uninitialized memory.
This patch fixes this by using the generic broadcast buffer in this
case, rather than a worker-specific one.
Built on x86_64 with nvptx accelerator and tested libgomp.
Committed to og7.
Thanks,
- Tom
[nvptx] Fix propagation of branch cond in vw-neutered code
2018-04-12 Tom de Vries <t...@codesourcery.com>
PR target/85246
* config/nvptx/nvptx.c (nvptx_single): Don't use partitioning when
propagating branch condition calculated in vector-worker-neutered code.
* testsuite/libgomp.oacc-fortran/gemm.f90: Use
-foffload=-mlong-vector-in-workers.
---
gcc/config/nvptx/nvptx.c | 3 ++-
libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 547022e..9d011eb 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4306,13 +4306,14 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
broadcast_data_t data;
unsigned size = GET_MODE_SIZE (SImode);
bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
+ bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
rtx barrier = GEN_INT (0);
int threads = 0;
data.base = oacc_bcast_sym;
data.ptr = 0;
- bool use_partitioning_p = (vector
+ bool use_partitioning_p = (vector && !worker
&& nvptx_mach_max_workers () > 1
&& cfun->machine->bcast_partition);
if (use_partitioning_p)
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
index ad67dce..744d21e 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
@@ -1,6 +1,7 @@
! Exercise three levels of parallelism using SGEMM from BLAS.
! { dg-additional-options "-fopenacc-dim=-:-:128" }
+! { dg-additional-options "-foffload=-mlong-vector-in-workers" }
! Implicitly set vector_length to 128 using -fopenacc-dim.
subroutine openacc_sgemm (m, n, k, alpha, a, b, beta, c)