[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread cesar at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #10 from cesar at gcc dot gnu.org ---
And here is the working code in -O2:

{
.reg.u32%x;
mov.u32 %x, %tid.x;
setp.ne.u32 %r71, %x, 0;
}
@%r71   bra $L13;
mov.u64 %r45, %ar0;
mov.u64 %r46, %ar1;
mov.u32 %r42, %ctaid.x;
shl.b32 %r48, %r42, 2;
add.u32 %r37, %r48, %r42;
mov.u32 %r31, 5;
setp.ne.u64 %r64, %r46, 1;
mov.u32 %r66, 0;
$L13:
$L3:
mov.pred%r74, %r64;
setp.eq.u32 %r64, 1, 0;
@%r71   bra $L12;
$L12:
mov.pred%r64, %r74;
selp.u32%r75, 1, 0, %r64;
shfl.idx.b32%r75, %r75, 0, 31;
setp.ne.u32 %r64, %r75, 0;
@%r64   bra.uni $L2;
$L6:

Notice how gcse's PRE pass hoisted the initialization of %r64 early in the
entry block.

I think we should go with my patch. If the register is live, it shouldn't
require your workaround.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread cesar at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #9 from cesar at gcc dot gnu.org ---
I figured out why my patch does work. Here's the assembly code for your C test
case at -O0:

{
.reg.u32%x;
mov.u32 %x, %tid.x;
setp.ne.u32 %r81, %x, 0;
}
@%r81   bra $L11;
mov.u64 %r58, %ar0;
st.u64  [%frame+32], %r58;
mov.u64 %r59, %ar1;
st.u64  [%frame+40], %r59;
ld.u64  %r60, [%frame+40];
cvt.u32.u64 %r26, %r60;
st.u32  [%frame], %r26;
ld.u64  %r61, [%frame+32];
st.u64  [%frame+8], %r61;
mov.u32 %r22, 0;
mov.u32 %r28, 1;
mov.u32 %r29, 1;
$L11:
$L9:
mov.pred%r82, %r62;
setp.eq.u32 %r62, 1, 0;
@%r81   bra $L12;
mov.u32 %r55, %nctaid.x;
mov.u32 %r56, %ctaid.x;
mov.u32 %r48, 9;
add.u32 %r49, %r55, %r48;
div.s32 %r50, %r49, %r55;
mul.lo.u32  %r23, %r56, %r50;
mov.u32 %r57, %nctaid.x;
mov.u32 %r51, 9;
add.u32 %r52, %r57, %r51;
div.s32 %r53, %r52, %r57;
add.u32 %r54, %r23, %r53;
min.s32 %r30, %r54, 10;
setp.ge.s32 %r62, %r23, %r30;
$L12:
mov.pred%r62, %r82;
selp.u32%r83, 1, 0, %r62;
shfl.idx.b32%r83, %r83, 0, 31;
setp.ne.u32 %r62, %r83, 0;
@%r62   bra.uni $L2;
$L4:
mov.pred%r84, %r64;
setp.eq.u32 %r64, 1, 0;
@%r81   bra $L13;

The predicate register in question here is %r62. Notice how the JIT workaround
clobbers %r62 much earlier than it's defined. My patch just copied the register
predicate register before it was clobbered. That's fine, but when it restores
the value of %r62 in L12, r62 gets an uninitialized value.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread cesar at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #8 from cesar at gcc dot gnu.org ---
I tweaked your proposed fix as follows:

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 55c7e3cbf90..24625cd303f 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4104,8 +4104,11 @@ nvptx_single (unsigned mask, basic_block from,
basic_block to)
mov.u32 %x,%tid.x;
setp.ne.u32 %rnotvzero,%x,0;
 }
+  reg.pred %rcond2; // Scratch copy of the original rcond.

+  mov.pred %rcond2, %rcond;
 @%rnotvzero bra Lskip;
+  mov.pred %rcond, %rcond2
 setp.. %rcond,op1,op2;
 Lskip:
 selp.u32 %rcondu32,1,0,%rcond;
@@ -4126,8 +4129,11 @@ nvptx_single (unsigned mask, basic_block from,
basic_block to)
 There is nothing in the PTX spec to suggest that this is wrong, or
 to explain why the extra initialization is needed.  So, we
classify
 it as a JIT bug, and the extra initialization as workaround.  */
- emit_insn_before (gen_movbi (pvar, const0_rtx),
-   bb_first_real_insn (from));
+   rtx_insn *from_insn = bb_first_real_insn (from);
+   rtx ptmp = gen_reg_rtx (GET_MODE (pvar));
+   emit_insn_before (gen_rtx_SET (ptmp, pvar), from_insn);
+   emit_insn_before (gen_movbi (pvar, const0_rtx), from_insn);
+   emit_insn_before (gen_rtx_SET (pvar, ptmp), tail);
 #endif
  emit_insn_before (nvptx_gen_vcast (pvar), tail);
}

This generates the following assembly code for gemm.f90:

$L34:
$L11:
mov.pred%r413, %r314;
setp.eq.u32 %r314, 1, 0;
@%r402  bra $L33;
$L33:
mov.pred%r314, %r413;
selp.u32%r414, 1, 0, %r314;
shfl.idx.b32%r414, %r414, 0, 31;
setp.ne.u32 %r314, %r414, 0;
@!%r314 bra.uni $L22;
bra $L3;
$L12:

I'm not sure what's going on here, because this patch causes illegal memory
access errors in lsdalton. Any thoughts?

Maybe a more involved workaround would be to leave r314 alone, and use the
scratch %r413 register as the predicate. But, then wouldn't the prevent the PRE
code hoisting optimization which moved the computation for %r314 outside of the
loop in the first place?

Is this original PTX JIT bug still present in the current Nvidia drivers? You
mentioned that this problem first appeared in 381.22. I wonder if it has been
resolved in 387.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #7 from Tom de Vries  ---
(In reply to Tom de Vries from comment #6)
> (In reply to Tom de Vries from comment #3)
> > Likewise, reversing the if-then-else order in gemm.f90 makes the testcase
> > fail on trunk without this patch.
> 
> Minimal version:

An even more minimal c version:
...
/* { dg-do run } */

extern void abort (void);

#define n 10

static void __attribute__((noinline)) __attribute__((noclone))
gemm (int beta, int *c)
{
  #pragma acc parallel copy(c[0:(n * n) - 1]) num_gangs(2)
  #pragma acc loop gang
  for (int j = 0; j < n; ++j)
if (beta != 1)
  {
#pragma acc loop vector
for (int i = 0; i < n; ++i)
  c[i + (j * n)] = 0;
  }
}

int
main (void)
{
  int c[n * n];
  c[0] = 1;
  gemm (0.0, c);
  if (c[0] != 0)
abort ();
}
...

Passes at O0, fails at O2.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #6 from Tom de Vries  ---
(In reply to Tom de Vries from comment #3)
> Likewise, reversing the if-then-else order in gemm.f90 makes the testcase
> fail on trunk without this patch.

Minimal version:
...
! { dg-do run } 

subroutine gemm (BETA, C)
  real ::  C(100,100)
  integer :: i, j, l
  real, parameter :: one = 1.0
  real :: beta

  !$acc parallel copy(c(1:100,1:100)) num_gangs(2)  
  !$acc loop gang   
  do j = 1, 100
 if (beta /= one) then
!$acc loop vector   
do i = 1, 100
   C(i,j) = 0.0
end do
 end if
  end do
  !$acc end parallel
end subroutine gemm

program test_gemm
  real :: c(100,100), beta
  beta = 0.0
  c(:,:) = 1.0
  call gemm (beta, c)
  if (c(1,1) /= 0.0) call abort ()
end program test_gemm
...

Passes at O0, fails at O1 and higher.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #5 from Tom de Vries  ---
This ( PR83589 - "[nvptx] mode-transitions.c and private-variables.{c,f90}
execution FAILs at GOMP_NVPTX_JIT=-O0" ) may be a duplicate.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #4 from Tom de Vries  ---
(In reply to cesar from comment #0)
> the underlying problem is present
> in og7 and impacts da-1.c). 

That's a failure I did not manage to reproduce. For me, at commit
b4dd21b9a1f9f499c613b55225cad689b7928a7f "Use functional parameters for data
mappings in OpenACC child functions", that test passes, also when using -O3.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #3 from Tom de Vries  ---
(In reply to cesar from comment #0)
> I think there
> might be other PTX JIT bugs lurking here, because the test program still
> works as intended.

I can make it fail on trunk, by changing the workaround to initialize with one
instead of zero:
...
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 86fc13f4fc0..ab03f3b5fe7 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4097,7 +4097,7 @@ nvptx_single (unsigned mask, basic_block from,
basic_block to)
 There is nothing in the PTX spec to suggest that this is wrong, or
 to explain why the extra initialization is needed.  So, we
classify
 it as a JIT bug, and the extra initialization as workaround.  */
- emit_insn_before (gen_movbi (pvar, const0_rtx),
+ emit_insn_before (gen_movbi (pvar, constm1_rtx),
bb_first_real_insn (from));
 #endif
  emit_insn_before (nvptx_gen_vcast (pvar), tail);
...
[ Note that there are no regressions with this patch, so the test-case triggers
something not present in the current trunk test set. ]


Likewise, reversing the if-then-else order in gemm.f90 makes the testcase fail
on trunk without this patch.


So, this has nothing to do with PTX JIT bugs.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-18 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

Tom de Vries  changed:

   What|Removed |Added

 Status|UNCONFIRMED |NEW
   Last reconfirmed||2018-01-18
 Ever confirmed|0   |1

--- Comment #2 from Tom de Vries  ---
Confirmed. The workaround is intended to work on code like this:
...
{   
.reg .u32 %x;   
mov.u32 %x,%tid.x;  
setp.ne.u32 %rnotvzero,%x,0;
 }  

 @%rnotvzero bra Lskip; 
 setp.. %rcond,op1,op2;   
 Lskip: 
 selp.u32 %rcondu32,1,0,%rcond; 
 shfl.idx.b32 %rcondu32,%rcondu32,0,31; 
 setp.ne.u32 %rcond,%rcondu32,0;
...
and adds 'setp.eq.u32 %rcond, 1, 0;' before "bra Lskip". However, if the branch
condition is not calculated in the basic block containing the conditional jump,
then the workaround overwrites the branch condition.

In the case of comment 1, it's the most extreme case: we're neutering an empty
block:
...
@%r341  bra $L33;
$L33:
...
so the condition must be defined elsewhere.

[Bug target/83920] [nvptx] bad predicate reset

2018-01-17 Thread cesar at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83920

--- Comment #1 from cesar at gcc dot gnu.org ---
Created attachment 43165
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43165=edit
assembly code