[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-04-26 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #11 from Tom de Vries  ---
Author: vries
Date: Thu Apr 26 13:26:48 2018
New Revision: 259677

URL: https://gcc.gnu.org/viewcvs?rev=259677&root=gcc&view=rev
Log:
[nvptx] Verify bar.sync position

2018-04-26  Tom de Vries  

PR target/84952
* config/nvptx/nvptx.c (verify_neutering_jumps)
(verify_neutering_labels): New function
(nvptx_single): Use verify_neutering_jumps and verify_neutering_labels.

Modified:
trunk/gcc/ChangeLog
trunk/gcc/config/nvptx/nvptx.c

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-20 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

Tom de Vries  changed:

   What|Removed |Added

 Status|UNCONFIRMED |RESOLVED
 Resolution|--- |FIXED
   Assignee|unassigned at gcc dot gnu.org  |vries at gcc dot gnu.org
   Target Milestone|--- |8.0

--- Comment #10 from Tom de Vries  ---
(In reply to Tom de Vries from comment #7)
> Created attachment 43708 [details]
> 0002-Verify-bar.sync-position.patch (stage4)

I've put this patch on my stage1 commit branch.

Marking resolved-fixed.

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-20 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #9 from Tom de Vries  ---
Author: vries
Date: Tue Mar 20 10:31:23 2018
New Revision: 258676

URL: https://gcc.gnu.org/viewcvs?rev=258676&root=gcc&view=rev
Log:
[nvptx] Fix bar.sync position

2018-03-20  Tom de Vries  

PR target/84952
* config/nvptx/nvptx.c (nvptx_single): Don't neuter bar.sync.
(nvptx_process_pars): Emit bar.sync asap and alap.

Modified:
trunk/gcc/ChangeLog
trunk/gcc/config/nvptx/nvptx.c

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #8 from Tom de Vries  ---
(In reply to Tom de Vries from comment #5)
> For stage4, however, we want a fix without fixing optimization issue
> PR84025

> ...

>   @ %r34 bra.uni $L8;
>   @ %r33 bra $L9;
>   // join 2;
>  $L9:
>  $L8:

> ...

Now I realize that prevent_branch_around_nothing is not triggering here, while
it should. Filed PR84954 - "[nvptx] prevent_branch_around_nothing doesn't
trigger often enough"

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #7 from Tom de Vries  ---
Created attachment 43708
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43708&action=edit
0002-Verify-bar.sync-position.patch (stage4)

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #6 from Tom de Vries  ---
Created attachment 43707
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43707&action=edit
0001-Fix-bar.sync-position.patch (stage4)

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #5 from Tom de Vries  ---
For stage4, however, we want a fix without fixing optimization issue PR84025,
so we have:
... 
$ git log --pretty=%s --reverse HEAD^^..HEAD | cat -n
 1  Fix bar.sync position
 2  Verify bar.sync position
...

which results in:
...
// BEGIN PREAMBLE
// Module header: PTX ISA version 3.1, target sm_30, 64-bit addresses.
.version 3.1
.target sm_30
.address_size 64
// END PREAMBLE

// BEGIN FUNCTION DECL: main$_omp_fn$0
// Forward declaration of the kernel entry; it takes one 64-bit parameter.
.entry main$_omp_fn$0 (.param .u64 %in_ar0);

// NOTE(review): the FUNC_MAP line starts with "//", so the PTX assembler
// treats it as a comment; the meaning of the three numbers is not shown
// here -- confirm against the nvptx offload tooling.
//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x20, 0x20

// BEGIN VAR DEF: __worker_bcast
// 8-byte, 8-byte-aligned byte array in the .shared state space.  The kernel
// that follows stores one u64 into it and loads it back after bar.sync 0.
.shared .align 8 .u8 __worker_bcast[8];

// BEGIN FUNCTION DEF: main$_omp_fn$0
// Kernel entry taking one 64-bit parameter (%in_ar0).
// Outline, as read from the instructions below:
//   1. Threads with %tid.x == 0 and %tid.y == 0 store the parameter value
//      into the .shared buffer __worker_bcast.
//   2. bar.sync 0 follows the labels $L7/$L6 where the skipping threads
//      rejoin.
//   3. Threads with %tid.x == 0 reload the value from __worker_bcast and,
//      when %tid.y <= 9, store %tid.y as a u32 through the loaded pointer.
//   4. bar.sync 1 sits at $L5, where the threads that skipped step 3 rejoin.
//   5. Two trailing predicated branches skip over nothing but a comment.
.entry main$_omp_fn$0 (.param .u64 %in_ar0)
{
  // %ar0 receives the kernel parameter.
  .reg .u64 %ar0;
  ld.param.u64 %ar0,[%in_ar0];
  .reg .u32 %r24;
  .reg .u64 %r25;
  .reg .pred %r26;
  .reg .u64 %r27;
  .reg .u64 %r28;
  .reg .u64 %r29;
  .reg .u64 %r30;
  .reg .u64 %r31;
  .reg .u64 %r32;
  .reg .pred %r33;
  .reg .pred %r34;

  // %r34 = (%tid.y != 0)
  {
.reg .u32 %y;
mov.u32 %y,%tid.y;
setp.ne.u32 %r34,%y,0;
  }

  // %r33 = (%tid.x != 0)
  {
.reg .u32 %x;
mov.u32 %x,%tid.x;
setp.ne.u32 %r33,%x,0;
  }

  // Threads with %tid.y != 0 or %tid.x != 0 jump past the store, so only
  // threads with %tid.x == 0 and %tid.y == 0 write __worker_bcast.
  @ %r34 bra.uni $L6;
  @ %r33 bra $L7;
  mov.u64 %r25,%ar0;
  // fork 2;
  cvta.shared.u64 %r32,__worker_bcast;
  st.u64 [%r32],%r25;
 $L7:
 $L6:

  // The barrier comes after $L7/$L6, i.e. outside the region skipped by the
  // predicated branches above, and before the %r33 branch below.
  bar.sync 0;
  // Threads with %tid.x != 0 skip the body up to $L5.
  @ %r33 bra $L5;
  // forked 2;
  // Read back the 64-bit value previously stored in __worker_bcast.
  cvta.shared.u64 %r31,__worker_bcast;
  ld.u64 %r25,[%r31];
  // %r24 = %tid.y; %r26 = (%r24 <= 9), signed compare.  When false, control
  // goes to $L3 and the store at $L2 is skipped.
  mov.u32 %r24,%tid.y;
  setp.le.s32 %r26,%r24,9;
  @ %r26 bra $L2;
  bra $L3;
 $L2:
  // Load a u64 from [%r25], add the sign-extended %r24 shifted left by 2
  // (i.e. 4 * %r24), and store %r24 as a u32 at that address.
  ld.u64 %r27,[%r25];
  cvt.s64.s32 %r28,%r24;
  shl.b64 %r29,%r28,2;
  add.u64 %r30,%r27,%r29;
  st.u32 [%r30],%r24;
 $L3:
  // joining 2;
 $L5:
  // Second barrier, at the label reached both by fall-through and by the
  // "@ %r33 bra $L5" skip above.
  bar.sync 1;

  // NOTE(review): these two predicated branches jump over nothing except the
  // "join 2" comment; $L9 and $L8 label the same point just before ret.
  @ %r34 bra.uni $L8;
  @ %r33 bra $L9;
  // join 2;
 $L9:
 $L8:

  ret;
}
...

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #4 from Tom de Vries  ---
Created attachment 43705
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43705&action=edit
0003-Verify-bar.sync-position.patch

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #3 from Tom de Vries  ---
Created attachment 43704
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43704&action=edit
0002-Fix-bar.sync-position.patch

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #2 from Tom de Vries  ---
Created attachment 43703
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43703&action=edit
0001-Fix-branch-around-nothing.patch

[Bug target/84952] [nvptx] bar.sync generated in divergent code

2018-03-19 Thread vries at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

--- Comment #1 from Tom de Vries  ---
For stage1, I have the following patch set:
...
git log --pretty=%s --reverse HEAD^^^..HEAD | cat -n
 1  Fix branch-around-nothing
 2  Fix bar.sync position
 3  Verify bar.sync position
...
where 1 is a fix for PR84025, 2 is a fix for this PR and 3 is a verification
for this PR.

Using this patch set, we generate (edited for readability):
...
// BEGIN PREAMBLE
// Module header: PTX ISA version 3.1, target sm_30, 64-bit addresses.
.version 3.1
.target sm_30
.address_size 64
// END PREAMBLE

// BEGIN FUNCTION DECL: main$_omp_fn$0
// Forward declaration of the kernel entry; it takes one 64-bit parameter.
.entry main$_omp_fn$0 (.param .u64 %in_ar0);

// NOTE(review): the FUNC_MAP line starts with "//", so the PTX assembler
// treats it as a comment; the meaning of the three numbers is not shown
// here -- confirm against the nvptx offload tooling.
//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x20, 0x20

// BEGIN VAR DEF: __worker_bcast
// 8-byte, 8-byte-aligned byte array in the .shared state space.  The kernel
// that follows stores one u64 into it and loads it back after bar.sync 0.
.shared .align 8 .u8 __worker_bcast[8];

// BEGIN FUNCTION DEF: main$_omp_fn$0
// Kernel entry taking one 64-bit parameter (%in_ar0).
// Outline, as read from the instructions below:
//   1. Threads with %tid.x == 0 and %tid.y == 0 store the parameter value
//      into the .shared buffer __worker_bcast.
//   2. bar.sync 0 follows the labels $L7/$L6 where the skipping threads
//      rejoin.
//   3. Threads with %tid.x == 0 reload the value from __worker_bcast and,
//      when %tid.y <= 9, store %tid.y as a u32 through the loaded pointer.
//   4. bar.sync 1 sits at $L5, where the threads that skipped step 3 rejoin;
//      only the "join 2" comment and ret follow it.
.entry main$_omp_fn$0 (.param .u64 %in_ar0)
{
  // %ar0 receives the kernel parameter.
  .reg .u64 %ar0;
  ld.param.u64 %ar0,[%in_ar0];
  .reg .u32 %r24;
  .reg .u64 %r25;
  .reg .pred %r26;
  .reg .u64 %r27;
  .reg .u64 %r28;
  .reg .u64 %r29;
  .reg .u64 %r30;
  .reg .u64 %r31;
  .reg .u64 %r32;
  .reg .pred %r33;
  .reg .pred %r34;

  // %r34 = (%tid.y != 0)
  {
.reg .u32 %y;
mov.u32 %y,%tid.y;
setp.ne.u32 %r34,%y,0;
  }

  // %r33 = (%tid.x != 0)
  {
.reg .u32 %x;
mov.u32 %x,%tid.x;
setp.ne.u32 %r33,%x,0;
  }

  // Threads with %tid.y != 0 or %tid.x != 0 jump past the store, so only
  // threads with %tid.x == 0 and %tid.y == 0 write __worker_bcast.
  @ %r34 bra.uni $L6;
  @ %r33 bra $L7;
  mov.u64 %r25,%ar0;
  // fork 2;
  cvta.shared.u64 %r32,__worker_bcast;
  st.u64 [%r32],%r25;
 $L7:
 $L6:

  // The barrier comes after $L7/$L6, i.e. outside the region skipped by the
  // predicated branches above, and before the %r33 branch below.
  bar.sync 0;
  // forked 2;
  // Threads with %tid.x != 0 skip the body up to $L5.
  @ %r33 bra $L5;
  // Read back the 64-bit value previously stored in __worker_bcast.
  cvta.shared.u64 %r31,__worker_bcast;
  ld.u64 %r25,[%r31];
  // %r24 = %tid.y; %r26 = (%r24 <= 9), signed compare.  When false, control
  // goes to $L3 and the store at $L2 is skipped.
  mov.u32 %r24,%tid.y;
  setp.le.s32 %r26,%r24,9;
  @ %r26 bra $L2;
  bra $L3;
 $L2:
  // Load a u64 from [%r25], add the sign-extended %r24 shifted left by 2
  // (i.e. 4 * %r24), and store %r24 as a u32 at that address.
  ld.u64 %r27,[%r25];
  cvt.s64.s32 %r28,%r24;
  shl.b64 %r29,%r28,2;
  add.u64 %r30,%r27,%r29;
  st.u32 [%r30],%r24;
 $L3:
  // joining 2;
 $L5:
  // Second barrier, at the label reached both by fall-through and by the
  // "@ %r33 bra $L5" skip above.
  bar.sync 1;
  // join 2;

  ret;
}
...