https://gcc.gnu.org/g:a4fd191a5bc44904409e05ad6dc702fb478c1484

commit a4fd191a5bc44904409e05ad6dc702fb478c1484
Author: Thomas Schwinge <tschwi...@baylibre.com>
Date:   Fri May 10 12:50:23 2024 +0200

    nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution, via 'vote.all.pred'
    
    For example, this allows for '-muniform-simt' code to be executed
    single-threaded, which currently fails (device-side 'trap'): the '0xffffffff'
    bitmask isn't correct if not all 32 threads of a warp are active.  The same
    issue/fix, I suppose but have not verified, would apply if we were to allow for
    OpenACC 'vector_length' smaller than 32, for example for OpenACC 'serial'.
    
    We use 'nvptx_uniform_warp_check' only for PTX ISA version less than 6.0.
    Otherwise we're using 'nvptx_warpsync', which emits 'bar.warp.sync 0xffffffff',
    which evidently appears to do the right thing.  (I've tested '-muniform-simt'
    code executing single-threaded.)
    
    The change that I proposed on 2022-12-15 was to emit PTX code to calculate
    '(1 << %ntid.x) - 1' as the actual bitmask to use instead of '0xffffffff'.
    This works, but the PTX JIT generates SASS code to do this computation.
    
    In turn, this change now uses PTX 'vote.all.pred' -- which even simplifies upon
    the original code a little bit, see the following exemplary SASS 'diff' before
    vs. after this change:
    
        [...]
                  /*[...]*/                   SYNC                                                        (*"BRANCH_TARGETS .L_x_332"*)        }
          .L_x_332:
        -         /*[...]*/                   VOTE.ANY R9, PT, PT ;
        +         /*[...]*/                   VOTE.ALL P1, PT ;
        -         /*[...]*/                   ISETP.NE.U32.AND P1, PT, R9, -0x1, PT ;
        -         /*[...]*/              @!P1 BRA `(.L_x_333) ;
        +         /*[...]*/               @P1 BRA `(.L_x_333) ;
                  /*[...]*/                   BPT.TRAP 0x1 ;
          .L_x_333:
        -         /*[...]*/               @P1 EXIT ;
        +         /*[...]*/              @!P1 EXIT ;
        [...]
    
            gcc/
            * config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
            non-full-warp execution, via 'vote.all.pred'.
            gcc/testsuite/
            * gcc.target/nvptx/nvptx.exp
            (check_effective_target_default_ptx_isa_version_at_least_6_0):
            New.
            * gcc.target/nvptx/uniform-simt-2.c: Adjust.
            * gcc.target/nvptx/uniform-simt-5.c: New.
    
    (cherry picked from commit b4e68dd9084e48ee3e83c11d7f27548d8cca7066)

Diff:
---
 gcc/ChangeLog.omp                               | 11 ++++++++++
 gcc/config/nvptx/nvptx.md                       | 13 +++++-------
 gcc/testsuite/gcc.target/nvptx/nvptx.exp        |  5 +++++
 gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c |  2 +-
 gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c | 28 +++++++++++++++++++++++++
 5 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index b80e90cb32a..66d27e6ab7c 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,14 @@
+2024-05-10  Thomas Schwinge  <tschwi...@baylibre.com>
+
+       * config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
+       non-full-warp execution, via 'vote.all.pred'.
+       gcc/testsuite/
+       * gcc.target/nvptx/nvptx.exp
+       (check_effective_target_default_ptx_isa_version_at_least_6_0):
+       New.
+       * gcc.target/nvptx/uniform-simt-2.c: Adjust.
+       * gcc.target/nvptx/uniform-simt-5.c: New.
+
 2024-05-28  Thomas Schwinge  <tschwi...@baylibre.com>
 
        * config/nvptx/nvptx.h: Configure global constructor, destructor
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 4118de52a9a..4c32a20176a 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -2316,14 +2316,11 @@
   {
     const char *insns[] = {
       "{",
-      "\\t"              ".reg.b32"        "\\t" "%%r_act;",
-      "%.\\t"            "vote.ballot.b32" "\\t" "%%r_act,1;",
-      "\\t"              ".reg.pred"       "\\t" "%%r_do_abort;",
-      "\\t"              "mov.pred"        "\\t" "%%r_do_abort,0;",
-      "%.\\t"            "setp.ne.b32"     "\\t" "%%r_do_abort,%%r_act,"
-                                                 "0xffffffff;",
-      "@ %%r_do_abort\\t" "trap;",
-      "@ %%r_do_abort\\t" "exit;",
+      "\\t"            ".reg.pred"     "\\t" "%%r_sync;",
+      "\\t"            "mov.pred"      "\\t" "%%r_sync, 1;",
+      "%.\\t"          "vote.all.pred" "\\t" "%%r_sync, 1;",
+      "@!%%r_sync\\t"  "trap;",
+      "@!%%r_sync\\t"  "exit;",
       "}",
       NULL
     };
diff --git a/gcc/testsuite/gcc.target/nvptx/nvptx.exp b/gcc/testsuite/gcc.target/nvptx/nvptx.exp
index 97aa7ae0852..3151381f51a 100644
--- a/gcc/testsuite/gcc.target/nvptx/nvptx.exp
+++ b/gcc/testsuite/gcc.target/nvptx/nvptx.exp
@@ -49,6 +49,11 @@ proc check_effective_target_default_ptx_isa_version_at_least { major minor } {
     return $res
 }
 
+# Return 1 if code by default compiles for at least PTX ISA version 6.0.
+proc check_effective_target_default_ptx_isa_version_at_least_6_0 { } {
+    return [check_effective_target_default_ptx_isa_version_at_least 6 0]
+}
+
 # Return 1 if code with PTX ISA version major.minor or higher can be run.
 proc check_effective_target_runtime_ptx_isa_version_at_least { major minor } {
     set name runtime_ptx_isa_version_${major}_${minor}
diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c
index b1eee0d618f..1d83c49a44b 100644
--- a/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c
+++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-2.c
@@ -17,4 +17,4 @@ f (void)
 
 /* { dg-final { scan-assembler-times "@%r\[0-9\]*\tatom.global.cas" 1 } } */
 /* { dg-final { scan-assembler-times "shfl.idx.b32" 1 } } */
-/* { dg-final { scan-assembler-times "vote.ballot.b32" 1 } } */
+/* { dg-final { scan-assembler-times "vote.all.pred" 1 } } */
diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c
new file mode 100644
index 00000000000..cd6ea82d293
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c
@@ -0,0 +1,28 @@
+/* Verify that '-muniform-simt' code may be executed single-threaded.
+
+   { dg-do run }
+   { dg-options {-save-temps -O2 -muniform-simt} } */
+
+enum memmodel
+{
+  MEMMODEL_RELAXED = 0
+};
+
+unsigned long long int v64;
+unsigned long long int *p64 = &v64;
+
+int
+main()
+{
+  /* Trigger uniform-SIMT processing.  */
+  __atomic_fetch_add (p64, v64, MEMMODEL_RELAXED);
+
+  return 0;
+}
+
+/* Per 'omp_simt_exit':
+     - 'nvptx_warpsync'
+       { dg-final { scan-assembler-times {bar\.warp\.sync\t0xffffffff;} 1 { target default_ptx_isa_version_at_least_6_0 } } }
+     - 'nvptx_uniform_warp_check'
+       { dg-final { scan-assembler-times {vote\.all\.pred\t%r_sync, 1;} 1 { target { ! default_ptx_isa_version_at_least_6_0 } } } }
+*/

Reply via email to